Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputePme.C

Go to the documentation of this file.
00001 
00007 #ifdef NAMD_FFTW
00008 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00009 #include <fftw.h>
00010 #include <rfftw.h>
00011 #else
00012 #include <sfftw.h>
00013 #include <srfftw.h>
00014 #endif
00015 #endif
00016 
00017 #include "InfoStream.h"
00018 #include "Node.h"
00019 #include "PatchMap.h"
00020 #include "PatchMap.inl"
00021 #include "AtomMap.h"
00022 #include "ComputePme.h"
00023 #include "ComputePmeMgr.decl.h"
00024 #include "PmeRealSpace.h"
00025 #include "PmeKSpace.h"
00026 #include "ComputeNonbondedUtil.h"
00027 #include "PatchMgr.h"
00028 #include "Molecule.h"
00029 #include "ReductionMgr.h"
00030 #include "ComputeMgr.h"
00031 #include "ComputeMgr.decl.h"
00032 // #define DEBUGM
00033 #define MIN_DEBUG_LEVEL 3
00034 #include "Debug.h"
00035 #include "SimParameters.h"
00036 #include "WorkDistrib.h"
00037 #include "varsizemsg.h"
00038 #include "Random.h"
00039 #include "Priorities.h"
00040 
00041 // commlib has been observed to cause hangs when starting load balancing
00042 // #define USE_COMM_LIB 1
00043 #ifdef USE_COMM_LIB
00044 #include "EachToManyMulticastStrategy.h"
00045 #endif
00046 
00047 #ifndef SQRT_PI
00048 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
00049 #endif
00050 
00051 char *pencilPMEProcessors;
00052 
00053 
00054 class PmeGridMsg : public CMessage_PmeGridMsg {
00055 public:
00056 
00057   int sourceNode;
00058   int sequence;
00059   int hasData;
00060   Lattice lattice;
00061   PmeReduction *evir;
00062   int start;
00063   int len;
00064   int zlistlen;
00065   int *zlist;
00066   char *fgrid;
00067   float *qgrid;
00068 };
00069 
00070 class PmeTransMsg : public CMessage_PmeTransMsg {
00071 public:
00072 
00073   int sourceNode;
00074   int sequence;
00075   int hasData;
00076   Lattice lattice;
00077   int x_start;
00078   int nx;
00079   float *qgrid;
00080 };
00081 
00082 
00083 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00084 public:
00085 
00086   int sourceNode;
00087   int has_evir;
00088   PmeReduction *evir;
00089   int y_start;
00090   int ny;
00091   float *qgrid;
00092 
00093 };
00094 
00095 
00096 // use this idiom since messages don't have copy constructors
00097 struct PmePencilInitMsgData {
00098   PmeGrid grid;
00099   int xBlocks, yBlocks, zBlocks;
00100   CProxy_PmeXPencil xPencil;
00101   CProxy_PmeYPencil yPencil;
00102   CProxy_PmeZPencil zPencil;
00103   CProxy_ComputePmeMgr pmeProxy;
00104 };
00105 
00106 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00107 public:
00108    PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00109    PmePencilInitMsgData data;
00110 };
00111 
00112 
00113 struct LocalPmeInfo {
00114   int nx, x_start;
00115   int ny_after_transpose, y_start_after_transpose;
00116 };
00117 
00118 
00119 //Assigns gridPeMap and transPeMap to the same set of processors.
00120 void generatePmePeList(int *peMap, int numPes){
00121   // decide which pes to use by bit reversal
00122   int i;
00123   int ncpus = CkNumPes();
00124   
00125   // find next highest power of two
00126   int npow2 = 1;  int nbits = 0;
00127   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00128   
00129   // build bit reversal sequence
00130   SortableResizeArray<int> seq(ncpus);
00131   i = 0;
00132   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00133     int ri;
00134     for ( ri = ncpus; ri >= ncpus; ++i ) {
00135       ri = 0;
00136       int pow2 = 1;
00137       int rpow2 = npow2 / 2;
00138       for ( int j=0; j<nbits; ++j ) {
00139         ri += rpow2 * ( ( i / pow2 ) % 2 );
00140         pow2 *= 2;  rpow2 /= 2;
00141       }
00142     }
00143     seq[icpu] = ri;
00144   }
00145   
00146   // extract and sort PME locations
00147   for ( i=0; i<numPes; ++i ) {
00148     seq[i] = seq[ncpus - numPes + i];
00149   }
00150   seq.resize(numPes);
00151   seq.sort();
00152   
00153   for ( i=0; i<numPes; ++i ) 
00154       peMap[i] = seq[i];
00155 
00156   //peMap[0] = 0;
00157 }
00158 
00159 //Assigns gridPeMap and transPeMap to different set of processors.
00160 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00161   // decide which pes to use by bit reversal
00162   int i;
00163   int ncpus = CkNumPes();
00164   
00165   // find next highest power of two
00166   int npow2 = 1;  int nbits = 0;
00167   while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00168   
00169   // build bit reversal sequence
00170   SortableResizeArray<int> seq(ncpus);
00171   SortableResizeArray<int> seq2(ncpus);
00172   i = 0;
00173   for ( int icpu=0; icpu<ncpus; ++icpu ) {
00174     int ri;
00175     for ( ri = ncpus; ri >= ncpus; ++i ) {
00176       ri = 0;
00177       int pow2 = 1;
00178       int rpow2 = npow2 / 2;
00179       for ( int j=0; j<nbits; ++j ) {
00180         ri += rpow2 * ( ( i / pow2 ) % 2 );
00181         pow2 *= 2;  rpow2 /= 2;
00182       }
00183     }
00184     seq[icpu] = ri;
00185     seq2[icpu] = ri;
00186   }
00187   
00188   // extract and sort PME locations
00189   for ( i=0; i<numGridPes; ++i ) {
00190     seq[i] = seq[ncpus - numGridPes + i];
00191   }
00192   seq.resize(numGridPes);
00193   seq.sort();
00194   int firstTransPe = ncpus - numGridPes - numTransPes;
00195   if ( firstTransPe < 0 ) {
00196     firstTransPe = 0;
00197     // 0 should be first in list, skip if possible
00198     if ( ncpus > numTransPes ) firstTransPe = 1;
00199   }
00200   for ( i=0; i<numTransPes; ++i ) {
00201     seq2[i] = seq2[firstTransPe + i];
00202   }
00203   seq2.resize(numTransPes);
00204   seq2.sort();
00205   
00206   for ( i=0; i<numGridPes; ++i ) 
00207     gridPeMap[i] = seq[i];
00208 
00209   for ( i=0; i<numTransPes; ++i ) 
00210     transPeMap[i] = seq2[i];
00211 }
00212 
00213 #if USE_TOPOMAP 
00214 //Topology aware PME allocation
00215 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0, 
00216                              int nbpes=0);
00217 #endif
00218 
00219 class ComputePmeMgr : public BOCclass {
00220 public:
00221   friend class ComputePme;
00222   ComputePmeMgr();
00223   ~ComputePmeMgr();
00224 
00225   void initialize(CkQdMsg*);
00226   void initialize_pencils(CkQdMsg*);
00227   void activate_pencils(CkQdMsg*);
00228   void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00229 
00230   void sendGrid(void);
00231   void recvGrid(PmeGridMsg *);
00232   void gridCalc1(void);
00233   void sendTransBarrier(void);
00234   void sendTrans(void);
00235   void recvTrans(PmeTransMsg *);
00236   void gridCalc2(void);
00237   void sendUntrans(void);
00238   void recvUntrans(PmeUntransMsg *);
00239   void gridCalc3(void);
00240   void sendUngrid(void);
00241   void recvUngrid(PmeGridMsg *);
00242   void ungridCalc(void);
00243 
00244   void setCompute(ComputePme *c) { pmeCompute = c; c->setMgr(this); }
00245 
00246   //Tells if the current processor is a PME processor or not. Called by NamdCentralLB
00247   int isPmeProcessor(int p);  
00248 
00249 #ifdef NAMD_FFTW
00250   static CmiNodeLock fftw_plan_lock;
00251 #endif
00252 
00253 private:
00254   CProxy_ComputePmeMgr pmeProxy;
00255   CProxy_ComputePmeMgr pmeProxyDir;
00256   ComputePme *pmeCompute;
00257   PmeGrid myGrid;
00258   Lattice lattice;
00259   PmeKSpace *myKSpace;
00260   float *qgrid;
00261   float *kgrid;
00262 
00263 #ifdef NAMD_FFTW
00264   fftw_plan forward_plan_x, backward_plan_x;
00265   rfftwnd_plan forward_plan_yz, backward_plan_yz;
00266   fftw_complex *work;
00267 #else
00268   float *work;
00269 #endif
00270 
00271   int alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00272   int alchDecouple;
00273   BigReal alchElecLambdaStart;
00274   BigReal elecLambdaUp;
00275   BigReal elecLambdaDown;
00276 
00277   
00278   LocalPmeInfo *localInfo;
00279   int qgrid_size;
00280   int qgrid_start;
00281   int qgrid_len;
00282   int fgrid_start;
00283   int fgrid_len;
00284 
00285   int numSources;
00286   int numGridPes;
00287   int numTransPes;
00288   int numDestRecipPes;
00289   int myGridPe;
00290   int myTransPe;
00291   int *gridPeMap;
00292   int *transPeMap;
00293   int *recipPeDest;
00294   int *gridPeOrder;
00295   int *transPeOrder;
00296   char *isPmeFlag;
00297   int grid_count;
00298   int trans_count;
00299   int untrans_count;
00300   int ungrid_count;
00301   PmeGridMsg **gridmsg_reuse;
00302   PmeReduction recip_evir[PME_MAX_EVALS];
00303   PmeReduction recip_evir2[PME_MAX_EVALS];
00304 
00305   int sequence;  // used for priorities
00306   int useBarrier;
00307   int sendTransBarrier_received;
00308 
00309   int usePencils;
00310   int xBlocks, yBlocks, zBlocks;
00311   CProxy_PmeXPencil xPencil;
00312   CProxy_PmeYPencil yPencil;
00313   CProxy_PmeZPencil zPencil;
00314   char *pencilActive;
00315   int numPencilsActive;
00316 };
00317 
00318 #ifdef NAMD_FFTW
00319   CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00320 #endif
00321 
00322 int isPmeProcessor(int p){ 
00323   if (pencilPMEProcessors)
00324     return pencilPMEProcessors[p];
00325   else
00326     return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00327 }
00328 
00329 int ComputePmeMgr::isPmeProcessor(int p){ 
00330   return ( usePencils ? pencilPMEProcessors[p] : isPmeFlag[p] );
00331 }
00332 
00333 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup), 
00334                                  pmeProxyDir(thisgroup), pmeCompute(0) {
00335 
00336   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00337 
00338 #ifdef USE_COMM_LIB
00339   ComlibDelegateProxy(&pmeProxy);
00340 #endif
00341 
00342 #ifdef NAMD_FFTW
00343   if ( CmiMyRank() == 0 ) {
00344     fftw_plan_lock = CmiCreateLock();
00345   }
00346 #endif
00347 
00348   myKSpace = 0;
00349   kgrid = 0;
00350   work = 0;
00351   grid_count = 0;
00352   trans_count = 0;
00353   untrans_count = 0;
00354   ungrid_count = 0;
00355   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00356   useBarrier = 0;
00357   sendTransBarrier_received = 0;
00358   usePencils = 0;
00359 }
00360 
00361 
00362 void ComputePmeMgr::recvArrays(
00363         CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00364   xPencil = x;  yPencil = y;  zPencil = z;
00365 }
00366 
00367 
00368 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00369   delete msg;
00370 
00371   localInfo = new LocalPmeInfo[CkNumPes()];
00372   gridPeMap = new int[CkNumPes()];
00373   transPeMap = new int[CkNumPes()];
00374   recipPeDest = new int[CkNumPes()];
00375   gridPeOrder = new int[CkNumPes()];
00376   transPeOrder = new int[CkNumPes()];
00377   isPmeFlag = new char[CkNumPes()];  
00378 
00379   SimParameters *simParams = Node::Object()->simParameters;
00380   PatchMap *patchMap = PatchMap::Object();
00381 
00382   alchFepOn = simParams->alchFepOn;
00383   alchThermIntOn = simParams->alchThermIntOn;
00384   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
00385   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
00386     simParams->alchElecLambdaStart : 0;
00387   if (alchFepOn || alchThermIntOn) {
00388     numGrids = 2;
00389     if (alchDecouple) numGrids += 2;
00390     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00391   }
00392   else numGrids = 1;
00393   lesOn = simParams->lesOn;
00394   useBarrier = simParams->PMEBarrier;
00395   if ( lesOn ) {
00396     lesFactor = simParams->lesFactor;
00397     numGrids = lesFactor;
00398   }
00399   selfOn = 0;
00400   pairOn = simParams->pairInteractionOn;
00401   if ( pairOn ) {
00402     selfOn = simParams->pairInteractionSelf;
00403     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00404     numGrids = selfOn ? 1 : 3;
00405   }
00406 
00407   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00408   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00409   else {
00410     int dimx = simParams->PMEGridSizeX;
00411     int dimy = simParams->PMEGridSizeY;
00412     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00413     if ( maxslabs > CkNumPes() ) maxslabs = CkNumPes();
00414     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00415                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00416     if ( maxpencils > CkNumPes() ) maxpencils = CkNumPes();
00417     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00418     else usePencils = 0;
00419   }
00420 
00421   if ( usePencils ) {
00422     if ( simParams->PMEPencils > 1 ) {
00423       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00424     } else {
00425       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00426                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00427       if ( nb2 > CkNumPes() ) nb2 = CkNumPes();
00428       int nb = (int) sqrt((float)nb2);
00429       xBlocks = zBlocks = nb;
00430       yBlocks = nb2 / nb;
00431     }
00432 
00433     int dimx = simParams->PMEGridSizeX;
00434     int bx = 1 + ( dimx - 1 ) / xBlocks;
00435     xBlocks = 1 + ( dimx - 1 ) / bx;
00436 
00437     int dimy = simParams->PMEGridSizeY;
00438     int by = 1 + ( dimy - 1 ) / yBlocks;
00439     yBlocks = 1 + ( dimy - 1 ) / by;
00440 
00441     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00442     int bz = 1 + ( dimz - 1 ) / zBlocks;
00443     zBlocks = 1 + ( dimz - 1 ) / bz;
00444 
00445     if ( ! CkMyPe() ) {
00446       iout << iINFO << "PME using " << xBlocks << " x " <<
00447         yBlocks << " x " << zBlocks <<
00448         " pencil grid for FFT and reciprocal sum.\n" << endi;
00449     }
00450   } else { // usePencils
00451 
00452   {  // decide how many pes to use for reciprocal sum
00453 
00454     // rules based on work available
00455     int minslices = simParams->PMEMinSlices;
00456     int dimx = simParams->PMEGridSizeX;
00457     int nrpx = ( dimx + minslices - 1 ) / minslices;
00458     int dimy = simParams->PMEGridSizeY;
00459     int nrpy = ( dimy + minslices - 1 ) / minslices;
00460 
00461     // rules based on processors available
00462     int nrpp = CkNumPes();
00463     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00464     if ( nrpp < nrpx ) nrpx = nrpp;
00465     if ( nrpp < nrpy ) nrpy = nrpp;
00466 
00467     // user override
00468     int nrps = simParams->PMEProcessors;
00469     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00470     if ( nrps > 0 ) nrpx = nrps;
00471     if ( nrps > 0 ) nrpy = nrps;
00472 
00473     // make sure there aren't any totally empty processors
00474     int bx = ( dimx + nrpx - 1 ) / nrpx;
00475     nrpx = ( dimx + bx - 1 ) / bx;
00476     int by = ( dimy + nrpy - 1 ) / nrpy;
00477     nrpy = ( dimy + by - 1 ) / by;
00478     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00479       NAMD_bug("Error in selecting number of PME processors.");
00480     if ( by != ( dimy + nrpy - 1 ) / nrpy )
00481       NAMD_bug("Error in selecting number of PME processors.");
00482 
00483     numGridPes = nrpx;
00484     numTransPes = nrpy;
00485   }
00486   if ( ! CkMyPe() ) {
00487     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00488       " processors for FFT and reciprocal sum.\n" << endi;
00489   }
00490   { // generate random orderings for grid and trans messages
00491     int i;
00492     for ( i = 0; i < numGridPes; ++i ) {
00493       gridPeOrder[i] = i;
00494     }
00495     for ( i = 0; i < numTransPes; ++i ) {
00496       transPeOrder[i] = i;
00497     }
00498     Random rand(CkMyPe());
00499     rand.reorder(gridPeOrder,numGridPes);
00500     rand.reorder(transPeOrder,numTransPes);
00501   }
00502 
00503   int sum_npes = numTransPes + numGridPes;
00504   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00505 
00506 #if USE_TOPOMAP
00507   PatchMap * pmap = PatchMap::Object();
00508   
00509   int patch_pes = pmap->numNodesWithPatches();
00510   TopoManager tmgr;
00511   if(tmgr.hasMultipleProcsPerNode())
00512     patch_pes *= 2;
00513  
00514   bool done = false;
00515 #ifndef USE_COMM_LIB  
00516   if(CkNumPes() > 2*sum_npes + patch_pes) {    
00517     done = generateBGLORBPmePeList(transPeMap, numTransPes);
00518     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
00519   }
00520   else 
00521 #endif
00522     if(CkNumPes() > 2 *max_npes + patch_pes) {
00523       done = generateBGLORBPmePeList(transPeMap, max_npes);
00524       gridPeMap = transPeMap;
00525     }
00526 
00527   if (!done)
00528 #endif
00529     {
00530       //generatePmePeList(transPeMap, max_npes);
00531       //gridPeMap = transPeMap;
00532       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00533     }
00534   
00535 #ifdef USE_COMM_LIB  
00536   if(CkMyPe() == 0) {
00537       ComlibInstanceHandle cinst1 = CkCreateComlibInstance();
00538       EachToManyMulticastStrategy *strat = new 
00539           EachToManyMulticastStrategy(USE_DIRECT, numGridPes, 
00540                                       gridPeMap, numTransPes, transPeMap);
00541       cinst1.setStrategy(strat);
00542       
00543       ComlibInstanceHandle cinst2 = CkCreateComlibInstance();
00544       strat = new EachToManyMulticastStrategy(USE_DIRECT, numTransPes, transPeMap
00545                                               , numGridPes, gridPeMap);
00546       cinst2.setStrategy(strat);
00547       ComlibDoneCreating();
00548   }
00549 #endif
00550 
00551   myGridPe = -1;
00552   int i = 0;
00553   for ( i=0; i<CkNumPes(); ++i )
00554     isPmeFlag[i] = 0;
00555   for ( i=0; i<numGridPes; ++i ) {
00556     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00557     isPmeFlag[gridPeMap[i]] |= 1;
00558   }
00559   myTransPe = -1;
00560   for ( i=0; i<numTransPes; ++i ) {
00561     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00562     isPmeFlag[transPeMap[i]] |= 2;
00563   }
00564   
00565   if ( ! CkMyPe() ) {
00566     iout << iINFO << "PME GRID LOCATIONS:";
00567     int i;
00568     for ( i=0; i<numGridPes && i<10; ++i ) {
00569       iout << " " << gridPeMap[i];
00570     }
00571     if ( i < numGridPes ) iout << " ...";
00572     iout << "\n" << endi;
00573     iout << iINFO << "PME TRANS LOCATIONS:";
00574     for ( i=0; i<numTransPes && i<10; ++i ) {
00575       iout << " " << transPeMap[i];
00576     }
00577     if ( i < numTransPes ) iout << " ...";
00578     iout << "\n" << endi;
00579   }
00580 
00581   } // ! usePencils
00582 
00583   myGrid.K1 = simParams->PMEGridSizeX;
00584   myGrid.K2 = simParams->PMEGridSizeY;
00585   myGrid.K3 = simParams->PMEGridSizeZ;
00586   myGrid.order = simParams->PMEInterpOrder;
00587   myGrid.dim2 = myGrid.K2;
00588   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00589 
00590   if ( ! usePencils ) {
00591     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00592     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00593     myGrid.block3 = myGrid.dim3 / 2;  // complex
00594   }
00595 
00596   if ( usePencils ) {
00597     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00598     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00599     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
00600 
00601     if ( CkMyPe() == 0 ) {
00602       int basepe = 0;  int npe = CkNumPes();
00603       if ( npe > xBlocks*yBlocks &&
00604                 npe > xBlocks*zBlocks &&
00605                 npe > yBlocks*zBlocks ) {
00606         // avoid node 0
00607         ++basepe;
00608         --npe;
00609       }
00610 
00611       zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
00612       yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
00613       xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
00614       
00615       // decide which pes to use by bit reversal and patch use
00616       int i;
00617       int ncpus = CkNumPes();
00618   
00619       // find next highest power of two
00620       int npow2 = 1;  int nbits = 0;
00621       while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00622   
00623       // build bit reversal sequence
00624       SortableResizeArray<int> patches, nopatches, pmeprocs;
00625       PatchMap *pmap = PatchMap::Object();
00626       i = 0;
00627       for ( int icpu=0; icpu<ncpus; ++icpu ) {
00628         int ri;
00629         for ( ri = ncpus; ri >= ncpus; ++i ) {
00630           ri = 0;
00631           int pow2 = 1;
00632           int rpow2 = npow2 / 2;
00633           for ( int j=0; j<nbits; ++j ) {
00634             ri += rpow2 * ( ( i / pow2 ) % 2 );
00635             pow2 *= 2;  rpow2 /= 2;
00636           }
00637         }
00638         // seq[icpu] = ri;
00639         if ( ri ) { // keep 0 for special case
00640           if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00641           else nopatches.add(ri);
00642         }
00643       }
00644 
00645       // only use zero if it eliminates overloading or has patches
00646       int useZero = 0;
00647       int npens = xBlocks*yBlocks;
00648       if ( npens % ncpus == 0 ) useZero = 1;
00649       if ( npens == nopatches.size() + 1 ) useZero = 1;
00650       npens += xBlocks*zBlocks;
00651       if ( npens % ncpus == 0 ) useZero = 1;
00652       if ( npens == nopatches.size() + 1 ) useZero = 1;
00653       npens += yBlocks*zBlocks;
00654       if ( npens % ncpus == 0 ) useZero = 1;
00655       if ( npens == nopatches.size() + 1 ) useZero = 1;
00656 
00657       // add nopatches then patches in reversed order
00658       for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00659       if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00660       for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00661       if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00662   
00663       int pe = 0;
00664       int npes = pmeprocs.size();
00665       SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00666       for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00667       zprocs.sort();
00668       SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00669       for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00670       yprocs.sort();
00671       SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00672       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00673       xprocs.sort();
00674 
00675       pencilPMEProcessors = new char [CkNumPes()];
00676       memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00677 
00678       int x,y,z;
00679 
00680       iout << iINFO << "PME Z PENCIL LOCATIONS:";
00681       for ( i=0; i<zprocs.size() && i<10; ++i ) {
00682         iout << " " << zprocs[i];
00683       }
00684       if ( i < zprocs.size() ) iout << " ...";
00685       iout << "\n" << endi;
00686 
00687       for (pe=0, x = 0; x < xBlocks; ++x)
00688         for (y = 0; y < yBlocks; ++y, ++pe ) {
00689           zPencil(x,y,0).insert(zprocs[pe]);
00690           pencilPMEProcessors[zprocs[pe]] = 1;
00691         }
00692       zPencil.doneInserting();
00693       
00694       iout << iINFO << "PME Y PENCIL LOCATIONS:";
00695       for ( i=0; i<yprocs.size() && i<10; ++i ) {
00696         iout << " " << yprocs[i];
00697       }
00698       if ( i < yprocs.size() ) iout << " ...";
00699       iout << "\n" << endi;
00700 
00701       for (pe=0, z = 0; z < zBlocks; ++z )
00702         for (x = 0; x < xBlocks; ++x, ++pe ) {
00703           yPencil(x,0,z).insert(yprocs[pe]);
00704           pencilPMEProcessors[yprocs[pe]] = 1;
00705         }
00706       yPencil.doneInserting();
00707       
00708       iout << iINFO << "PME X PENCIL LOCATIONS:";
00709       for ( i=0; i<xprocs.size() && i<10; ++i ) {
00710         iout << " " << xprocs[i];
00711       }
00712       if ( i < xprocs.size() ) iout << " ...";
00713       iout << "\n" << endi;
00714 
00715       for (pe=0, y = 0; y < yBlocks; ++y )      
00716         for (z = 0; z < zBlocks; ++z, ++pe ) {
00717           xPencil(0,y,z).insert(xprocs[pe]);
00718           pencilPMEProcessors[xprocs[pe]] = 1;
00719         }
00720       xPencil.doneInserting();      
00721         
00722       pmeProxy.recvArrays(xPencil,yPencil,zPencil);
00723       PmePencilInitMsgData msgdata;
00724       msgdata.grid = myGrid;
00725       msgdata.xBlocks = xBlocks;
00726       msgdata.yBlocks = yBlocks;
00727       msgdata.zBlocks = zBlocks;
00728       msgdata.xPencil = xPencil;
00729       msgdata.yPencil = yPencil;
00730       msgdata.zPencil = zPencil;
00731       msgdata.pmeProxy = pmeProxyDir;
00732       xPencil.init(new PmePencilInitMsg(msgdata));
00733       yPencil.init(new PmePencilInitMsg(msgdata));
00734       zPencil.init(new PmePencilInitMsg(msgdata));
00735     }
00736     return;  // continue in initialize_pencils() at next startup stage
00737   }
00738 
00739 
00740   int pe;
00741   int nx = 0;
00742   for ( pe = 0; pe < numGridPes; ++pe ) {
00743     localInfo[pe].x_start = nx;
00744     nx += myGrid.block1;
00745     if ( nx > myGrid.K1 ) nx = myGrid.K1;
00746     localInfo[pe].nx = nx - localInfo[pe].x_start;
00747   }
00748   int ny = 0;
00749   for ( pe = 0; pe < numTransPes; ++pe ) {
00750     localInfo[pe].y_start_after_transpose = ny;
00751     ny += myGrid.block2;
00752     if ( ny > myGrid.K2 ) ny = myGrid.K2;
00753     localInfo[pe].ny_after_transpose =
00754                         ny - localInfo[pe].y_start_after_transpose;
00755   }
00756 
00757   {  // decide how many pes this node exchanges charges with
00758 
00759   PatchMap *patchMap = PatchMap::Object();
00760   Lattice lattice = simParams->lattice;
00761   BigReal sysdima = lattice.a_r().unit() * lattice.a();
00762   BigReal cutoff = simParams->cutoff;
00763   BigReal patchdim = simParams->patchDimension;
00764   int numPatches = patchMap->numPatches();
00765   int numNodes = CkNumPes();
00766   int *source_flags = new int[numNodes];
00767   int node;
00768   for ( node=0; node<numNodes; ++node ) {
00769     source_flags[node] = 0;
00770     recipPeDest[node] = 0;
00771   }
00772 
00773   // // make sure that we don't get ahead of ourselves on this node
00774   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
00775   //   source_flags[CkMyPe()] = 1;
00776   //   recipPeDest[myRecipPe] = 1;
00777   // }
00778 
00779   for ( int pid=0; pid < numPatches; ++pid ) {
00780     int pnode = patchMap->node(pid);
00781     BigReal minx = patchMap->min_a(pid);
00782     BigReal maxx = patchMap->max_a(pid);
00783     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
00784     // min1 (max1) is smallest (largest) grid line for this patch
00785     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
00786     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
00787     for ( int i=min1; i<=max1; ++i ) {
00788       int ix = i;
00789       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
00790       while ( ix < 0 ) ix += myGrid.K1;
00791       // set source_flags[pnode] if this patch sends to our node
00792       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
00793            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
00794         source_flags[pnode] = 1;
00795       }
00796       // set dest_flags[] for node that our patch sends to
00797       if ( pnode == CkMyPe() ) {
00798         recipPeDest[ix / myGrid.block1] = 1;
00799       }
00800     }
00801   }
00802 
00803   numSources = 0;
00804   numDestRecipPes = 0;
00805   for ( node=0; node<numNodes; ++node ) {
00806     if ( source_flags[node] ) ++numSources;
00807     if ( recipPeDest[node] ) ++numDestRecipPes;
00808   }
00809 
00810 #if 0
00811   if ( numSources ) {
00812     iout << iINFO << "PME " << CkMyPe() << " sources:";
00813     for ( node=0; node<numNodes; ++node ) {
00814       if ( source_flags[node] ) iout << " " << node;
00815     }
00816     iout << "\n" << endi;
00817   }
00818 #endif
00819 
00820   delete [] source_flags;
00821 
00822   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
00823   //           CkMyPe(), numSources, numDestRecipPes);
00824 
00825   }  // decide how many pes this node exchanges charges with (end)
00826 
00827   ungrid_count = numDestRecipPes;
00828 
00829   sendTransBarrier_received = 0;
00830 
00831   if ( myGridPe < 0 && myTransPe < 0 ) return;
00832   // the following only for nodes doing reciprocal sum
00833 
00834   if ( myTransPe >= 0 ) {
00835       int k2_start = localInfo[myTransPe].y_start_after_transpose;
00836       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
00837       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
00838   }
00839 
00840   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
00841   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
00842   if ( local_size < local_size_2 ) local_size = local_size_2;
00843   qgrid = new float[local_size*numGrids];
00844   if ( numGridPes > 1 || numTransPes > 1 ) {
00845     kgrid = new float[local_size*numGrids];
00846   } else {
00847     kgrid = qgrid;
00848   }
00849   qgrid_size = local_size;
00850 
00851   if ( myGridPe >= 0 ) {
00852   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
00853   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
00854   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
00855   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
00856   }
00857 
00858   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
00859 
00860 #ifdef NAMD_FFTW
00861   CmiLock(fftw_plan_lock);
00862 
00863   work = new fftw_complex[n[0]];
00864 
00865   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
00866   if ( myGridPe >= 0 ) {
00867   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
00868         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00869         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
00870   }
00871   if ( ! CkMyPe() ) iout << " 2..." << endi;
00872   if ( myTransPe >= 0 ) {
00873   forward_plan_x = fftw_create_plan_specific(n[0], FFTW_REAL_TO_COMPLEX,
00874         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00875         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
00876         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
00877   }
00878   if ( ! CkMyPe() ) iout << " 3..." << endi;
00879   if ( myTransPe >= 0 ) {
00880   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_COMPLEX_TO_REAL,
00881         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00882         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
00883         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
00884   }
00885   if ( ! CkMyPe() ) iout << " 4..." << endi;
00886   if ( myGridPe >= 0 ) {
00887   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
00888         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00889         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
00890   }
00891   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
00892 
00893   CmiUnlock(fftw_plan_lock);
00894 #else
00895   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00896 #endif
00897 
00898   if ( myGridPe >= 0 && numSources == 0 )
00899                 NAMD_bug("PME grid elements exist without sources.");
00900   grid_count = numSources;
00901   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
00902   trans_count = numGridPes;
00903 }
00904 
00905 
00906 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
00907   delete msg;
00908   if ( ! usePencils ) return;
00909 
00910   SimParameters *simParams = Node::Object()->simParameters;
00911 
00912   PatchMap *patchMap = PatchMap::Object();
00913   Lattice lattice = simParams->lattice;
00914   BigReal sysdima = lattice.a_r().unit() * lattice.a();
00915   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
00916   BigReal cutoff = simParams->cutoff;
00917   BigReal patchdim = simParams->patchDimension;
00918   int numPatches = patchMap->numPatches();
00919 
00920   pencilActive = new char[xBlocks*yBlocks];
00921   for ( int i=0; i<xBlocks; ++i ) {
00922     for ( int j=0; j<yBlocks; ++j ) {
00923       pencilActive[i*yBlocks+j] = 0;
00924     }
00925   }
00926 
00927   for ( int pid=0; pid < numPatches; ++pid ) {
00928     int pnode = patchMap->node(pid);
00929     if ( pnode != CkMyPe() ) continue;
00930 
00931     BigReal minx = patchMap->min_a(pid);
00932     BigReal maxx = patchMap->max_a(pid);
00933     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
00934     // min1 (max1) is smallest (largest) grid line for this patch
00935     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
00936     int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
00937 
00938     BigReal miny = patchMap->min_b(pid);
00939     BigReal maxy = patchMap->max_b(pid);
00940     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
00941     // min2 (max2) is smallest (largest) grid line for this patch
00942     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) - myGrid.order + 1;
00943     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb)));
00944 
00945     for ( int i=min1; i<=max1; ++i ) {
00946       int ix = i;
00947       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
00948       while ( ix < 0 ) ix += myGrid.K1;
00949       for ( int j=min2; j<=max2; ++j ) {
00950         int jy = j;
00951         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
00952         while ( jy < 0 ) jy += myGrid.K2;
00953         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
00954       }
00955     }
00956   }
00957 
00958   numPencilsActive = 0;
00959   for ( int i=0; i<xBlocks; ++i ) {
00960     for ( int j=0; j<yBlocks; ++j ) {
00961       if ( pencilActive[i*yBlocks+j] ) {
00962         ++numPencilsActive;
00963         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
00964       }
00965     }
00966   }
00967   //if ( numPencilsActive ) {
00968   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
00969   //}
00970 
00971   ungrid_count = numPencilsActive;
00972 }
00973 
00974 
00975 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
00976   if ( ! usePencils ) return;
00977   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
00978 }
00979 
00980 
00981 ComputePmeMgr::~ComputePmeMgr() {
00982 
00983 #ifdef NAMD_FFTW
00984   if ( CmiMyRank() == 0 ) {
00985     CmiDestroyLock(fftw_plan_lock);
00986   }
00987 #endif
00988 
00989   delete myKSpace;
00990   delete [] localInfo;
00991   delete [] gridPeMap;
00992   delete [] transPeMap;
00993   delete [] recipPeDest;
00994   delete [] gridPeOrder;
00995   delete [] transPeOrder;
00996   delete [] isPmeFlag;
00997   delete [] qgrid;
00998   if ( kgrid != qgrid ) delete [] kgrid;
00999   delete [] work;
01000   delete [] gridmsg_reuse;
01001 }
01002 
01003 void ComputePmeMgr::sendGrid(void) {
01004   pmeCompute->sendData(numGridPes,gridPeOrder,recipPeDest,gridPeMap);
01005 }
01006 
01007 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01008   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01009   if ( grid_count == 0 ) {
01010     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01011   }
01012   if ( grid_count == numSources ) {
01013     lattice = msg->lattice;
01014     sequence = msg->sequence;
01015   }
01016 
01017   int zdim = myGrid.dim3;
01018   int zlistlen = msg->zlistlen;
01019   int *zlist = msg->zlist;
01020   float *qmsg = msg->qgrid;
01021   for ( int g=0; g<numGrids; ++g ) {
01022     char *f = msg->fgrid + fgrid_len * g;
01023     float *q = qgrid + qgrid_size * g;
01024     for ( int i=0; i<fgrid_len; ++i ) {
01025       if ( f[i] ) {
01026         for ( int k=0; k<zlistlen; ++k ) {
01027           q[zlist[k]] += *(qmsg++);
01028         }
01029       }
01030       q += zdim;
01031     }
01032   }
01033 
01034   gridmsg_reuse[numSources-grid_count] = msg;
01035   --grid_count;
01036 
01037   if ( grid_count == 0 ) {
01038     pmeProxyDir[CkMyPe()].gridCalc1();
01039     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01040   }
01041 }
01042 
01043 void ComputePmeMgr::gridCalc1(void) {
01044   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01045 
01046 #ifdef NAMD_FFTW
01047   for ( int g=0; g<numGrids; ++g ) {
01048     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01049         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01050   }
01051 #endif
01052 
01053   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01054 }
01055 
01056 void ComputePmeMgr::sendTransBarrier(void) {
01057   sendTransBarrier_received += 1;
01058   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01059   if ( sendTransBarrier_received < numGridPes ) return;
01060   sendTransBarrier_received = 0;
01061   for ( int i=0; i<numGridPes; ++i ) {
01062     pmeProxyDir[gridPeMap[i]].sendTrans();
01063   }
01064 }
01065 
01066 void ComputePmeMgr::sendTrans(void) {
01067   // CkPrintf("sendTrans on %d\n",myTransPe);
01068 
01069   // send data for transpose
01070   int zdim = myGrid.dim3;
01071   int nx = localInfo[myGridPe].nx;
01072   int x_start = localInfo[myGridPe].x_start;
01073   int slicelen = myGrid.K2 * zdim;
01074 
01075 #ifdef USE_COMM_LIB
01076   ComlibInstanceHandle cinst1 = CkGetComlibInstance(0);
01077   cinst1.beginIteration();
01078 #endif
01079 
01080 #if CMK_BLUEGENEL
01081   CmiNetworkProgressAfter (0);
01082 #endif
01083 
01084   for (int j=0; j<numTransPes; j++) {
01085     int pe = transPeOrder[j];  // different order on each node
01086     LocalPmeInfo &li = localInfo[pe];
01087     int cpylen = li.ny_after_transpose * zdim;
01088     PmeTransMsg *newmsg = new (nx * cpylen * numGrids,
01089                                 PRIORITY_SIZE) PmeTransMsg;
01090     newmsg->sourceNode = myGridPe;
01091     newmsg->lattice = lattice;
01092     newmsg->x_start = x_start;
01093     newmsg->nx = nx;
01094     for ( int g=0; g<numGrids; ++g ) {
01095       float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01096       float *qmsg = newmsg->qgrid + nx * cpylen * g;
01097       for ( int x = 0; x < nx; ++x ) {
01098         CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01099         q += slicelen;
01100         qmsg += cpylen;
01101       }
01102     }
01103     newmsg->sequence = sequence;
01104     SET_PRIORITY(newmsg,sequence,PME_TRANS_PRIORITY)
01105     pmeProxy[transPeMap[pe]].recvTrans(newmsg);
01106   }
01107  
01108   untrans_count = numTransPes;
01109 
01110 #ifdef USE_COMM_LIB
01111   cinst1.endIteration();
01112 #endif  
01113 }
01114 
01115 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01116   // CkPrintf("recvTrans on Pe(%d)\n",CkMyPe());
01117   if ( trans_count == numGridPes ) {
01118     lattice = msg->lattice;
01119     sequence = msg->sequence;
01120   }
01121 
01122   int zdim = myGrid.dim3;
01123   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01124   int ny = localInfo[myTransPe].ny_after_transpose;
01125   int x_start = msg->x_start;
01126   int nx = msg->nx;
01127   for ( int g=0; g<numGrids; ++g ) {
01128     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01129         (void*)(msg->qgrid + nx*ny*zdim*g), nx*ny*zdim*sizeof(float));
01130   }
01131 
01132   delete msg;
01133   --trans_count;
01134 
01135   if ( trans_count == 0 ) {
01136     pmeProxyDir[CkMyPe()].gridCalc2();
01137   }
01138 }
01139 
01140 void ComputePmeMgr::gridCalc2(void) {
01141   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
01142 
01143 #if CMK_BLUEGENEL
01144   CmiNetworkProgressAfter (0);
01145 #endif
01146 
01147   int zdim = myGrid.dim3;
01148   // int y_start = localInfo[myTransPe].y_start_after_transpose;
01149   int ny = localInfo[myTransPe].ny_after_transpose;
01150 
01151   for ( int g=0; g<numGrids; ++g ) {
01152     // finish forward FFT (x dimension)
01153 #ifdef NAMD_FFTW
01154     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01155         ny * zdim / 2, 1, work, 1, 0);
01156 #endif
01157 
01158     // reciprocal space portion of PME
01159     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01160     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01161                         lattice, ewaldcof, &(recip_evir2[g][1]));
01162     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
01163 
01164     // start backward FFT (x dimension)
01165 #ifdef NAMD_FFTW
01166     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01167         ny * zdim / 2, 1, work, 1, 0);
01168 #endif
01169   }
01170 
01171   pmeProxyDir[CkMyPe()].sendUntrans();
01172 }
01173 
01174 void ComputePmeMgr::sendUntrans(void) {
01175 
01176   int zdim = myGrid.dim3;
01177   int y_start = localInfo[myTransPe].y_start_after_transpose;
01178   int ny = localInfo[myTransPe].ny_after_transpose;
01179 
01180 #ifdef USE_COMM_LIB
01181   ComlibInstanceHandle cinst2 = CkGetComlibInstance(1); 
01182   cinst2.beginIteration();
01183 #endif  
01184 
01185 #if CMK_BLUEGENEL
01186   CmiNetworkProgressAfter (0);
01187 #endif
01188 
01189   // send data for reverse transpose
01190   for (int j=0; j<numGridPes; j++) {
01191     int pe = gridPeOrder[j];  // different order on each node
01192     LocalPmeInfo &li = localInfo[pe];
01193     int x_start =li.x_start;
01194     int nx = li.nx;
01195     PmeUntransMsg *newmsg = new (nx*ny*zdim*numGrids,numGrids,
01196                                 PRIORITY_SIZE) PmeUntransMsg;
01197     newmsg->sourceNode = myTransPe;
01198     newmsg->y_start = y_start;
01199     newmsg->ny = ny;
01200     for ( int g=0; g<numGrids; ++g ) {
01201       if ( j == 0 ) {  // only need these once
01202         newmsg->evir[g] = recip_evir2[g];
01203       } else {
01204         newmsg->evir[g] = 0.;
01205       }
01206       CmiMemcpy((void*)(newmsg->qgrid+nx*ny*zdim*g),
01207                 (void*)(kgrid + qgrid_size*g + x_start*ny*zdim),
01208                 nx*ny*zdim*sizeof(float));
01209     }
01210     SET_PRIORITY(newmsg,sequence,PME_UNTRANS_PRIORITY)
01211     pmeProxy[gridPeMap[pe]].recvUntrans(newmsg);
01212   }
01213 
01214 #ifdef USE_COMM_LIB
01215   cinst2.endIteration();
01216 #endif  
01217 
01218   trans_count = numGridPes;
01219 }
01220 
01221 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01222   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
01223   if ( untrans_count == numTransPes ) {
01224     for ( int g=0; g<numGrids; ++g ) {
01225       recip_evir[g] = 0.;
01226     }
01227   }
01228 
01229 #if CMK_BLUEGENEL
01230   CmiNetworkProgressAfter (0);
01231 #endif
01232 
01233   int g;
01234   for ( g=0; g<numGrids; ++g ) {
01235     recip_evir[g] += msg->evir[g];
01236   }
01237 
01238   int zdim = myGrid.dim3;
01239   // int x_start = localInfo[myGridPe].x_start;
01240   int nx = localInfo[myGridPe].nx;
01241   int y_start = msg->y_start;
01242   int ny = msg->ny;
01243   int slicelen = myGrid.K2 * zdim;
01244   int cpylen = ny * zdim;
01245   for ( g=0; g<numGrids; ++g ) {
01246     float *q = qgrid + qgrid_size * g + y_start * zdim;
01247     float *qmsg = msg->qgrid + nx * cpylen * g;
01248     for ( int x = 0; x < nx; ++x ) {
01249       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
01250       q += slicelen;
01251       qmsg += cpylen;
01252     }
01253   }
01254 
01255   delete msg;
01256   --untrans_count;
01257 
01258   if ( untrans_count == 0 ) {
01259     pmeProxyDir[CkMyPe()].gridCalc3();
01260   }
01261 }
01262 
01263 void ComputePmeMgr::gridCalc3(void) {
01264   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
01265 
01266   // finish backward FFT
01267 #ifdef NAMD_FFTW
01268   for ( int g=0; g<numGrids; ++g ) {
01269     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
01270         (fftw_complex *) (qgrid + qgrid_size * g),
01271         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
01272   }
01273 #endif
01274 
01275   pmeProxyDir[CkMyPe()].sendUngrid();
01276 }
01277 
01278 void ComputePmeMgr::sendUngrid(void) {
01279 
01280   for ( int j=0; j<numSources; ++j ) {
01281     // int msglen = qgrid_len;
01282     PmeGridMsg *newmsg = gridmsg_reuse[j];
01283     int pe = newmsg->sourceNode;
01284     if ( j == 0 ) {  // only need these once
01285       for ( int g=0; g<numGrids; ++g ) {
01286         newmsg->evir[g] = recip_evir[g];
01287       }
01288     } else {
01289       for ( int g=0; g<numGrids; ++g ) {
01290         newmsg->evir[g] = 0.;
01291       }
01292     }
01293     int zdim = myGrid.dim3;
01294     int flen = newmsg->len;
01295     int fstart = newmsg->start;
01296     int zlistlen = newmsg->zlistlen;
01297     int *zlist = newmsg->zlist;
01298     float *qmsg = newmsg->qgrid;
01299     for ( int g=0; g<numGrids; ++g ) {
01300       char *f = newmsg->fgrid + fgrid_len * g;
01301       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
01302       for ( int i=0; i<flen; ++i ) {
01303         if ( f[i] ) {
01304           for ( int k=0; k<zlistlen; ++k ) {
01305             *(qmsg++) = q[zlist[k]];
01306           }
01307         }
01308         q += zdim;
01309       }
01310     }
01311     newmsg->sourceNode = myGridPe;
01312 
01313     SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
01314     pmeProxyDir[pe].recvUngrid(newmsg);
01315   }
01316   grid_count = numSources;
01317   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01318 }
01319 
01320 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
01321   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
01322   if ( ungrid_count == 0 ) {
01323     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
01324   }
01325 
01326   if ( usePencils ) pmeCompute->copyPencils(msg);
01327   else pmeCompute->copyResults(msg);
01328   delete msg;
01329   --ungrid_count;
01330 
01331   if ( ungrid_count == 0 ) {
01332     pmeProxyDir[CkMyPe()].ungridCalc();
01333   }
01334 }
01335 
01336 void ComputePmeMgr::ungridCalc(void) {
01337   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
01338 
01339   pmeCompute->ungridForces();
01340 
01341   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
01342 }
01343 
01344 
01345 ComputePme::ComputePme(ComputeID c) :
01346   ComputeHomePatches(c)
01347 {
01348   DebugM(4,"ComputePme created.\n");
01349 
01350   basePriority = PME_PRIORITY;
01351 
01352   CProxy_ComputePmeMgr::ckLocalBranch(
01353         CkpvAccess(BOCclass_group).computePmeMgr)->setCompute(this);
01354 
01355   useAvgPositions = 1;
01356 
01357   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01358 
01359   SimParameters *simParams = Node::Object()->simParameters;
01360 
01361   alchFepOn = simParams->alchFepOn;
01362   alchThermIntOn = simParams->alchThermIntOn;
01363   alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
01364   alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? 
01365     simParams->alchElecLambdaStart : 0;
01366             
01367   if (alchFepOn || alchThermIntOn) {
01368     numGrids = 2;
01369     if (alchDecouple) numGrids += 2;
01370     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
01371   }
01372   else numGrids = 1;
01373   lesOn = simParams->lesOn;
01374   if ( lesOn ) {
01375     lesFactor = simParams->lesFactor;
01376     numGrids = lesFactor;
01377   }
01378   selfOn = 0;
01379   pairOn = simParams->pairInteractionOn;
01380   if ( pairOn ) {
01381     selfOn = simParams->pairInteractionSelf;
01382     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
01383     numGrids = selfOn ? 1 : 3;
01384   }
01385 
01386   myGrid.K1 = simParams->PMEGridSizeX;
01387   myGrid.K2 = simParams->PMEGridSizeY;
01388   myGrid.K3 = simParams->PMEGridSizeZ;
01389   myGrid.order = simParams->PMEInterpOrder;
01390   myGrid.dim2 = myGrid.K2;
01391   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01392   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
01393   fsize = myGrid.K1 * myGrid.dim2;
01394   q_arr = new double*[fsize*numGrids];
01395   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(double*) );
01396   f_arr = new char[fsize*numGrids];
01397   fz_arr = new char[myGrid.K3];
01398 }
01399 
01400 ComputePme::~ComputePme()
01401 {
01402   for (int i=0; i<fsize*numGrids; ++i) {
01403     if ( q_arr[i] ) {
01404       delete [] q_arr[i];
01405     }
01406   }
01407   delete [] q_arr;
01408   delete [] f_arr;
01409   delete [] fz_arr;
01410 }
01411 
01412 void ComputePme::doWork()
01413 {
01414   DebugM(4,"Entering ComputePme::doWork().\n");
01415 
01416 #ifdef TRACE_COMPUTE_OBJECTS
01417     double traceObjStartTime = CmiWallTimer();
01418 #endif
01419 
01420   ResizeArrayIter<PatchElem> ap(patchList);
01421 
01422   // Skip computations if nothing to do.
01423   if ( ! patchList[0].p->flags.doFullElectrostatics )
01424   {
01425     for (ap = ap.begin(); ap != ap.end(); ap++) {
01426       CompAtom *x = (*ap).positionBox->open();
01427       Results *r = (*ap).forceBox->open();
01428       (*ap).positionBox->close(&x);
01429       (*ap).forceBox->close(&r);
01430     }
01431     reduction->submit();
01432     return;
01433   }
01434 
01435   // allocate storage
01436   numLocalAtoms = 0;
01437   for (ap = ap.begin(); ap != ap.end(); ap++) {
01438     numLocalAtoms += (*ap).p->getNumAtoms();
01439   }
01440 
01441   Lattice lattice = patchList[0].p->flags.lattice;
01442 
01443   localData = new PmeParticle[numLocalAtoms*(numGrids+
01444                                         ((numGrids>1 || selfOn)?1:0))];
01445   localPartition = new char[numLocalAtoms];
01446 
01447   int g;
01448   for ( g=0; g<numGrids; ++g ) {
01449     localGridData[g] = localData + numLocalAtoms*(g+1);
01450   }
01451 
01452   // get positions and charges
01453   PmeParticle * data_ptr = localData;
01454   char * part_ptr = localPartition;
01455   const BigReal coloumb_sqrt = sqrt( COLOUMB * ComputeNonbondedUtil::scaling
01456                                 * ComputeNonbondedUtil::dielectric_1 );
01457 
01458   for (ap = ap.begin(); ap != ap.end(); ap++) {
01459 #ifdef NETWORK_PROGRESS
01460     CmiNetworkProgress();
01461 #endif
01462 
01463     CompAtom *x = (*ap).positionBox->open();
01464     // CompAtomExt *xExt = (*ap).p->getCompAtomExtInfo();
01465     if ( patchList[0].p->flags.doMolly ) {
01466       (*ap).positionBox->close(&x);
01467       x = (*ap).avgPositionBox->open();
01468     }
01469     int numAtoms = (*ap).p->getNumAtoms();
01470 
01471     for(int i=0; i<numAtoms; ++i)
01472     {
01473       data_ptr->x = x[i].position.x;
01474       data_ptr->y = x[i].position.y;
01475       data_ptr->z = x[i].position.z;
01476       data_ptr->cg = coloumb_sqrt * x[i].charge;
01477       ++data_ptr;
01478       *part_ptr = x[i].partition;
01479       ++part_ptr;
01480     }
01481 
01482     if ( patchList[0].p->flags.doMolly ) { (*ap).avgPositionBox->close(&x); }
01483     else { (*ap).positionBox->close(&x); }
01484   }
01485 
01486   // copy to other grids if needed
01487   if ( ((alchFepOn || alchThermIntOn) && (!alchDecouple)) || lesOn ) {
01488     for ( g=0; g<numGrids; ++g ) {
01489 #ifdef NETWORK_PROGRESS
01490       CmiNetworkProgress();
01491 #endif
01492 
01493       PmeParticle *lgd = localGridData[g];
01494       int nga = 0;
01495       for(int i=0; i<numLocalAtoms; ++i) {
01496         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
01497           // for FEP/TI: grid 0 gets non-alch + partition 1;
01498           // grid 1 gets non-alch + partition 2;
01499           // grid 2 (only if called for with numGrids=3) gets only non-alch
01500           lgd[nga++] = localData[i];
01501         }
01502       }
01503       numGridAtoms[g] = nga;
01504     }
01505   } else if ( (alchFepOn || alchThermIntOn) && alchDecouple) {
01506     // alchemical decoupling: four grids
01507     // g=0: partition 0 and partition 1
01508     // g=1: partition 0 and partition 2
01509     // g=2: only partition 1 atoms
01510     // g=3: only partition 2 atoms
01511     // plus one grid g=4, only partition 0, if numGrids=5
01512     for ( g=0; g<2; ++g ) {  // same as before for first 2
01513 #ifdef NETWORK_PROGRESS
01514       CmiNetworkProgress();
01515 #endif
01516 
01517       PmeParticle *lgd = localGridData[g];
01518       int nga = 0;
01519       for(int i=0; i<numLocalAtoms; ++i) {
01520         if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
01521           lgd[nga++] = localData[i];
01522         }
01523       }
01524       numGridAtoms[g] = nga;
01525     }
01526     for (g=2 ; g<4 ; ++g ) {  // only alchemical atoms for these 2
01527 #ifdef NETWORK_PROGRESS
01528       CmiNetworkProgress();
01529 #endif
01530 
01531       PmeParticle *lgd = localGridData[g];
01532       int nga = 0;
01533       for(int i=0; i<numLocalAtoms; ++i) {
01534         if ( localPartition[i] == (g-1) ) {
01535           lgd[nga++] = localData[i];
01536         }
01537       }
01538       numGridAtoms[g] = nga;
01539     }
01540     for (g=4 ; g<numGrids ; ++g ) {  // only non-alchemical atoms 
01541       // numGrids=5 only if alchElecLambdaStart > 0
01542 #ifdef NETWORK_PROGRESS
01543       CmiNetworkProgress();
01544 #endif
01545 
01546       PmeParticle *lgd = localGridData[g];
01547       int nga = 0;
01548       for(int i=0; i<numLocalAtoms; ++i) {
01549         if ( localPartition[i] == 0 ) {
01550           lgd[nga++] = localData[i];
01551         }
01552       }
01553       numGridAtoms[g] = nga;
01554     }
01555   } else if ( selfOn ) {
01556     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
01557     g = 0;
01558     PmeParticle *lgd = localGridData[g];
01559     int nga = 0;
01560     for(int i=0; i<numLocalAtoms; ++i) {
01561       if ( localPartition[i] == 1 ) {
01562         lgd[nga++] = localData[i];
01563       }
01564     }
01565     numGridAtoms[g] = nga;
01566   } else if ( pairOn ) {
01567     if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
01568     g = 0;
01569     PmeParticle *lgd = localGridData[g];
01570     int nga = 0;
01571     for(int i=0; i<numLocalAtoms; ++i) {
01572       if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
01573         lgd[nga++] = localData[i];
01574       }
01575     }
01576     numGridAtoms[g] = nga;
01577     for ( g=1; g<3; ++g ) {
01578       PmeParticle *lgd = localGridData[g];
01579       int nga = 0;
01580       for(int i=0; i<numLocalAtoms; ++i) {
01581         if ( localPartition[i] == g ) {
01582           lgd[nga++] = localData[i];
01583         }
01584       }
01585       numGridAtoms[g] = nga;
01586     }
01587   } else {
01588     if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
01589     localGridData[0] = localData;
01590     numGridAtoms[0] = numLocalAtoms;
01591   }
01592 
01593   memset( (void*) fz_arr, 0, myGrid.K3 * sizeof(char) );
01594 
01595   // calculate self energy
01596   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01597   for ( g=0; g<numGrids; ++g ) {
01598 #ifdef NETWORK_PROGRESS
01599     CmiNetworkProgress();
01600 #endif
01601 
01602     evir[g] = 0;
01603     BigReal selfEnergy = 0;
01604     data_ptr = localGridData[g];
01605     int i;
01606     for(i=0; i<numGridAtoms[g]; ++i)
01607     {
01608       selfEnergy += data_ptr->cg * data_ptr->cg;
01609       ++data_ptr;
01610     }
01611     selfEnergy *= -1. * ewaldcof / SQRT_PI;
01612     evir[g][0] += selfEnergy;
01613 
01614     double **q = q_arr + g*fsize;
01615     for (i=0; i<fsize; ++i) {
01616       if ( q[i] ) {
01617         memset( (void*) (q[i]), 0, myGrid.dim3 * sizeof(double) );
01618       }
01619     }
01620 
01621     char *f = f_arr + g*fsize;
01622     memset( (void*) f, 0, fsize * sizeof(char) );
01623     myRealSpace[g] = new PmeRealSpace(myGrid,numGridAtoms[g]);
01624     scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
01625     myRealSpace[g]->fill_charges(q, f, fz_arr, localGridData[g]);
01626   }
01627 
01628 #ifdef TRACE_COMPUTE_OBJECTS
01629     traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
01630 #endif
01631 
01632   if ( myMgr->usePencils ) {
01633     sendPencils();
01634   } else {
01635 #if 0
01636   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
01637   pmeProxy[CkMyPe()].sendGrid();
01638 #else
01639   sendData(myMgr->numGridPes,myMgr->gridPeOrder,
01640                 myMgr->recipPeDest,myMgr->gridPeMap);
01641 #endif
01642   }
01643 
01644   if ( myMgr->ungrid_count == 0 ) {
01645     myMgr->pmeProxyDir[CkMyPe()].ungridCalc();
01646   }
01647 
01648 }
01649 
01650 
01651 void ComputePme::sendPencils() {
01652 
01653   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
01654 
01655   int xBlocks = myMgr->xBlocks;
01656   int yBlocks = myMgr->yBlocks;
01657   int zBlocks = myMgr->zBlocks;
01658   myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01659   myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01660   int K1 = myGrid.K1;
01661   int K2 = myGrid.K2;
01662   int dim2 = myGrid.dim2;
01663   int dim3 = myGrid.dim3;
01664   int block1 = myGrid.block1;
01665   int block2 = myGrid.block2;
01666 
01667   Lattice lattice = patchList[0].p->flags.lattice;
01668 
01669   resultsRemaining = myMgr->numPencilsActive;
01670   const char *pencilActive = myMgr->pencilActive;
01671 
01672   strayChargeErrors = 0;
01673   // int savedMessages = 0;
01674 
01675   for (int ib=0; ib<xBlocks; ++ib) {
01676    for (int jb=0; jb<yBlocks; ++jb) {
01677     int ibegin = ib*block1;
01678     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
01679     int jbegin = jb*block2;
01680     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
01681     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
01682     int fcount = 0;
01683 
01684     for ( int g=0; g<numGrids; ++g ) {
01685       char *f = f_arr + g*fsize;
01686       int fcount_g = 0;
01687       for ( int i=ibegin; i<iend; ++i ) {
01688        for ( int j=jbegin; j<jend; ++j ) {
01689         fcount_g += ( f[i*dim2+j] ? 1 : 0 );
01690        }
01691       }
01692       fcount += fcount_g;
01693       if ( ! pencilActive[ib*yBlocks+jb] ) {
01694         if ( fcount_g ) {
01695           ++strayChargeErrors;
01696           iout << iERROR << "Stray PME grid charges detected: "
01697                 << CkMyPe() << " sending to (x,y)";
01698           for ( int i=ibegin; i<iend; ++i ) {
01699            for ( int j=jbegin; j<jend; ++j ) {
01700             if ( f[i*dim2+j] ) { iout << " (" << i << "," << j << ")"; }
01701            }
01702           }
01703           iout << "\n" << endi;
01704         }
01705       }
01706     }
01707 
01708 #ifdef NETWORK_PROGRESS
01709     CmiNetworkProgress();
01710 #endif
01711 
01712     if ( ! pencilActive[ib*yBlocks+jb] ) continue;
01713 
01714     int zlistlen = 0;
01715     for ( int i=0; i<myGrid.K3; ++i ) {
01716       if ( fz_arr[i] ) ++zlistlen;
01717     }
01718 
01719     int hd = ( fcount? 1 : 0 );  // has data?
01720     // if ( ! hd ) ++savedMessages;
01721 
01722     PmeGridMsg *msg = new (hd*fcount*zlistlen, hd*zlistlen, hd*flen,
01723         hd*numGrids, PRIORITY_SIZE) PmeGridMsg;
01724     msg->sourceNode = CkMyPe();
01725     msg->hasData = hd;
01726     msg->lattice = lattice;
01727    if ( hd ) {
01728 #if 0
01729     msg->start = fstart;
01730     msg->len = flen;
01731 #else
01732     msg->start = -1;   // obsolete?
01733     msg->len = -1;   // obsolete?
01734 #endif
01735     msg->zlistlen = zlistlen;
01736     int *zlist = msg->zlist;
01737     zlistlen = 0;
01738     for ( int i=0; i<myGrid.K3; ++i ) {
01739       if ( fz_arr[i] ) zlist[zlistlen++] = i;
01740     }
01741     char *fmsg = msg->fgrid;
01742     float *qmsg = msg->qgrid;
01743     for ( int g=0; g<numGrids; ++g ) {
01744       char *f = f_arr + g*fsize;
01745       double **q = q_arr + g*fsize;
01746       for ( int i=ibegin; i<iend; ++i ) {
01747        for ( int j=jbegin; j<jend; ++j ) {
01748         *(fmsg++) = f[i*dim2+j];
01749         if( f[i*dim2+j] ) {
01750           for ( int k=0; k<zlistlen; ++k ) {
01751             *(qmsg++) = q[i*dim2+j][zlist[k]];
01752           }
01753         }
01754        }
01755       }
01756     }
01757    } else { // no data no reply
01758      myMgr->ungrid_count--;
01759    }
01760 
01761     msg->sequence = sequence();
01762     SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
01763     myMgr->zPencil(ib,jb,0).recvGrid(msg);
01764    }
01765   }
01766 
01767   // if ( savedMessages ) {
01768   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
01769   // }
01770 
01771   for (int i=0; i<fsize; ++i) {
01772     if ( q_arr[i] ) {
01773       memset( (void*) (q_arr[i]), -1, myGrid.dim3 * sizeof(double) );
01774     }
01775   }
01776 
01777 }
01778 
01779 
01780 void ComputePme::copyPencils(PmeGridMsg *msg) {
01781 
01782   int xBlocks = myMgr->xBlocks;
01783   int yBlocks = myMgr->yBlocks;
01784   int zBlocks = myMgr->zBlocks;
01785   myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01786   myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01787   int K1 = myGrid.K1;
01788   int K2 = myGrid.K2;
01789   int dim2 = myGrid.dim2;
01790   int dim3 = myGrid.dim3;
01791   int block1 = myGrid.block1;
01792   int block2 = myGrid.block2;
01793 
01794   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
01795   int ib = msg->sourceNode / yBlocks;
01796   int jb = msg->sourceNode % yBlocks;
01797 
01798   int ibegin = ib*block1;
01799   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
01800   int jbegin = jb*block2;
01801   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
01802 
01803   int zlistlen = msg->zlistlen;
01804   int *zlist = msg->zlist;
01805   float *qmsg = msg->qgrid;
01806   int g;
01807   for ( g=0; g<numGrids; ++g ) {
01808     evir[g] += msg->evir[g];
01809     char *f = f_arr + g*fsize;
01810     double **q = q_arr + g*fsize;
01811     for ( int i=ibegin; i<iend; ++i ) {
01812      for ( int j=jbegin; j<jend; ++j ) {
01813       if( f[i*dim2+j] ) {
01814         for ( int k=0; k<zlistlen; ++k ) {
01815           q[i*dim2+j][zlist[k]] = *(qmsg++);
01816         }
01817       }
01818      }
01819     }
01820   }
01821 }
01822 
01823 
01824 void ComputePme::sendData(int numRecipPes, int *recipPeOrder,
01825                                 int *recipPeDest, int *gridPeMap) {
01826 
01827   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
01828 
01829   myGrid.block1 = ( myGrid.K1 + numRecipPes - 1 ) / numRecipPes;
01830   myGrid.block2 = ( myGrid.K2 + numRecipPes - 1 ) / numRecipPes;
01831   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
01832 
01833   Lattice lattice = patchList[0].p->flags.lattice;
01834 
01835   resultsRemaining = numRecipPes;
01836 
01837   strayChargeErrors = 0;
01838 
01839   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
01840   for (int j=0; j<numRecipPes; j++) {
01841     int pe = recipPeOrder[j];  // different order
01842     int start = pe * bsize;
01843     int len = bsize;
01844     if ( start >= qsize ) { start = 0; len = 0; }
01845     if ( start + len > qsize ) { len = qsize - start; }
01846     int zdim = myGrid.dim3;
01847     int fstart = start / zdim;
01848     int flen = len / zdim;
01849     int fcount = 0;
01850     int i;
01851 
01852     int g;
01853     for ( g=0; g<numGrids; ++g ) {
01854       char *f = f_arr + fstart + g*fsize;
01855       int fcount_g = 0;
01856       for ( i=0; i<flen; ++i ) {
01857         fcount_g += ( f[i] ? 1 : 0 );
01858       }
01859       fcount += fcount_g;
01860       if ( ! recipPeDest[pe] ) {
01861         if ( fcount_g ) {
01862           ++strayChargeErrors;
01863           iout << iERROR << "Stray PME grid charges detected: "
01864                 << CkMyPe() << " sending to " << gridPeMap[pe] << " for planes";
01865           int iz = -1;
01866           for ( i=0; i<flen; ++i ) {
01867             if ( f[i] ) {
01868               int jz = (i+fstart)/myGrid.K2;
01869               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
01870             }
01871           }
01872           iout << "\n" << endi;
01873         }
01874       }
01875     }
01876 
01877 #ifdef NETWORK_PROGRESS
01878     CmiNetworkProgress();
01879 #endif
01880 
01881     if ( ! recipPeDest[pe] ) continue;
01882 
01883     int zlistlen = 0;
01884     for ( i=0; i<myGrid.K3; ++i ) {
01885       if ( fz_arr[i] ) ++zlistlen;
01886     }
01887 
01888     PmeGridMsg *msg = new (fcount*zlistlen, zlistlen, flen*numGrids,
01889                                 numGrids, PRIORITY_SIZE) PmeGridMsg;
01890     msg->sourceNode = CkMyPe();
01891     msg->lattice = lattice;
01892     msg->start = fstart;
01893     msg->len = flen;
01894     msg->zlistlen = zlistlen;
01895     int *zlist = msg->zlist;
01896     zlistlen = 0;
01897     for ( i=0; i<myGrid.K3; ++i ) {
01898       if ( fz_arr[i] ) zlist[zlistlen++] = i;
01899     }
01900     float *qmsg = msg->qgrid;
01901     for ( g=0; g<numGrids; ++g ) {
01902       char *f = f_arr + fstart + g*fsize;
01903       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
01904       double **q = q_arr + fstart + g*fsize;
01905       for ( i=0; i<flen; ++i ) {
01906         if ( f[i] ) {
01907           for ( int k=0; k<zlistlen; ++k ) {
01908             *(qmsg++) = q[i][zlist[k]];
01909           }
01910         }
01911       }
01912     }
01913 
01914     msg->sequence = sequence();
01915     SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
01916     pmeProxy[gridPeMap[pe]].recvGrid(msg);
01917   }
01918 
01919   for (int i=0; i<fsize; ++i) {
01920     if ( q_arr[i] ) {
01921       memset( (void*) (q_arr[i]), -1, myGrid.dim3 * sizeof(double) );
01922     }
01923   }
01924 
01925 }
01926 
01927 void ComputePme::copyResults(PmeGridMsg *msg) {
01928 
01929   int zdim = myGrid.dim3;
01930   int flen = msg->len;
01931   int fstart = msg->start;
01932   int zlistlen = msg->zlistlen;
01933   int *zlist = msg->zlist;
01934   float *qmsg = msg->qgrid;
01935   int g;
01936   for ( g=0; g<numGrids; ++g ) {
01937     evir[g] += msg->evir[g];
01938     char *f = msg->fgrid + g*flen;
01939     double **q = q_arr + fstart + g*fsize;
01940     for ( int i=0; i<flen; ++i ) {
01941       if ( f[i] ) {
01942         for ( int k=0; k<zlistlen; ++k ) {
01943           q[i][zlist[k]] = *(qmsg++);
01944         }
01945       }
01946     }
01947   }
01948 }
01949 
01950 void ComputePme::ungridForces() {
01951 
01952     SimParameters *simParams = Node::Object()->simParameters;
01953 
01954     Vector *localResults = new Vector[numLocalAtoms*
01955                                         ((numGrids>1 || selfOn)?2:1)];
01956     Vector *gridResults;
01957     if ( alchFepOn || alchThermIntOn || lesOn || selfOn || pairOn ) {
01958       for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
01959       gridResults = localResults + numLocalAtoms;
01960     } else {
01961       gridResults = localResults;
01962     }
01963 
01964     Vector pairForce = 0.;
01965     Lattice lattice = patchList[0].p->flags.lattice;
01966     int g = 0;
01967     if(!simParams->commOnly) {
01968     for ( g=0; g<numGrids; ++g ) {
01969 #ifdef NETWORK_PROGRESS
01970       CmiNetworkProgress();
01971 #endif
01972 
01973       myRealSpace[g]->compute_forces(q_arr+g*fsize, localGridData[g], gridResults);
01974       delete myRealSpace[g];
01975       scale_forces(gridResults, numGridAtoms[g], lattice);
01976 
01977       if ( alchFepOn || alchThermIntOn ) {
01978         double scale = 1.;
01979         elecLambdaUp =  (simParams->alchLambda <= alchElecLambdaStart)? 0. : \
01980           (simParams->alchLambda - alchElecLambdaStart) / (1. - alchElecLambdaStart);
01981         elecLambdaDown =  ((1-simParams->alchLambda) <= alchElecLambdaStart)? 0. : ((1-simParams->alchLambda) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
01982         if ( g == 0 ) scale = elecLambdaUp;
01983         else if ( g == 1 ) scale = elecLambdaDown;
01984         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
01985         if (alchDecouple) {
01986           if ( g == 2 ) scale = 1 - elecLambdaUp;
01987           else if ( g == 3 ) scale = 1 - elecLambdaDown;
01988           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
01989         }
01990         int nga = 0;
01991         if (!alchDecouple) {
01992           for(int i=0; i<numLocalAtoms; ++i) {
01993             if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
01994               // (g=2: only partition 0)
01995               localResults[i] += gridResults[nga++] * scale;
01996             }
01997           }
01998         }
01999         else {  // alchDecouple
02000           if ( g < 2 ) {
02001             for(int i=0; i<numLocalAtoms; ++i) {
02002               if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02003                 // g = 0: partition 0 or partition 1
02004                 // g = 1: partition 0 or partition 2
02005                 localResults[i] += gridResults[nga++] * scale;
02006               }
02007             }
02008           }
02009           else {
02010             for(int i=0; i<numLocalAtoms; ++i) {
02011               if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
02012                 // g = 2: partition 1 only
02013                 // g = 3: partition 2 only
02014                 // g = 4: partition 0 only
02015                 localResults[i] += gridResults[nga++] * scale;
02016               }
02017             }
02018           }
02019         }
02020       } else if ( lesOn ) {
02021         double scale = 1.;
02022         if ( alchFepOn ) {
02023           if ( g == 0 ) scale = simParams->alchLambda;
02024           else if ( g == 1 ) scale = 1. - simParams->alchLambda;
02025         } else if ( lesOn ) {
02026           scale = 1.0 / (double)lesFactor;
02027         }
02028         int nga = 0;
02029         for(int i=0; i<numLocalAtoms; ++i) {
02030           if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02031             localResults[i] += gridResults[nga++] * scale;
02032           }
02033         }
02034       } else if ( selfOn ) {
02035         PmeParticle *lgd = localGridData[g];
02036         int nga = 0;
02037         for(int i=0; i<numLocalAtoms; ++i) {
02038           if ( localPartition[i] == 1 ) {
02039             pairForce += gridResults[nga];  // should add up to almost zero
02040             localResults[i] += gridResults[nga++];
02041           }
02042         }
02043       } else if ( pairOn ) {
02044         if ( g == 0 ) {
02045           int nga = 0;
02046           for(int i=0; i<numLocalAtoms; ++i) {
02047             if ( localPartition[i] == 1 ) {
02048               pairForce += gridResults[nga];
02049             }
02050             if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02051               localResults[i] += gridResults[nga++];
02052             }
02053           }
02054         } else if ( g == 1 ) {
02055           int nga = 0;
02056           for(int i=0; i<numLocalAtoms; ++i) {
02057             if ( localPartition[i] == g ) {
02058               pairForce -= gridResults[nga];  // should add up to almost zero
02059               localResults[i] -= gridResults[nga++];
02060             }
02061           }
02062         } else {
02063           int nga = 0;
02064           for(int i=0; i<numLocalAtoms; ++i) {
02065             if ( localPartition[i] == g ) {
02066               localResults[i] -= gridResults[nga++];
02067             }
02068           }
02069         }
02070       }
02071     }
02072     }
02073     else
02074     {// cleanup for commOnly to avoid leaking
02075       for ( g=0; g<numGrids; ++g ) {delete myRealSpace[g];}
02076     }
02077     delete [] localData;
02078     delete [] localPartition;
02079 
02080     Vector *results_ptr = localResults;
02081     ResizeArrayIter<PatchElem> ap(patchList);
02082 
02083     // add in forces
02084     for (ap = ap.begin(); ap != ap.end(); ap++) {
02085       Results *r = (*ap).forceBox->open();
02086       Force *f = r->f[Results::slow];
02087       int numAtoms = (*ap).p->getNumAtoms();
02088 
02089       if ( ! strayChargeErrors && ! simParams->commOnly ) {
02090         for(int i=0; i<numAtoms; ++i) {
02091           f[i].x += results_ptr->x;
02092           f[i].y += results_ptr->y;
02093           f[i].z += results_ptr->z;
02094           ++results_ptr;
02095         }
02096       }
02097   
02098       (*ap).forceBox->close(&r);
02099     }
02100 
02101     delete [] localResults;
02102    
02103     if ( pairOn || selfOn ) {
02104         ADD_VECTOR_OBJECT(reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
02105     }
02106 
02107     for ( g=0; g<numGrids; ++g ) {
02108       double scale = 1.;
02109       if ( alchFepOn || alchThermIntOn ) {
02110         if ( g == 0 ) scale = elecLambdaUp;
02111         else if ( g == 1 ) scale = elecLambdaDown;
02112         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02113         if (alchDecouple) {
02114           if ( g == 2 ) scale = 1-elecLambdaUp;
02115           else if ( g == 3 ) scale = 1-elecLambdaDown;
02116           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02117         }
02118       } else if ( lesOn ) {
02119         scale = 1.0 / (double)lesFactor;
02120       } else if ( pairOn ) {
02121         scale = ( g == 0 ? 1. : -1. );
02122       }
02123       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
02124       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
02125       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
02126       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
02127       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
02128       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
02129       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
02130       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
02131       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
02132       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
02133 
02134       double scale2 = 0.;
02135 
02136       //alchElecLambdaStart = (alchFepOn || alchThermIntOn) ? simParams->alchElecLambdaStart : 0;
02137 
02138       if (alchFepOn) {
02139         BigReal elecLambda2Up =  (simParams->alchLambda2 <= alchElecLambdaStart)? 0. : \
02140               (simParams->alchLambda2 - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02141         BigReal elecLambda2Down =  ((1-simParams->alchLambda2) <= alchElecLambdaStart)? 0. : \
02142               ((1-simParams->alchLambda2) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02143         
02144         
02145         if ( g == 0 ) scale2 = elecLambda2Up;
02146         else if ( g == 1 ) scale2 = elecLambda2Down;
02147         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
02148         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
02149         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
02150         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
02151       }
02152       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
02153       
02154       if (alchThermIntOn) {
02155         
02156         // no decoupling:
02157         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
02158         // (interactions between all atoms [partition 0 OR partition 1], 
02159         // minus all [within partition 0])
02160         // U = elecLambdaUp * (U[0] - U[2])
02161         // dU/dl = U[0] - U[2];
02162         
02163         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
02164         // (interactions between all atoms [partition 0 OR partition 2], 
02165         // minus all [within partition 0])
02166         // U = elecLambdaDown * (U[1] - U[2])
02167         // dU/dl = U[1] - U[2];
02168 
02169         // alchDecouple:
02170         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
02171         // (interactions between all atoms [partition 0 OR partition 1]
02172         // minus all [within partition 1] minus all [within partition 0]
02173         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
02174         // dU/dl = U[0] - U[2] - U[4];
02175 
02176         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
02177         // (interactions between all atoms [partition 0 OR partition 2]
02178         // minus all [within partition 2] minus all [within partition 0]
02179         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
02180         // dU/dl = U[1] - U[3] - U[4];
02181         
02182         
02183         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
02184         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
02185         if (!alchDecouple) {
02186           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
02187           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
02188         }
02189         else {  // alchDecouple
02190           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
02191           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
02192           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
02193           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
02194         }
02195       }
02196     }
02197     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
02198     reduction->submit();
02199 
02200 }
02201 
02202 #if USE_TOPOMAP 
02203 
02204 #define NPRIMES 8
02205 const static unsigned int NAMDPrimes[] = {
02206   3,
02207   5,
02208   7,
02209   11,
02210   13,
02211   17,
02212   19,
02213   23,  
02214   29,
02215   31,
02216   37,
02217   59,
02218   73,
02219   93,
02220   113,
02221   157,
02222   307,
02223   617,
02224   1217                  //This should b enough for 64K nodes of BGL. 
02225 };
02226 
02227 #include "RecBisection.h"
02228 
02229 /***-----------------------------------------------------**********
02230     The Orthogonal Recursive Bisection strategy, which allocates PME
02231     objects close to the patches they communicate, and at the same
02232     time spreads them around the grid 
02233 ****----------------------------------------------------------****/
02234 
02235 bool generateBGLORBPmePeList(int *pemap, int numPes, 
02236                              int *block_pes, int nbpes) {
02237 
02238   PatchMap *pmap = PatchMap::Object();
02239   int *pmemap = new int [CkNumPes()];
02240 
02241   if (pemap == NULL)
02242     return false;
02243 
02244   TopoManager tmgr;
02245 
02246   memset(pmemap, 0, sizeof(int) * CkNumPes());
02247 
02248   for(int count = 0; count < CkNumPes(); count++) {
02249     if(count < nbpes)
02250       pmemap[block_pes[count]] = 1;
02251     
02252     if(pmap->numPatchesOnNode(count)) {
02253       pmemap[count] = 1;
02254       
02255       //Assumes an XYZT mapping !!
02256       if(tmgr.hasMultipleProcsPerNode()) {
02257         pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
02258       }
02259     }
02260   }
02261 
02262   if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
02263     //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
02264     return false;
02265 
02266   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
02267   Node *node = nd.ckLocalBranch();
02268   SimParameters *simParams = node->simParameters;
02269 
02270   //first split PME processors into patch groups
02271 
02272   int xsize = 0, ysize = 0, zsize = 0;
02273 
02274   xsize = tmgr.getDimX();
02275   ysize = tmgr.getDimY();
02276   zsize = tmgr.getDimZ();
02277   
02278   int nx = xsize, ny = ysize, nz = zsize;
02279   DimensionMap dm;
02280   
02281   dm.x = 0;
02282   dm.y = 1;
02283   dm.z = 2;
02284   
02285   findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
02286 
02287   //group size processors have to be allocated to each YZ plane
02288   int group_size = numPes/nx;
02289   if(numPes % nx)
02290     group_size ++;
02291 
02292   int my_prime = NAMDPrimes[0];
02293   int density = (ny * nz)/group_size + 1;
02294   int count = 0;
02295   
02296   //Choose a suitable prime Number
02297   for(count = 0; count < NPRIMES; count ++) {
02298     //Find a prime just greater than the density
02299     if(density < NAMDPrimes[count]) {
02300       my_prime = NAMDPrimes[count];
02301       break;
02302     }      
02303   }
02304   
02305   if(count == NPRIMES)
02306     my_prime = NAMDPrimes[NPRIMES-1];
02307 
02308   //int gcount = numPes/2;
02309   int gcount = 0;
02310   int npme_pes = 0;
02311   
02312   int coord[3];
02313 
02314   for(int x = 0; x < nx; x++) {
02315     coord[0] = (x + nx/2)%nx;
02316     
02317     for(count=0; count < group_size && npme_pes < numPes; count++) {
02318       int dest = (count + 1) * my_prime;      
02319       dest = dest % (ny * nz);      
02320       
02321       coord[2] = dest / ny;
02322       coord[1] = dest - coord[2] * ny;
02323       
02324       //Locate where in the actual grid the processor is
02325       int destPe = coord[dm.x] + coord[dm.y] * xsize + 
02326         coord[dm.z] * xsize* ysize;
02327       
02328       if(pmemap[destPe] == 0) {
02329         pemap[gcount++] = destPe;
02330         pmemap[destPe] = 1;
02331         
02332         if(tmgr.hasMultipleProcsPerNode())
02333           pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;     
02334 
02335         npme_pes ++;
02336       }
02337       else {
02338         for(int pos = 1; pos < ny * nz; pos++) {
02339           
02340           coord[2] += pos / ny;
02341           coord[1] += pos % ny;
02342           
02343           coord[2] = coord[2] % nz;
02344           coord[1] = coord[1] % ny;       
02345           
02346           int newdest = coord[dm.x] + coord[dm.y] * xsize + 
02347             coord[dm.z] * xsize * ysize;
02348           
02349           if(pmemap[newdest] == 0) {
02350             pemap[gcount++] = newdest;
02351             pmemap[newdest] = 1;
02352             
02353             if(tmgr.hasMultipleProcsPerNode())
02354               pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;        
02355             
02356             npme_pes ++;
02357             break;
02358           }
02359         }
02360       }      
02361     }   
02362     
02363     if(gcount == numPes)
02364       gcount = 0;    
02365     
02366     if(npme_pes >= numPes)
02367       break;
02368   }
02369   
02370   delete [] pmemap;
02371   
02372   if(npme_pes != numPes)
02373     //NAMD_bug("ORB PME allocator failed\n");
02374     return false;
02375 
02376   return true;
02377 }
02378 
02379 #endif
02380 
02381 template <class T> class PmePencil : public T {
02382 public:
02383   PmePencil() {
02384     data = 0;
02385     work = 0;
02386     send_order = 0;
02387     needs_reply = 0;
02388   }
02389   ~PmePencil() {
02390     delete [] data;
02391     delete [] work;
02392     delete [] send_order;
02393     delete [] needs_reply;
02394   }
02395   void base_init(PmePencilInitMsg *msg) {
02396     initdata = msg->data;
02397   }
02398   void order_init(int nBlocks) {
02399     send_order = new int[nBlocks];
02400     for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
02401     Random rand(CkMyPe());
02402     rand.reorder(send_order,nBlocks);
02403     needs_reply = new int[nBlocks];
02404   }
02405   PmePencilInitMsgData initdata;
02406   Lattice lattice;
02407   PmeReduction evir;
02408   int sequence;  // used for priorities
02409   int imsg;  // used in sdag code
02410   int hasData;  // used in message elimination
02411   float *data;
02412   float *work;
02413   int *send_order;
02414   int *needs_reply;
02415 };
02416 
02417 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
02418 public:
02419     PmeZPencil_SDAG_CODE
02420     PmeZPencil() { __sdag_init(); setMigratable(false); }
02421     PmeZPencil(CkMigrateMessage *) { __sdag_init();  setMigratable (false); }
02422     void fft_init();
02423     void recv_grid(const PmeGridMsg *);
02424     void forward_fft();
02425     void send_trans();
02426     void recv_untrans(const PmeUntransMsg *);
02427     void backward_fft();
02428     void send_ungrid(PmeGridMsg *);
02429 private:
02430     ResizeArray<PmeGridMsg *> grid_msgs;
02431 #ifdef NAMD_FFTW
02432     rfftwnd_plan forward_plan, backward_plan;
02433 #endif
02434     int nx, ny;
02435 };
02436 
02437 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
02438 public:
02439     PmeYPencil_SDAG_CODE
02440     PmeYPencil() { __sdag_init(); setMigratable(false); }
02441     PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
02442     void fft_init();
02443     void recv_trans(const PmeTransMsg *);
02444     void forward_fft();
02445     void send_trans();
02446     void recv_untrans(const PmeUntransMsg *);
02447     void backward_fft();
02448     void send_untrans();
02449 private:
02450 #ifdef NAMD_FFTW
02451     fftw_plan forward_plan, backward_plan;
02452 #endif
02453     int nx, nz;
02454 };
02455 
02456 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
02457 public:
02458     PmeXPencil_SDAG_CODE
02459     PmeXPencil() { __sdag_init();  myKSpace = 0; setMigratable(false); }
02460     PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
02461     void fft_init();
02462     void recv_trans(const PmeTransMsg *);
02463     void forward_fft();
02464     void pme_kspace();
02465     void backward_fft();
02466     void send_untrans();
02467 #ifdef NAMD_FFTW
02468     fftw_plan forward_plan, backward_plan;
02469 #endif
02470     int ny, nz;
02471     PmeKSpace *myKSpace;
02472 };
02473 
02474 void PmeZPencil::fft_init() {
02475   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
02476   Node *node = nd.ckLocalBranch();
02477   SimParameters *simParams = node->simParameters;
02478 
02479   int K1 = initdata.grid.K1;
02480   int K2 = initdata.grid.K2;
02481   int K3 = initdata.grid.K3;
02482   int dim3 = initdata.grid.dim3;
02483   int block1 = initdata.grid.block1;
02484   int block2 = initdata.grid.block2;
02485 
02486   nx = block1;
02487   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
02488   ny = block2;
02489   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
02490 
02491   data = new float[nx*ny*dim3];
02492   work = new float[dim3];
02493 
02494   order_init(initdata.zBlocks);
02495 
02496 #ifdef NAMD_FFTW
02497   CmiLock(ComputePmeMgr::fftw_plan_lock);
02498 
02499   forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
02500         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02501         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
02502   backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
02503         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02504         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
02505 
02506   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
02507 #else
02508   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
02509 #endif
02510 }
02511 
02512 void PmeYPencil::fft_init() {
02513   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
02514   Node *node = nd.ckLocalBranch();
02515   SimParameters *simParams = node->simParameters;
02516 
02517   int K1 = initdata.grid.K1;
02518   int K2 = initdata.grid.K2;
02519   int dim2 = initdata.grid.dim2;
02520   int dim3 = initdata.grid.dim3;
02521   int block1 = initdata.grid.block1;
02522   int block3 = initdata.grid.block3;
02523 
02524   nx = block1;
02525   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
02526   nz = block3;
02527   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
02528 
02529   data = new float[nx*dim2*nz*2];
02530   work = new float[2*K2];
02531 
02532   order_init(initdata.yBlocks);
02533 
02534 #ifdef NAMD_FFTW
02535   CmiLock(ComputePmeMgr::fftw_plan_lock);
02536 
02537   forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
02538         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02539         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
02540         nz, (fftw_complex *) work, 1);
02541   backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
02542         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02543         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
02544         nz, (fftw_complex *) work, 1);
02545 
02546   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
02547 #else
02548   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
02549 #endif
02550 }
02551 
02552 void PmeXPencil::fft_init() {
02553   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
02554   Node *node = nd.ckLocalBranch();
02555   SimParameters *simParams = node->simParameters;
02556 
02557   int K1 = initdata.grid.K1;
02558   int K2 = initdata.grid.K2;
02559   int dim3 = initdata.grid.dim3;
02560   int block2 = initdata.grid.block2;
02561   int block3 = initdata.grid.block3;
02562 
02563   ny = block2;
02564   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
02565   nz = block3;
02566   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
02567 
02568   data = new float[K1*block2*block3*2];
02569   work = new float[2*K1];
02570 
02571   order_init(initdata.xBlocks);
02572 
02573 #ifdef NAMD_FFTW
02574   CmiLock(ComputePmeMgr::fftw_plan_lock);
02575 
02576   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
02577         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02578         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
02579         ny*nz, (fftw_complex *) work, 1);
02580   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
02581         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
02582         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
02583         ny*nz, (fftw_complex *) work, 1);
02584 
02585   CmiUnlock(ComputePmeMgr::fftw_plan_lock);
02586 #else
02587   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
02588 #endif
02589 
02590   myKSpace = new PmeKSpace(initdata.grid,
02591                 thisIndex.y*block2, thisIndex.y*block2 + ny,
02592                 thisIndex.z*block3, thisIndex.z*block3 + nz);
02593 }
02594 
02595 // #define FFTCHECK   // run a grid of integers through the fft
02596 // #define ZEROCHECK  // check for suspicious zeros in fft
02597 
02598 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
02599 
02600   int dim3 = initdata.grid.dim3;
02601   if ( imsg == 0 ) {
02602     lattice = msg->lattice;
02603     sequence = msg->sequence;
02604     memset(data, 0, sizeof(float) * nx*ny*dim3);
02605   }
02606 
02607   if ( ! msg->hasData ) return;
02608 
02609   int zlistlen = msg->zlistlen;
02610   int *zlist = msg->zlist;
02611   char *fmsg = msg->fgrid;
02612   float *qmsg = msg->qgrid;
02613   float *d = data;
02614   int numGrids = 1;  // pencil FFT doesn't support multiple grids
02615   for ( int g=0; g<numGrids; ++g ) {
02616     for ( int i=0; i<nx; ++i ) {
02617      for ( int j=0; j<ny; ++j, d += dim3 ) {
02618       if( *(fmsg++) ) {
02619         for ( int k=0; k<zlistlen; ++k ) {
02620           d[zlist[k]] += *(qmsg++);
02621         }
02622       }
02623      }
02624     }
02625   }
02626 }
02627 
02628 void PmeZPencil::forward_fft() {
02629 #ifdef FFTCHECK
02630   int dim3 = initdata.grid.dim3;
02631   int K3 = initdata.grid.K3;
02632   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
02633   float *d = data;
02634   for ( int i=0; i<nx; ++i ) {
02635    for ( int j=0; j<ny; ++j, d += dim3 ) {
02636     for ( int k=0; k<dim3; ++k ) {
02637       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
02638     }
02639    }
02640   }
02641 #endif
02642 #ifdef NAMD_FFTW
02643   rfftwnd_real_to_complex(forward_plan, nx*ny,
02644         data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
02645 #endif
02646 #ifdef ZEROCHECK
02647   int dim3 = initdata.grid.dim3;
02648   int K3 = initdata.grid.K3;
02649   float *d = data;
02650   for ( int i=0; i<nx; ++i ) {
02651    for ( int j=0; j<ny; ++j, d += dim3 ) {
02652     for ( int k=0; k<dim3; ++k ) {
02653       if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
02654         thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
02655     }
02656    }
02657   }
02658 #endif
02659 }
02660 
02661 void PmeZPencil::send_trans() {
02662   int zBlocks = initdata.zBlocks;
02663   int block3 = initdata.grid.block3;
02664   int dim3 = initdata.grid.dim3;
02665   for ( int isend=0; isend<zBlocks; ++isend ) {
02666     int kb = send_order[isend];
02667     int nz = block3;
02668     if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
02669     int hd = ( hasData ? 1 : 0 );
02670     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
02671     msg->lattice = lattice;
02672     msg->sourceNode = thisIndex.y;
02673     msg->hasData = hasData;
02674     msg->nx = ny;
02675    if ( hasData ) {
02676     float *md = msg->qgrid;
02677     const float *d = data;
02678     for ( int i=0; i<nx; ++i ) {
02679      for ( int j=0; j<ny; ++j, d += dim3 ) {
02680       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
02681         *(md++) = d[2*k];
02682         *(md++) = d[2*k+1];
02683       }
02684      }
02685     }
02686    }
02687     msg->sequence = sequence;
02688     SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
02689     initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
02690   }
02691 }
02692 
02693 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
02694   if ( imsg == 0 ) {
02695     lattice = msg->lattice;
02696     sequence = msg->sequence;
02697   }
02698   int block2 = initdata.grid.block2;
02699   int K2 = initdata.grid.K2;
02700   int jb = msg->sourceNode;
02701   int ny = msg->nx;
02702  if ( msg->hasData ) {
02703   const float *md = msg->qgrid;
02704   float *d = data;
02705   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
02706    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
02707     for ( int k=0; k<nz; ++k ) {
02708 #ifdef ZEROCHECK
02709       if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
02710         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
02711 #endif
02712       d[2*(j*nz+k)] = *(md++);
02713       d[2*(j*nz+k)+1] = *(md++);
02714     }
02715    }
02716   }
02717  } else {
02718   float *d = data;
02719   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
02720    for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
02721     for ( int k=0; k<nz; ++k ) {
02722       d[2*(j*nz+k)] = 0;
02723       d[2*(j*nz+k)+1] = 0;
02724     }
02725    }
02726   }
02727  }
02728 }
02729 
02730 void PmeYPencil::forward_fft() {
02731 #ifdef NAMD_FFTW
02732   for ( int i=0; i<nx; ++i ) {
02733     fftw(forward_plan, nz,
02734         ((fftw_complex *) data) + i * nz * initdata.grid.K2,
02735         nz, 1, (fftw_complex *) work, 1, 0);
02736   }
02737 #endif
02738 }
02739 
02740 void PmeYPencil::send_trans() {
02741   int yBlocks = initdata.yBlocks;
02742   int block2 = initdata.grid.block2;
02743   int K2 = initdata.grid.K2;
02744   for ( int isend=0; isend<yBlocks; ++isend ) {
02745     int jb = send_order[isend];
02746     int ny = block2;
02747     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
02748     int hd = ( hasData ? 1 : 0 );
02749     PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
02750     msg->lattice = lattice;
02751     msg->sourceNode = thisIndex.x;
02752     msg->hasData = hasData;
02753     msg->nx = nx;
02754    if ( hasData ) {
02755     float *md = msg->qgrid;
02756     const float *d = data;
02757     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
02758      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
02759       for ( int k=0; k<nz; ++k ) {
02760         *(md++) = d[2*(j*nz+k)];
02761         *(md++) = d[2*(j*nz+k)+1];
02762 #ifdef ZEROCHECK
02763         if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
02764         thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
02765 #endif
02766       }
02767      }
02768     }
02769     if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
02770         thisIndex.x, jb, thisIndex.z);
02771    }
02772     msg->sequence = sequence;
02773     SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
02774     initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
02775   }
02776 }
02777 
02778 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
02779   if ( imsg == 0 ) {
02780     lattice = msg->lattice;
02781     sequence = msg->sequence;
02782   }
02783   int block1 = initdata.grid.block1;
02784   int K1 = initdata.grid.K1;
02785   int ib = msg->sourceNode;
02786   int nx = msg->nx;
02787  if ( msg->hasData ) {
02788   const float *md = msg->qgrid;
02789   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
02790    float *d = data + i*ny*nz*2;
02791    for ( int j=0; j<ny; ++j, d += nz*2 ) {
02792     for ( int k=0; k<nz; ++k ) {
02793 #ifdef ZEROCHECK
02794       if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
02795         ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
02796 #endif
02797       d[2*k] = *(md++);
02798       d[2*k+1] = *(md++);
02799     }
02800    }
02801   }
02802  } else {
02803   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
02804    float *d = data + i*ny*nz*2;
02805    for ( int j=0; j<ny; ++j, d += nz*2 ) {
02806     for ( int k=0; k<nz; ++k ) {
02807       d[2*k] = 0;
02808       d[2*k+1] = 0;
02809     }
02810    }
02811   }
02812  }
02813 }
02814 
02815 void PmeXPencil::forward_fft() {
02816 #ifdef NAMD_FFTW
02817   fftw(forward_plan, ny*nz,
02818         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
02819 #endif
02820 }
02821 
02822 void PmeXPencil::pme_kspace() {
02823 
02824   evir = 0.;
02825 
02826 #ifdef FFTCHECK
02827   return;
02828 #endif
02829 
02830   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02831 
02832   int numGrids = 1;
02833   for ( int g=0; g<numGrids; ++g ) {
02834     evir[0] = myKSpace->compute_energy(data+0*g,
02835                 lattice, ewaldcof, &(evir[1]));
02836   }
02837 
02838 }
02839 
02840 void PmeXPencil::backward_fft() {
02841 #ifdef NAMD_FFTW
02842   fftw(backward_plan, ny*nz,
02843         ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
02844 #endif
02845 }
02846 
02847 void PmeXPencil::send_untrans() {
02848   int xBlocks = initdata.xBlocks;
02849   int block1 = initdata.grid.block1;
02850   int K1 = initdata.grid.K1;
02851   int send_evir = 1;
02852   for ( int isend=0; isend<xBlocks; ++isend ) {
02853     int ib = send_order[isend];
02854     if ( ! needs_reply[ib] ) continue;
02855     int nx = block1;
02856     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
02857     PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
02858     if ( send_evir ) {
02859       msg->evir[0] = evir;
02860       msg->has_evir = 1;
02861       send_evir = 0;
02862     } else {
02863       msg->has_evir = 0;
02864     }
02865     msg->sourceNode = thisIndex.y;
02866     msg->ny = ny;
02867     float *md = msg->qgrid;
02868     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
02869      float *d = data + i*ny*nz*2;
02870      for ( int j=0; j<ny; ++j, d += nz*2 ) {
02871       for ( int k=0; k<nz; ++k ) {
02872         *(md++) = d[2*k];
02873         *(md++) = d[2*k+1];
02874       }
02875      }
02876     }
02877     SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
02878     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
02879   }
02880 }
02881 
02882 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
02883   if ( imsg == 0 ) evir = 0.;
02884   if ( msg->has_evir ) evir += msg->evir[0];
02885   int block2 = initdata.grid.block2;
02886   int K2 = initdata.grid.K2;
02887   int jb = msg->sourceNode;
02888   int ny = msg->ny;
02889   const float *md = msg->qgrid;
02890   float *d = data;
02891   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
02892 #if CMK_BLUEGENEL
02893     CmiNetworkProgress();
02894 #endif   
02895     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
02896       for ( int k=0; k<nz; ++k ) {
02897 #ifdef ZEROCHECK
02898         if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
02899                                     thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
02900 #endif
02901         d[2*(j*nz+k)] = *(md++);
02902         d[2*(j*nz+k)+1] = *(md++);
02903       }
02904     }
02905   }
02906 }
02907 
02908 void PmeYPencil::backward_fft() {
02909 #ifdef NAMD_FFTW
02910   for ( int i=0; i<nx; ++i ) {
02911 #if CMK_BLUEGENEL
02912     CmiNetworkProgress();
02913 #endif
02914 
02915     fftw(backward_plan, nz,
02916         ((fftw_complex *) data) + i * nz * initdata.grid.K2,
02917         nz, 1, (fftw_complex *) work, 1, 0);
02918   }
02919 #endif
02920 }
02921 
02922 void PmeYPencil::send_untrans() {
02923   int yBlocks = initdata.yBlocks;
02924   int block2 = initdata.grid.block2;
02925   int K2 = initdata.grid.K2;
02926   int send_evir = 1;
02927   for ( int isend=0; isend<yBlocks; ++isend ) {
02928     int jb = send_order[isend];
02929     if ( ! needs_reply[jb] ) continue;
02930     int ny = block2;
02931     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
02932     PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
02933     if ( send_evir ) {
02934       msg->evir[0] = evir;
02935       msg->has_evir = 1;
02936       send_evir = 0;
02937     } else {
02938       msg->has_evir = 0;
02939     }
02940     msg->sourceNode = thisIndex.z;
02941     msg->ny = nz;
02942     float *md = msg->qgrid;
02943     const float *d = data;
02944     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
02945      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
02946       for ( int k=0; k<nz; ++k ) {
02947         *(md++) = d[2*(j*nz+k)];
02948         *(md++) = d[2*(j*nz+k)+1];
02949       }
02950      }
02951     }
02952     SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
02953     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
02954   }
02955 }
02956 
02957 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
02958   if ( imsg == 0 ) evir = 0.;
02959   if ( msg->has_evir ) evir += msg->evir[0];
02960   int block3 = initdata.grid.block3;
02961   int dim3 = initdata.grid.dim3;
02962   int kb = msg->sourceNode;
02963   int nz = msg->ny;
02964   const float *md = msg->qgrid;
02965   float *d = data;
02966   for ( int i=0; i<nx; ++i ) {
02967 #if CMK_BLUEGENEL
02968     CmiNetworkProgress();
02969 #endif   
02970     for ( int j=0; j<ny; ++j, d += dim3 ) {
02971       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
02972 #ifdef ZEROCHECK
02973         if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
02974                                     thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
02975 #endif
02976         d[2*k] = *(md++);
02977         d[2*k+1] = *(md++);
02978       }
02979     }
02980   }
02981 }
02982 
02983 void PmeZPencil::backward_fft() {
02984 #ifdef NAMD_FFTW
02985   rfftwnd_complex_to_real(backward_plan, nx*ny,
02986             (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
02987 #endif
02988   
02989 #if CMK_BLUEGENEL
02990   CmiNetworkProgress();
02991 #endif
02992 
02993 #ifdef FFTCHECK
02994   int dim3 = initdata.grid.dim3;
02995   int K1 = initdata.grid.K1;
02996   int K2 = initdata.grid.K2;
02997   int K3 = initdata.grid.K3;
02998   float scale = 1. / (1. * K1 * K2 * K3);
02999   float maxerr = 0.;
03000   float maxstd = 0.;
03001   int mi, mj, mk;  mi = mj = mk = -1;
03002   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
03003   const float *d = data;
03004   for ( int i=0; i<nx; ++i ) {
03005    for ( int j=0; j<ny; ++j, d += dim3 ) {
03006     for ( int k=0; k<K3; ++k ) {
03007       float std = 10. * (10. * (10. * std_base + i) + j) + k;
03008       float err = scale * d[k] - std;
03009       if ( fabsf(err) > fabsf(maxerr) ) {
03010         maxerr = err;
03011         maxstd = std;
03012         mi = i;  mj = j;  mk = k;
03013       }
03014     }
03015    }
03016   }
03017   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
03018                 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
03019 #endif
03020 }
03021 
03022 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
03023   int pe = msg->sourceNode;
03024   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03025   int dim3 = initdata.grid.dim3;
03026   int zlistlen = msg->zlistlen;
03027   int *zlist = msg->zlist;
03028   char *fmsg = msg->fgrid;
03029   float *qmsg = msg->qgrid;
03030   float *d = data;
03031   int numGrids = 1;  // pencil FFT doesn't support multiple grids
03032   for ( int g=0; g<numGrids; ++g ) {
03033 #if CMK_BLUEGENEL
03034     CmiNetworkProgress();
03035 #endif    
03036     for ( int i=0; i<nx; ++i ) {
03037       for ( int j=0; j<ny; ++j, d += dim3 ) {
03038         if( *(fmsg++) ) {
03039           for ( int k=0; k<zlistlen; ++k ) {
03040             *(qmsg++) = d[zlist[k]];
03041           }
03042         }
03043       }
03044     }
03045   }
03046 
03047   SET_PRIORITY(msg,sequence,PME_UNGRID_PRIORITY)
03048   initdata.pmeProxy[pe].recvUngrid(msg);
03049 }
03050 
03051 
03052 #include "ComputePmeMgr.def.h"
03053 

Generated on Sun Nov 22 04:07:43 2009 for NAMD by  doxygen 1.3.9.1