00001
00007 #ifdef NAMD_FFTW
00008
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024
00025 #include <vector>
00026 #include <algorithm>
00027 using namespace std;
00028
00029 #include "InfoStream.h"
00030 #include "Node.h"
00031 #include "PatchMap.h"
00032 #include "PatchMap.inl"
00033 #include "AtomMap.h"
00034 #include "ComputePme.h"
00035 #include "ComputePmeMgr.decl.h"
00036 #include "PmeBase.inl"
00037 #include "PmeRealSpace.h"
00038 #include "PmeKSpace.h"
00039 #include "ComputeNonbondedUtil.h"
00040 #include "PatchMgr.h"
00041 #include "Molecule.h"
00042 #include "ReductionMgr.h"
00043 #include "ComputeMgr.h"
00044 #include "ComputeMgr.decl.h"
00045
00046 #define MIN_DEBUG_LEVEL 3
00047 #include "Debug.h"
00048 #include "SimParameters.h"
00049 #include "WorkDistrib.h"
00050 #include "varsizemsg.h"
00051 #include "Random.h"
00052 #include "ckhashtable.h"
00053 #include "Priorities.h"
00054
00055 #include "ComputeMoa.h"
00056 #include "ComputeMoaMgr.decl.h"
00057
00058
00059
00060
00061
00062
00063
00064 #ifndef SQRT_PI
00065 #define SQRT_PI 1.7724538509055160273
00066 #endif
00067
00068 #if CMK_PERSISTENT_COMM
00069 #define USE_PERSISTENT 1
00070 #endif
00071
00072 #if USE_PERSISTENT
00073 #define Z_PERSIST 1
00074 #define Y_PERSIST 1
00075 #define X_PERSIST 1
00076 #endif
00077
00078
00079
00080 char *pencilPMEProcessors;
00081
00082 class PmeAckMsg : public CMessage_PmeAckMsg {
00083 };
00084
00085 class PmeGridMsg : public CMessage_PmeGridMsg {
00086 public:
00087
00088 int sourceNode;
00089 int sequence;
00090 int hasData;
00091 Lattice lattice;
00092 PmeReduction *evir;
00093 int start;
00094 int len;
00095 int zlistlen;
00096 int *zlist;
00097 char *fgrid;
00098 float *qgrid;
00099 CkArrayIndex3D destElem;
00100 };
00101
00102 class PmeTransMsg : public CMessage_PmeTransMsg {
00103 public:
00104
00105 int sourceNode;
00106 int sequence;
00107 int hasData;
00108 Lattice lattice;
00109 int x_start;
00110 int nx;
00111 float *qgrid;
00112 CkArrayIndex3D destElem;
00113 };
00114
00115 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00116 public:
00117 PmeTransMsg *msg;
00118 int *count;
00119 CmiNodeLock lock;
00120 };
00121
00122 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00123 public:
00124
00125 int sourceNode;
00126 int has_evir;
00127 PmeReduction *evir;
00128 int y_start;
00129 int ny;
00130 float *qgrid;
00131 CkArrayIndex3D destElem;
00132 };
00133
00134 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00135 public:
00136 PmeUntransMsg *msg;
00137 int *count;
00138 CmiNodeLock lock;
00139 };
00140
00141 class PmePencilMap : public CBase_PmePencilMap {
00142 public:
00143 PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00144 : ia(i_a), ib(i_b), nb(n_b),
00145 size(n), data(newcopyint(n,d)) {
00146 }
00147 virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00148
00149 return 0;
00150 }
00151 virtual int procNum(int, const CkArrayIndex &i) {
00152
00153 return data[ i.data()[ia] * nb + i.data()[ib] ];
00154 }
00155 virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00156 int mype = CkMyPe();
00157 for ( int i=0; i < size; ++i ) {
00158 if ( data[i] == mype ) {
00159 CkArrayIndex3D ai(0,0,0);
00160 ai.data()[ia] = i / nb;
00161 ai.data()[ib] = i % nb;
00162 if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00163 if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00164 mgr->insertInitial(ai,msg);
00165 msg = 0;
00166 }
00167 }
00168 mgr->doneInserting();
00169 if ( msg ) CkFreeMsg(msg);
00170 }
00171 private:
00172 const int ia, ib, nb, size;
00173 const int* const data;
00174 static int* newcopyint(int n, int *d) {
00175 int *newd = new int[n];
00176 memcpy(newd, d, n*sizeof(int));
00177 return newd;
00178 }
00179 };
00180
00181
00182 struct PmePencilInitMsgData {
00183 PmeGrid grid;
00184 int xBlocks, yBlocks, zBlocks;
00185 CProxy_PmeXPencil xPencil;
00186 CProxy_PmeYPencil yPencil;
00187 CProxy_PmeZPencil zPencil;
00188 CProxy_ComputePmeMgr pmeProxy;
00189 CProxy_NodePmeMgr pmeNodeProxy;
00190 CProxy_PmePencilMap xm;
00191 CProxy_PmePencilMap ym;
00192 CProxy_PmePencilMap zm;
00193 };
00194
00195 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00196 public:
00197 PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00198 PmePencilInitMsgData data;
00199 };
00200
00201
00202 struct LocalPmeInfo {
00203 int nx, x_start;
00204 int ny_after_transpose, y_start_after_transpose;
00205 };
00206
00207 struct NodePmeInfo {
00208 int npe, pe_start, real_node;
00209 };
00210
00211
00212
00213 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00214 int ncpus = CkNumPes();
00215
00216 for ( int i=0; i<numGridPes; ++i ) {
00217 gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
00218 }
00219 std::sort(gridPeMap,gridPeMap+numGridPes);
00220 int firstTransPe = ncpus - numGridPes - numTransPes;
00221 if ( firstTransPe < 0 ) {
00222 firstTransPe = 0;
00223
00224 if ( ncpus > numTransPes ) firstTransPe = 1;
00225 }
00226 for ( int i=0; i<numTransPes; ++i ) {
00227 transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
00228 }
00229 std::sort(transPeMap,transPeMap+numTransPes);
00230 }
00231
00232 #if USE_TOPOMAP
00233
00234 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
00235 int nbpes=0);
00236 #endif
00237
00238
00239 int compare_bit_reversed(int a, int b) {
00240 int d = a ^ b;
00241 int c = 1;
00242 if ( d ) while ( ! (d & c) ) {
00243 c = c << 1;
00244 }
00245 return (a & c) - (b & c);
00246 }
00247
00248 inline bool less_than_bit_reversed(int a, int b) {
00249 int d = a ^ b;
00250 int c = 1;
00251 if ( d ) while ( ! (d & c) ) {
00252 c = c << 1;
00253 }
00254 return d && (b & c);
00255 }
00256
00257 struct sortop_bit_reversed {
00258 inline bool operator() (int a, int b) const {
00259 return less_than_bit_reversed(a,b);
00260 }
00261 };
00262
00263 struct ijpair {
00264 int i,j;
00265 ijpair() {;}
00266 ijpair(int I, int J) : i(I), j(J) {;}
00267 };
00268
00269 struct ijpair_sortop_bit_reversed {
00270 inline bool operator() (const ijpair &a, const ijpair &b) const {
00271 return ( less_than_bit_reversed(a.i,b.i)
00272 || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
00273 }
00274 };
00275
00276 class ComputePmeMgr : public BOCclass {
00277 public:
00278 friend class ComputePme;
00279 friend class NodePmeMgr;
00280 ComputePmeMgr();
00281 ~ComputePmeMgr();
00282
00283 void initialize(CkQdMsg*);
00284 void initialize_pencils(CkQdMsg*);
00285 void activate_pencils(CkQdMsg*);
00286 void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00287 void initialize_computes();
00288
00289 void sendData(Lattice &, int sequence);
00290 void sendPencils(Lattice &, int sequence);
00291 void recvGrid(PmeGridMsg *);
00292 void gridCalc1(void);
00293 void sendTransBarrier(void);
00294 void sendTrans(void);
00295 void fwdSharedTrans(PmeTransMsg *);
00296 void recvSharedTrans(PmeSharedTransMsg *);
00297 void recvTrans(PmeTransMsg *);
00298 void procTrans(PmeTransMsg *);
00299 void gridCalc2(void);
00300 #ifdef OPENATOM_VERSION
00301 void gridCalc2Moa(void);
00302 #endif // OPENATOM_VERSION
00303 void gridCalc2R(void);
00304 void fwdSharedUntrans(PmeUntransMsg *);
00305 void recvSharedUntrans(PmeSharedUntransMsg *);
00306 void sendUntrans(void);
00307 void recvUntrans(PmeUntransMsg *);
00308 void procUntrans(PmeUntransMsg *);
00309 void gridCalc3(void);
00310 void sendUngrid(void);
00311 void recvUngrid(PmeGridMsg *);
00312 void recvAck(PmeAckMsg *);
00313 void copyResults(PmeGridMsg *);
00314 void copyPencils(PmeGridMsg *);
00315 void ungridCalc(void);
00316 void submitReductions();
00317
00318 #if USE_PERSISTENT
00319 void setup_recvgrid_persistent();
00320 #endif
00321
00322
00323 int isPmeProcessor(int p);
00324
00325 static CmiNodeLock fftw_plan_lock;
00326
00327 private:
00328
00329 #if USE_PERSISTENT
00330 PersistentHandle *recvGrid_handle;
00331 #endif
00332
00333 CProxy_ComputePmeMgr pmeProxy;
00334 CProxy_ComputePmeMgr pmeProxyDir;
00335 CProxy_NodePmeMgr pmeNodeProxy;
00336
00337 void addCompute(ComputePme *c) {
00338 if ( ! pmeComputes.size() ) initialize_computes();
00339 pmeComputes.add(c);
00340 c->setMgr(this);
00341 }
00342
00343 ResizeArray<ComputePme*> pmeComputes;
00344 ResizeArray<ComputePme*> heldComputes;
00345 PmeGrid myGrid;
00346 Lattice lattice;
00347 PmeKSpace *myKSpace;
00348 float *qgrid;
00349 float *kgrid;
00350
00351 #ifdef NAMD_FFTW
00352 #ifdef NAMD_FFTW_3
00353 fftwf_plan *forward_plan_x, *backward_plan_x;
00354 fftwf_plan *forward_plan_yz, *backward_plan_yz;
00355 fftwf_complex *work;
00356 #else
00357 fftw_plan forward_plan_x, backward_plan_x;
00358 rfftwnd_plan forward_plan_yz, backward_plan_yz;
00359 fftw_complex *work;
00360 #endif
00361 #else
00362 float *work;
00363 #endif
00364
00365 int qsize, fsize, bsize;
00366 int alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00367 int alchDecouple;
00368 BigReal alchElecLambdaStart;
00369 BigReal elecLambdaUp;
00370 BigReal elecLambdaDown;
00371
00372 float **q_arr;
00373 float **q_list;
00374 int q_count;
00375 char *f_arr;
00376 char *fz_arr;
00377 PmeReduction evir[PME_MAX_EVALS];
00378 SubmitReduction *reduction;
00379 SubmitReduction *amd_reduction;
00380
00381 int noWorkCount;
00382 int doWorkCount;
00383 int ungridForcesCount;
00384
00385 LocalPmeInfo *localInfo;
00386 NodePmeInfo *gridNodeInfo;
00387 NodePmeInfo *transNodeInfo;
00388 int qgrid_size;
00389 int qgrid_start;
00390 int qgrid_len;
00391 int fgrid_start;
00392 int fgrid_len;
00393
00394 int numSources;
00395 int numGridPes;
00396 int numTransPes;
00397 int numGridNodes;
00398 int numTransNodes;
00399 int numDestRecipPes;
00400 int myGridPe, myGridNode;
00401 int myTransPe, myTransNode;
00402 int *gridPeMap;
00403 int *transPeMap;
00404 int *recipPeDest;
00405 int *gridPeOrder;
00406 int *gridNodeOrder;
00407 int *transNodeOrder;
00408 char *isPmeFlag;
00409 int grid_count;
00410 int trans_count;
00411 int untrans_count;
00412 int ungrid_count;
00413 PmeGridMsg **gridmsg_reuse;
00414 PmeReduction recip_evir[PME_MAX_EVALS];
00415 PmeReduction recip_evir2[PME_MAX_EVALS];
00416
00417 int compute_sequence;
00418 int grid_sequence;
00419 int useBarrier;
00420 int sendTransBarrier_received;
00421
00422 int usePencils;
00423 int xBlocks, yBlocks, zBlocks;
00424 CProxy_PmeXPencil xPencil;
00425 CProxy_PmeYPencil yPencil;
00426 CProxy_PmeZPencil zPencil;
00427 char *pencilActive;
00428 ijpair *activePencils;
00429 int numPencilsActive;
00430 int strayChargeErrors;
00431 };
00432
00433 CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00434
00435 int isPmeProcessor(int p){
00436 return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00437 }
00438
00439 int ComputePmeMgr::isPmeProcessor(int p){
00440 SimParameters *simParams = Node::Object()->simParameters;
00441 return ( usePencils || (simParams->PMEPencils>0) ? pencilPMEProcessors[p] : isPmeFlag[p] );
00442 }
00443
00444 class NodePmeMgr : public CBase_NodePmeMgr {
00445 public:
00446 friend class ComputePmeMgr;
00447 NodePmeMgr();
00448 ~NodePmeMgr();
00449 void initialize();
00450 void recvTrans(PmeTransMsg *);
00451 void recvUntrans(PmeUntransMsg *);
00452 void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00453 void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00454 void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00455 void recvXTrans(PmeTransMsg *);
00456 void recvYTrans(PmeTransMsg *);
00457 void recvYUntrans(PmeUntransMsg *);
00458 void recvZGrid(PmeGridMsg *);
00459 void recvZUntrans(PmeUntransMsg *);
00460
00461 void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00462 xm=_xm; ym=_ym; zm=_zm;
00463 }
00464 CProxy_PmePencilMap xm;
00465 CProxy_PmePencilMap ym;
00466 CProxy_PmePencilMap zm;
00467
00468 private:
00469 CProxy_ComputePmeMgr mgrProxy;
00470 ComputePmeMgr *mgrObject;
00471 ComputePmeMgr **mgrObjects;
00472 CProxy_PmeXPencil xPencil;
00473 CProxy_PmeYPencil yPencil;
00474 CProxy_PmeZPencil zPencil;
00475 CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00476 CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00477 CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
00478 };
00479
00480 NodePmeMgr::NodePmeMgr() {
00481 mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00482 }
00483
00484 NodePmeMgr::~NodePmeMgr() {
00485 delete [] mgrObjects;
00486 }
00487
00488 void NodePmeMgr::initialize() {
00489 CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00490 mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00491 if ( CkMyRank() == 0 ) {
00492 mgrProxy = proxy;
00493 mgrObject = proxy.ckLocalBranch();
00494 }
00495 }
00496
00497 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00498 mgrObject->fwdSharedTrans(msg);
00499 }
00500
00501 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00502 mgrObject->fwdSharedUntrans(msg);
00503 }
00504
00505 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00506 {
00507 CmiLock(ComputePmeMgr::fftw_plan_lock);
00508 xPencilObj.put(idx)=obj;
00509 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00510 }
00511 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00512 {
00513 CmiLock(ComputePmeMgr::fftw_plan_lock);
00514 yPencilObj.put(idx)=obj;
00515 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00516 }
00517 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00518 {
00519 CmiLock(ComputePmeMgr::fftw_plan_lock);
00520 zPencilObj.put(idx)=obj;
00521 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00522 }
00523
00524 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
00525 pmeProxyDir(thisgroup) {
00526
00527 CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00528 pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00529
00530 pmeNodeProxy.ckLocalBranch()->initialize();
00531
00532 if ( CmiMyRank() == 0 ) {
00533 fftw_plan_lock = CmiCreateLock();
00534 }
00535
00536 myKSpace = 0;
00537 kgrid = 0;
00538 work = 0;
00539 grid_count = 0;
00540 trans_count = 0;
00541 untrans_count = 0;
00542 ungrid_count = 0;
00543 gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00544 useBarrier = 0;
00545 sendTransBarrier_received = 0;
00546 usePencils = 0;
00547 }
00548
00549
00550 void ComputePmeMgr::recvArrays(
00551 CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00552 xPencil = x; yPencil = y; zPencil = z;
00553
00554 if(CmiMyRank()==0)
00555 {
00556 pmeNodeProxy.ckLocalBranch()->xPencil=x;
00557 pmeNodeProxy.ckLocalBranch()->yPencil=y;
00558 pmeNodeProxy.ckLocalBranch()->zPencil=z;
00559 }
00560 }
00561
00562 #if USE_TOPO_SFC
00563 struct Coord
00564 {
00565 int x, y, z;
00566 Coord(): x(0), y(0), z(0) {}
00567 Coord(int a, int b, int c): x(a), y(b), z(c) {}
00568 };
00569 extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00570
00571 void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00572 {
00573 SortableResizeArray<int> newprocs(procs.size());
00574 int num = 0;
00575 for (int i=0; i<result.size(); i++) {
00576 Coord &c = result[i];
00577 for (int j=0; j<procs.size(); j++) {
00578 int pe = procs[j];
00579 int x,y,z,t;
00580 tmgr.rankToCoordinates(pe, x, y, z, t);
00581 if (x==c.x && y==c.y && z==c.z)
00582 newprocs[num++] = pe;
00583 }
00584 }
00585 CmiAssert(newprocs.size() == procs.size());
00586 procs = newprocs;
00587 }
00588
00589 int find_level_grid(int x)
00590 {
00591 int a = sqrt(x);
00592 int b;
00593 for (; a>0; a--) {
00594 if (x%a == 0) break;
00595 }
00596 if (a==1) a = x;
00597 b = x/a;
00598
00599 return b;
00600 }
00601 CmiNodeLock tmgr_lock;
00602 #endif
00603
00604 void Pme_init()
00605 {
00606 #if USE_TOPO_SFC
00607 if (CkMyRank() == 0)
00608 tmgr_lock = CmiCreateLock();
00609 #endif
00610 }
00611
00612 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00613 delete msg;
00614
00615 localInfo = new LocalPmeInfo[CkNumPes()];
00616 gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00617 transNodeInfo = new NodePmeInfo[CkNumNodes()];
00618 gridPeMap = new int[CkNumPes()];
00619 transPeMap = new int[CkNumPes()];
00620 recipPeDest = new int[CkNumPes()];
00621 gridPeOrder = new int[CkNumPes()];
00622 gridNodeOrder = new int[CkNumNodes()];
00623 transNodeOrder = new int[CkNumNodes()];
00624 isPmeFlag = new char[CkNumPes()];
00625
00626 SimParameters *simParams = Node::Object()->simParameters;
00627 PatchMap *patchMap = PatchMap::Object();
00628
00629 alchFepOn = simParams->alchFepOn;
00630 alchThermIntOn = simParams->alchThermIntOn;
00631 alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
00632 alchElecLambdaStart = (alchFepOn || alchThermIntOn) ?
00633 simParams->alchElecLambdaStart : 0;
00634 if (alchFepOn || alchThermIntOn) {
00635 numGrids = 2;
00636 if (alchDecouple) numGrids += 2;
00637 if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00638 }
00639 else numGrids = 1;
00640 lesOn = simParams->lesOn;
00641 useBarrier = simParams->PMEBarrier;
00642 if ( lesOn ) {
00643 lesFactor = simParams->lesFactor;
00644 numGrids = lesFactor;
00645 }
00646 selfOn = 0;
00647 pairOn = simParams->pairInteractionOn;
00648 if ( pairOn ) {
00649 selfOn = simParams->pairInteractionSelf;
00650 if ( selfOn ) pairOn = 0;
00651 numGrids = selfOn ? 1 : 3;
00652 }
00653
00654 if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00655 else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00656 else {
00657 int nrps = simParams->PMEProcessors;
00658 if ( nrps <= 0 ) nrps = CkNumPes();
00659 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00660 int dimx = simParams->PMEGridSizeX;
00661 int dimy = simParams->PMEGridSizeY;
00662 int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00663 if ( maxslabs > nrps ) maxslabs = nrps;
00664 int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00665 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00666 if ( maxpencils > nrps ) maxpencils = nrps;
00667 if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00668 else usePencils = 0;
00669 }
00670
00671 if ( usePencils ) {
00672 int nrps = simParams->PMEProcessors;
00673 if ( nrps <= 0 ) nrps = CkNumPes();
00674 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00675 if ( simParams->PMEPencils > 1 &&
00676 simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00677 xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00678 } else {
00679 int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00680 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00681 if ( nb2 > nrps ) nb2 = nrps;
00682 if ( nb2 < 1 ) nb2 = 1;
00683 int nb = (int) sqrt((float)nb2);
00684 if ( nb < 1 ) nb = 1;
00685 xBlocks = zBlocks = nb;
00686 yBlocks = nb2 / nb;
00687 }
00688
00689 if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00690 if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00691 if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00692
00693 int dimx = simParams->PMEGridSizeX;
00694 int bx = 1 + ( dimx - 1 ) / xBlocks;
00695 xBlocks = 1 + ( dimx - 1 ) / bx;
00696
00697 int dimy = simParams->PMEGridSizeY;
00698 int by = 1 + ( dimy - 1 ) / yBlocks;
00699 yBlocks = 1 + ( dimy - 1 ) / by;
00700
00701 int dimz = simParams->PMEGridSizeZ / 2 + 1;
00702 int bz = 1 + ( dimz - 1 ) / zBlocks;
00703 zBlocks = 1 + ( dimz - 1 ) / bz;
00704
00705 if ( xBlocks * yBlocks > CkNumPes() ) {
00706 NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00707 }
00708 if ( xBlocks * zBlocks > CkNumPes() ) {
00709 NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00710 }
00711 if ( yBlocks * zBlocks > CkNumPes() ) {
00712 NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00713 }
00714
00715 if ( ! CkMyPe() ) {
00716 iout << iINFO << "PME using " << xBlocks << " x " <<
00717 yBlocks << " x " << zBlocks <<
00718 " pencil grid for FFT and reciprocal sum.\n" << endi;
00719 }
00720 } else {
00721
00722 {
00723
00724
00725 int minslices = simParams->PMEMinSlices;
00726 int dimx = simParams->PMEGridSizeX;
00727 int nrpx = ( dimx + minslices - 1 ) / minslices;
00728 int dimy = simParams->PMEGridSizeY;
00729 int nrpy = ( dimy + minslices - 1 ) / minslices;
00730
00731
00732 int nrpp = CkNumPes();
00733
00734 if ( nrpp < nrpx ) nrpx = nrpp;
00735 if ( nrpp < nrpy ) nrpy = nrpp;
00736
00737
00738 int nrps = simParams->PMEProcessors;
00739 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00740 if ( nrps > 0 ) nrpx = nrps;
00741 if ( nrps > 0 ) nrpy = nrps;
00742
00743
00744 int bx = ( dimx + nrpx - 1 ) / nrpx;
00745 nrpx = ( dimx + bx - 1 ) / bx;
00746 int by = ( dimy + nrpy - 1 ) / nrpy;
00747 nrpy = ( dimy + by - 1 ) / by;
00748 if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00749 NAMD_bug("Error in selecting number of PME processors.");
00750 if ( by != ( dimy + nrpy - 1 ) / nrpy )
00751 NAMD_bug("Error in selecting number of PME processors.");
00752
00753 numGridPes = nrpx;
00754 numTransPes = nrpy;
00755 }
00756 if ( ! CkMyPe() ) {
00757 iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00758 " processors for FFT and reciprocal sum.\n" << endi;
00759 }
00760
00761 int sum_npes = numTransPes + numGridPes;
00762 int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00763
00764 #if 0 // USE_TOPOMAP
00765
00766 PatchMap * pmap = PatchMap::Object();
00767
00768 int patch_pes = pmap->numNodesWithPatches();
00769 TopoManager tmgr;
00770 if(tmgr.hasMultipleProcsPerNode())
00771 patch_pes *= 2;
00772
00773 bool done = false;
00774 if(CkNumPes() > 2*sum_npes + patch_pes) {
00775 done = generateBGLORBPmePeList(transPeMap, numTransPes);
00776 done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
00777 }
00778 else
00779 if(CkNumPes() > 2 *max_npes + patch_pes) {
00780 done = generateBGLORBPmePeList(transPeMap, max_npes);
00781 gridPeMap = transPeMap;
00782 }
00783
00784 if (!done)
00785 #endif
00786 {
00787
00788
00789 generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00790 }
00791
00792 if ( ! CkMyPe() ) {
00793 iout << iINFO << "PME GRID LOCATIONS:";
00794 int i;
00795 for ( i=0; i<numGridPes && i<10; ++i ) {
00796 iout << " " << gridPeMap[i];
00797 }
00798 if ( i < numGridPes ) iout << " ...";
00799 iout << "\n" << endi;
00800 iout << iINFO << "PME TRANS LOCATIONS:";
00801 for ( i=0; i<numTransPes && i<10; ++i ) {
00802 iout << " " << transPeMap[i];
00803 }
00804 if ( i < numTransPes ) iout << " ...";
00805 iout << "\n" << endi;
00806 }
00807
00808
00809 std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
00810
00811 myGridPe = -1;
00812 myGridNode = -1;
00813 int i = 0;
00814 for ( i=0; i<CkNumPes(); ++i )
00815 isPmeFlag[i] = 0;
00816 int node = -1;
00817 int real_node = -1;
00818 for ( i=0; i<numGridPes; ++i ) {
00819 if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00820 isPmeFlag[gridPeMap[i]] |= 1;
00821 int real_node_i = CkNodeOf(gridPeMap[i]);
00822 if ( real_node_i == real_node ) {
00823 gridNodeInfo[node].npe += 1;
00824 } else {
00825 real_node = real_node_i;
00826 ++node;
00827 gridNodeInfo[node].real_node = real_node;
00828 gridNodeInfo[node].pe_start = i;
00829 gridNodeInfo[node].npe = 1;
00830 }
00831 if ( CkMyNode() == real_node_i ) myGridNode = node;
00832 }
00833 numGridNodes = node + 1;
00834 myTransPe = -1;
00835 myTransNode = -1;
00836 node = -1;
00837 real_node = -1;
00838 for ( i=0; i<numTransPes; ++i ) {
00839 if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00840 isPmeFlag[transPeMap[i]] |= 2;
00841 int real_node_i = CkNodeOf(transPeMap[i]);
00842 if ( real_node_i == real_node ) {
00843 transNodeInfo[node].npe += 1;
00844 } else {
00845 real_node = real_node_i;
00846 ++node;
00847 transNodeInfo[node].real_node = real_node;
00848 transNodeInfo[node].pe_start = i;
00849 transNodeInfo[node].npe = 1;
00850 }
00851 if ( CkMyNode() == real_node_i ) myTransNode = node;
00852 }
00853 numTransNodes = node + 1;
00854
00855 if ( ! CkMyPe() ) {
00856 iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
00857 << numTransNodes << " TRANS NODES\n" << endi;
00858 }
00859
00860 {
00861 int i;
00862 for ( i = 0; i < numGridPes; ++i ) {
00863 gridPeOrder[i] = i;
00864 }
00865 Random rand(CkMyPe());
00866 if ( myGridPe < 0 ) {
00867 rand.reorder(gridPeOrder,numGridPes);
00868 } else {
00869 gridPeOrder[myGridPe] = numGridPes-1;
00870 gridPeOrder[numGridPes-1] = myGridPe;
00871 rand.reorder(gridPeOrder,numGridPes-1);
00872 }
00873 for ( i = 0; i < numGridNodes; ++i ) {
00874 gridNodeOrder[i] = i;
00875 }
00876 if ( myGridNode < 0 ) {
00877 rand.reorder(gridNodeOrder,numGridNodes);
00878 } else {
00879 gridNodeOrder[myGridNode] = numGridNodes-1;
00880 gridNodeOrder[numGridNodes-1] = myGridNode;
00881 rand.reorder(gridNodeOrder,numGridNodes-1);
00882 }
00883 for ( i = 0; i < numTransNodes; ++i ) {
00884 transNodeOrder[i] = i;
00885 }
00886 if ( myTransNode < 0 ) {
00887 rand.reorder(transNodeOrder,numTransNodes);
00888 } else {
00889 transNodeOrder[myTransNode] = numTransNodes-1;
00890 transNodeOrder[numTransNodes-1] = myTransNode;
00891 rand.reorder(transNodeOrder,numTransNodes-1);
00892 }
00893 }
00894
00895 }
00896
00897 myGrid.K1 = simParams->PMEGridSizeX;
00898 myGrid.K2 = simParams->PMEGridSizeY;
00899 myGrid.K3 = simParams->PMEGridSizeZ;
00900 myGrid.order = simParams->PMEInterpOrder;
00901 myGrid.dim2 = myGrid.K2;
00902 myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00903
00904 if ( ! usePencils ) {
00905 myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00906 myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00907 myGrid.block3 = myGrid.dim3 / 2;
00908 }
00909
00910 if ( usePencils ) {
00911 myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00912 myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00913 myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;
00914
00915
00916 int pe = 0;
00917 int x,y,z;
00918
00919 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00920 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00921 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00922 {
00923 int basepe = 0; int npe = CkNumPes();
00924 if ( npe > xBlocks*yBlocks &&
00925 npe > xBlocks*zBlocks &&
00926 npe > yBlocks*zBlocks ) {
00927
00928 ++basepe;
00929 --npe;
00930 }
00931
00932
00933
00934 int i;
00935 int ncpus = CkNumPes();
00936 SortableResizeArray<int> patches, nopatches, pmeprocs;
00937 PatchMap *pmap = PatchMap::Object();
00938 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00939 int ri = WorkDistrib::peDiffuseOrdering[icpu];
00940 if ( ri ) {
00941 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00942 else nopatches.add(ri);
00943 }
00944 }
00945
00946 #if USE_RANDOM_TOPO
00947 Random rand(CkMyPe());
00948 int *tmp = new int[patches.size()];
00949 int nn = patches.size();
00950 for (i=0;i<nn;i++) tmp[i] = patches[i];
00951 rand.reorder(tmp, nn);
00952 patches.resize(0);
00953 for (i=0;i<nn;i++) patches.add(tmp[i]);
00954 delete [] tmp;
00955 tmp = new int[nopatches.size()];
00956 nn = nopatches.size();
00957 for (i=0;i<nn;i++) tmp[i] = nopatches[i];
00958 rand.reorder(tmp, nn);
00959 nopatches.resize(0);
00960 for (i=0;i<nn;i++) nopatches.add(tmp[i]);
00961 delete [] tmp;
00962 #endif
00963
00964
00965 int useZero = 0;
00966 int npens = xBlocks*yBlocks;
00967 if ( npens % ncpus == 0 ) useZero = 1;
00968 if ( npens == nopatches.size() + 1 ) useZero = 1;
00969 npens += xBlocks*zBlocks;
00970 if ( npens % ncpus == 0 ) useZero = 1;
00971 if ( npens == nopatches.size() + 1 ) useZero = 1;
00972 npens += yBlocks*zBlocks;
00973 if ( npens % ncpus == 0 ) useZero = 1;
00974 if ( npens == nopatches.size() + 1 ) useZero = 1;
00975
00976
00977 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00978 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00979 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00980 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00981
00982 int npes = pmeprocs.size();
00983 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00984 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
00985 #if !USE_RANDOM_TOPO
00986 zprocs.sort();
00987 #endif
00988 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00989 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
00990 #if !USE_RANDOM_TOPO
00991 yprocs.sort();
00992 #endif
00993 for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00994 if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
00995 #if !USE_RANDOM_TOPO
00996 xprocs.sort();
00997 #endif
00998
00999 #if USE_TOPO_SFC
01000 CmiLock(tmgr_lock);
01001
01002 TopoManager tmgr;
01003 int xdim = tmgr.getDimNX();
01004 int ydim = tmgr.getDimNY();
01005 int zdim = tmgr.getDimNZ();
01006 int xdim1 = find_level_grid(xdim);
01007 int ydim1 = find_level_grid(ydim);
01008 int zdim1 = find_level_grid(zdim);
01009 if(CkMyPe() == 0)
01010 printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01011
01012 vector<Coord> result;
01013 SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01014 sort_sfc(xprocs, tmgr, result);
01015 sort_sfc(yprocs, tmgr, result);
01016 sort_sfc(zprocs, tmgr, result);
01017
01018 CmiUnlock(tmgr_lock);
01019 #endif
01020
01021 pencilPMEProcessors = new char [CkNumPes()];
01022 memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
01023
01024
01025 if(CkMyPe() == 0){
01026 iout << iINFO << "PME Z PENCIL LOCATIONS:";
01027 for ( i=0; i<zprocs.size() && i<10; ++i ) {
01028 #if USE_TOPO_SFC
01029 int x,y,z,t;
01030 tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01031 iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01032 #else
01033 iout << " " << zprocs[i];
01034 #endif
01035 }
01036 if ( i < zprocs.size() ) iout << " ...";
01037 iout << "\n" << endi;
01038 }
01039
01040 for (pe=0, x = 0; x < xBlocks; ++x)
01041 for (y = 0; y < yBlocks; ++y, ++pe ) {
01042 pencilPMEProcessors[zprocs[pe]] = 1;
01043 }
01044
01045 if(CkMyPe() == 0){
01046 iout << iINFO << "PME Y PENCIL LOCATIONS:";
01047 for ( i=0; i<yprocs.size() && i<10; ++i ) {
01048 #if USE_TOPO_SFC
01049 int x,y,z,t;
01050 tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01051 iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01052 #else
01053 iout << " " << yprocs[i];
01054 #endif
01055 }
01056 if ( i < yprocs.size() ) iout << " ...";
01057 iout << "\n" << endi;
01058 }
01059
01060 for (pe=0, z = 0; z < zBlocks; ++z )
01061 for (x = 0; x < xBlocks; ++x, ++pe ) {
01062 pencilPMEProcessors[yprocs[pe]] = 1;
01063 }
01064
01065 if(CkMyPe() == 0){
01066 iout << iINFO << "PME X PENCIL LOCATIONS:";
01067 for ( i=0; i<xprocs.size() && i<10; ++i ) {
01068 #if USE_TOPO_SFC
01069 int x,y,z,t;
01070 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01071 iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
01072 #else
01073 iout << " " << xprocs[i];
01074 #endif
01075 }
01076 if ( i < xprocs.size() ) iout << " ...";
01077 iout << "\n" << endi;
01078 }
01079
01080 for (pe=0, y = 0; y < yBlocks; ++y )
01081 for (z = 0; z < zBlocks; ++z, ++pe ) {
01082 pencilPMEProcessors[xprocs[pe]] = 1;
01083 }
01084
01085 }
01086
01087
01088
01089 if ( CkMyPe() == 0 ){
01090 #if !USE_RANDOM_TOPO
01091
01092 WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01093 std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01094 std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01095 #endif
01096 #if 1
01097 CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01098 CProxy_PmePencilMap ym;
01099 if ( simParams->PMEPencilsYLayout )
01100 ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin());
01101 else
01102 ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
01103 CProxy_PmePencilMap xm;
01104 if ( simParams->PMEPencilsXLayout )
01105 xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin());
01106 else
01107 xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
01108 pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01109 CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
01110 CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
01111 CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
01112 #if CHARM_VERSION > 60301
01113 zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
01114 yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
01115 xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
01116 #endif
01117 zPencil = CProxy_PmeZPencil::ckNew(zo);
01118 yPencil = CProxy_PmeYPencil::ckNew(yo);
01119 xPencil = CProxy_PmeXPencil::ckNew(xo);
01120 #else
01121 zPencil = CProxy_PmeZPencil::ckNew();
01122 yPencil = CProxy_PmeYPencil::ckNew();
01123 xPencil = CProxy_PmeXPencil::ckNew();
01124
01125 for (pe=0, x = 0; x < xBlocks; ++x)
01126 for (y = 0; y < yBlocks; ++y, ++pe ) {
01127 zPencil(x,y,0).insert(zprocs[pe]);
01128 }
01129 zPencil.doneInserting();
01130
01131 for (pe=0, x = 0; x < xBlocks; ++x)
01132 for (z = 0; z < zBlocks; ++z, ++pe ) {
01133 yPencil(x,0,z).insert(yprocs[pe]);
01134 }
01135 yPencil.doneInserting();
01136
01137
01138 for (pe=0, y = 0; y < yBlocks; ++y )
01139 for (z = 0; z < zBlocks; ++z, ++pe ) {
01140 xPencil(0,y,z).insert(xprocs[pe]);
01141 }
01142 xPencil.doneInserting();
01143 #endif
01144
01145 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01146 PmePencilInitMsgData msgdata;
01147 msgdata.grid = myGrid;
01148 msgdata.xBlocks = xBlocks;
01149 msgdata.yBlocks = yBlocks;
01150 msgdata.zBlocks = zBlocks;
01151 msgdata.xPencil = xPencil;
01152 msgdata.yPencil = yPencil;
01153 msgdata.zPencil = zPencil;
01154 msgdata.pmeProxy = pmeProxyDir;
01155 msgdata.pmeNodeProxy = pmeNodeProxy;
01156 msgdata.xm = xm;
01157 msgdata.ym = ym;
01158 msgdata.zm = zm;
01159 xPencil.init(new PmePencilInitMsg(msgdata));
01160 yPencil.init(new PmePencilInitMsg(msgdata));
01161 zPencil.init(new PmePencilInitMsg(msgdata));
01162 }
01163
01164 return;
01165 }
01166
01167
01168 int pe;
01169 int nx = 0;
01170 for ( pe = 0; pe < numGridPes; ++pe ) {
01171 localInfo[pe].x_start = nx;
01172 nx += myGrid.block1;
01173 if ( nx > myGrid.K1 ) nx = myGrid.K1;
01174 localInfo[pe].nx = nx - localInfo[pe].x_start;
01175 }
01176 int ny = 0;
01177 for ( pe = 0; pe < numTransPes; ++pe ) {
01178 localInfo[pe].y_start_after_transpose = ny;
01179 ny += myGrid.block2;
01180 if ( ny > myGrid.K2 ) ny = myGrid.K2;
01181 localInfo[pe].ny_after_transpose =
01182 ny - localInfo[pe].y_start_after_transpose;
01183 }
01184
01185 {
01186
01187 PatchMap *patchMap = PatchMap::Object();
01188 Lattice lattice = simParams->lattice;
01189 BigReal sysdima = lattice.a_r().unit() * lattice.a();
01190 BigReal cutoff = simParams->cutoff;
01191 BigReal patchdim = simParams->patchDimension;
01192 int numPatches = patchMap->numPatches();
01193 int numNodes = CkNumPes();
01194 int *source_flags = new int[numNodes];
01195 int node;
01196 for ( node=0; node<numNodes; ++node ) {
01197 source_flags[node] = 0;
01198 recipPeDest[node] = 0;
01199 }
01200
01201
01202
01203
01204
01205
01206
01207 for ( int pid=0; pid < numPatches; ++pid ) {
01208 int pnode = patchMap->node(pid);
01209 int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01210 BigReal minx = patchMap->min_a(pid);
01211 BigReal maxx = patchMap->max_a(pid);
01212 BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01213
01214 int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01215 int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01216 for ( int i=min1; i<=max1; ++i ) {
01217 int ix = i;
01218 while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01219 while ( ix < 0 ) ix += myGrid.K1;
01220
01221 if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01222 ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01223 source_flags[pnode] = 1;
01224 }
01225
01226 if ( pnode == CkMyPe() ) {
01227 recipPeDest[ix / myGrid.block1] = 1;
01228 }
01229 }
01230 }
01231
01232 int numSourcesSamePhysicalNode = 0;
01233 numSources = 0;
01234 numDestRecipPes = 0;
01235 for ( node=0; node<numNodes; ++node ) {
01236 if ( source_flags[node] ) ++numSources;
01237 if ( recipPeDest[node] ) ++numDestRecipPes;
01238 if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01239 }
01240
01241 #if 0
01242 if ( numSources ) {
01243 CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01244 CkMyPe(), numSourcesSamePhysicalNode, numSources);
01245 iout << iINFO << "PME " << CkMyPe() << " sources:";
01246 for ( node=0; node<numNodes; ++node ) {
01247 if ( source_flags[node] ) iout << " " << node;
01248 }
01249 iout << "\n" << endi;
01250 }
01251 #endif
01252
01253 delete [] source_flags;
01254
01255
01256
01257
01258 }
01259
01260 ungrid_count = numDestRecipPes;
01261
01262 sendTransBarrier_received = 0;
01263
01264 if ( myGridPe < 0 && myTransPe < 0 ) return;
01265
01266
01267 if ( myTransPe >= 0 ) {
01268 int k2_start = localInfo[myTransPe].y_start_after_transpose;
01269 int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01270 #ifdef OPENATOM_VERSION
01271 if ( simParams->openatomOn ) {
01272 CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01273 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01274 } else {
01275 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01276 }
01277 #else // OPENATOM_VERSION
01278 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01279 #endif // OPENATOM_VERSION
01280 }
01281
01282 int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01283 int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01284 if ( local_size < local_size_2 ) local_size = local_size_2;
01285 qgrid = new float[local_size*numGrids];
01286 if ( numGridPes > 1 || numTransPes > 1 ) {
01287 kgrid = new float[local_size*numGrids];
01288 } else {
01289 kgrid = qgrid;
01290 }
01291 qgrid_size = local_size;
01292
01293 if ( myGridPe >= 0 ) {
01294 qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01295 qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01296 fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01297 fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01298 }
01299
01300 int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01301 #ifdef NAMD_FFTW
01302 CmiLock(fftw_plan_lock);
01303 #ifdef NAMD_FFTW_3
01304 work = new fftwf_complex[n[0]];
01305 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
01306 if ( myGridPe >= 0 ) {
01307 forward_plan_yz=new fftwf_plan[numGrids];
01308 backward_plan_yz=new fftwf_plan[numGrids];
01309 }
01310 if ( myTransPe >= 0 ) {
01311 forward_plan_x=new fftwf_plan[numGrids];
01312 backward_plan_x=new fftwf_plan[numGrids];
01313 }
01314
01315 if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
01316 if ( myGridPe >= 0 ) {
01317 for( int g=0; g<numGrids; g++)
01318 {
01319 forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
01320 localInfo[myGridPe].nx,
01321 qgrid + qgrid_size * g,
01322 NULL,
01323 1,
01324 myGrid.dim2 * myGrid.dim3,
01325 (fftwf_complex *)
01326 qgrid + qgrid_size * g,
01327 NULL,
01328 1,
01329 myGrid.dim2 * (myGrid.dim3/2),
01330 fftwFlags);
01331 }
01332 }
01333 int zdim = myGrid.dim3;
01334 int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01335 if ( ! CkMyPe() ) iout << " 2..." << endi;
01336 if ( myTransPe >= 0 ) {
01337 for( int g=0; g<numGrids; g++)
01338 {
01339
01340 forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01341 (fftwf_complex *)
01342 (kgrid+qgrid_size*g),
01343 NULL,
01344 xStride,
01345 1,
01346 (fftwf_complex *)
01347 (kgrid+qgrid_size*g),
01348 NULL,
01349 xStride,
01350 1,
01351 FFTW_FORWARD,fftwFlags);
01352
01353 }
01354 }
01355 if ( ! CkMyPe() ) iout << " 3..." << endi;
01356 if ( myTransPe >= 0 ) {
01357 for( int g=0; g<numGrids; g++)
01358 {
01359 backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01360 (fftwf_complex *)
01361 (kgrid+qgrid_size*g),
01362 NULL,
01363 xStride,
01364 1,
01365 (fftwf_complex *)
01366 (kgrid+qgrid_size*g),
01367 NULL,
01368 xStride,
01369 1,
01370 FFTW_BACKWARD, fftwFlags);
01371
01372 }
01373 }
01374 if ( ! CkMyPe() ) iout << " 4..." << endi;
01375 if ( myGridPe >= 0 ) {
01376 for( int g=0; g<numGrids; g++)
01377 {
01378 backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
01379 localInfo[myGridPe].nx,
01380 (fftwf_complex *)
01381 qgrid + qgrid_size * g,
01382 NULL,
01383 1,
01384 myGrid.dim2*(myGrid.dim3/2),
01385 qgrid + qgrid_size * g,
01386 NULL,
01387 1,
01388 myGrid.dim2 * myGrid.dim3,
01389 fftwFlags);
01390 }
01391 }
01392 if ( ! CkMyPe() ) iout << " Done.\n" << endi;
01393
01394 #else
01395 work = new fftw_complex[n[0]];
01396
01397 if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
01398 if ( myGridPe >= 0 ) {
01399 forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01400 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01401 | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01402 }
01403 if ( ! CkMyPe() ) iout << " 2..." << endi;
01404 if ( myTransPe >= 0 ) {
01405 forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01406 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01407 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01408 localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01409 }
01410 if ( ! CkMyPe() ) iout << " 3..." << endi;
01411 if ( myTransPe >= 0 ) {
01412 backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01413 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01414 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01415 localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01416 }
01417 if ( ! CkMyPe() ) iout << " 4..." << endi;
01418 if ( myGridPe >= 0 ) {
01419 backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01420 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01421 | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01422 }
01423 if ( ! CkMyPe() ) iout << " Done.\n" << endi;
01424 #endif
01425 CmiUnlock(fftw_plan_lock);
01426 #else
01427 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01428 #endif
01429
01430 if ( myGridPe >= 0 && numSources == 0 )
01431 NAMD_bug("PME grid elements exist without sources.");
01432 grid_count = numSources;
01433 memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01434 trans_count = numGridPes;
01435 }
01436
01437
01438
01439 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
01440 delete msg;
01441 if ( ! usePencils ) return;
01442
01443 SimParameters *simParams = Node::Object()->simParameters;
01444
01445 PatchMap *patchMap = PatchMap::Object();
01446 Lattice lattice = simParams->lattice;
01447 BigReal sysdima = lattice.a_r().unit() * lattice.a();
01448 BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01449 BigReal cutoff = simParams->cutoff;
01450 BigReal patchdim = simParams->patchDimension;
01451 int numPatches = patchMap->numPatches();
01452
01453 pencilActive = new char[xBlocks*yBlocks];
01454 for ( int i=0; i<xBlocks; ++i ) {
01455 for ( int j=0; j<yBlocks; ++j ) {
01456 pencilActive[i*yBlocks+j] = 0;
01457 }
01458 }
01459
01460 for ( int pid=0; pid < numPatches; ++pid ) {
01461 int pnode = patchMap->node(pid);
01462 if ( pnode != CkMyPe() ) continue;
01463
01464 int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01465 int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01466
01467 BigReal minx = patchMap->min_a(pid);
01468 BigReal maxx = patchMap->max_a(pid);
01469 BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01470
01471 int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01472 int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01473
01474 BigReal miny = patchMap->min_b(pid);
01475 BigReal maxy = patchMap->max_b(pid);
01476 BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01477
01478 int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01479 int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01480
01481 for ( int i=min1; i<=max1; ++i ) {
01482 int ix = i;
01483 while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01484 while ( ix < 0 ) ix += myGrid.K1;
01485 for ( int j=min2; j<=max2; ++j ) {
01486 int jy = j;
01487 while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01488 while ( jy < 0 ) jy += myGrid.K2;
01489 pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01490 }
01491 }
01492 }
01493
01494 numPencilsActive = 0;
01495 for ( int i=0; i<xBlocks; ++i ) {
01496 for ( int j=0; j<yBlocks; ++j ) {
01497 if ( pencilActive[i*yBlocks+j] ) {
01498 ++numPencilsActive;
01499 zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01500 }
01501 }
01502 }
01503 activePencils = new ijpair[numPencilsActive];
01504 numPencilsActive = 0;
01505 for ( int i=0; i<xBlocks; ++i ) {
01506 for ( int j=0; j<yBlocks; ++j ) {
01507 if ( pencilActive[i*yBlocks+j] ) {
01508 activePencils[numPencilsActive++] = ijpair(i,j);
01509 }
01510 }
01511 }
01512 if ( simParams->PMESendOrder ) {
01513 std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01514 } else {
01515 Random rand(CkMyPe());
01516 rand.reorder(activePencils,numPencilsActive);
01517 }
01518
01519
01520
01521
01522 ungrid_count = numPencilsActive;
01523 }
01524
01525
01526 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01527 if ( ! usePencils ) return;
01528 if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01529 }
01530
01531
01532 ComputePmeMgr::~ComputePmeMgr() {
01533
01534 if ( CmiMyRank() == 0 ) {
01535 CmiDestroyLock(fftw_plan_lock);
01536 }
01537
01538 delete myKSpace;
01539 delete [] localInfo;
01540 delete [] gridNodeInfo;
01541 delete [] transNodeInfo;
01542 delete [] gridPeMap;
01543 delete [] transPeMap;
01544 delete [] recipPeDest;
01545 delete [] gridPeOrder;
01546 delete [] gridNodeOrder;
01547 delete [] transNodeOrder;
01548 delete [] isPmeFlag;
01549 delete [] qgrid;
01550 if ( kgrid != qgrid ) delete [] kgrid;
01551 delete [] work;
01552 delete [] gridmsg_reuse;
01553
01554 for (int i=0; i<fsize*numGrids; ++i) {
01555 if ( q_arr[i] ) {
01556 delete [] q_arr[i];
01557 }
01558 }
01559 delete [] q_arr;
01560 delete [] q_list;
01561 delete [] f_arr;
01562 delete [] fz_arr;
01563 }
01564
01565 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01566
01567 if ( grid_count == 0 ) {
01568 NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01569 }
01570 if ( grid_count == numSources ) {
01571 lattice = msg->lattice;
01572 grid_sequence = msg->sequence;
01573 }
01574
01575 int zdim = myGrid.dim3;
01576 int zlistlen = msg->zlistlen;
01577 int *zlist = msg->zlist;
01578 float *qmsg = msg->qgrid;
01579 for ( int g=0; g<numGrids; ++g ) {
01580 char *f = msg->fgrid + fgrid_len * g;
01581 float *q = qgrid + qgrid_size * g;
01582 for ( int i=0; i<fgrid_len; ++i ) {
01583 if ( f[i] ) {
01584 for ( int k=0; k<zlistlen; ++k ) {
01585 q[zlist[k]] += *(qmsg++);
01586 }
01587 }
01588 q += zdim;
01589 }
01590 }
01591
01592 gridmsg_reuse[numSources-grid_count] = msg;
01593 --grid_count;
01594
01595 if ( grid_count == 0 ) {
01596 pmeProxyDir[CkMyPe()].gridCalc1();
01597 if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01598 }
01599 }
01600 #ifdef MANUAL_DEBUG_FFTW3
01601
01602
01603 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
01604 {
01605
01606 char fmt[1000];
01607 char filename[1000];
01608 strncpy(fmt,infilename,999);
01609 strncat(fmt,"_%d.out",999);
01610 sprintf(filename,fmt, pe);
01611 FILE *loutfile = fopen(filename, "w");
01612 #ifdef PAIRCALC_TEST_DUMP
01613 fprintf(loutfile,"%d\n",ydim);
01614 #endif
01615 fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01616 for(int i=0;i<xdim;i++)
01617 for(int j=0;j<ydim;j++)
01618 for(int k=0;k<zdim;k++)
01619 fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01620 fclose(loutfile);
01621
01622 }
01623
01624 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01625 {
01626 char fmt[1000];
01627 char filename[1000];
01628 strncpy(fmt,infilename,999);
01629 strncat(fmt,"_%d_%d_%d.out",999);
01630 sprintf(filename,fmt, x,y,z);
01631 FILE *loutfile = fopen(filename, "w");
01632 CkAssert(loutfile!=NULL);
01633 CkPrintf("opened %s for dump\n",filename);
01634 fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01635 for(int i=0;i<xdim;i++)
01636 for(int j=0;j<ydim;j++)
01637 for(int k=0;k<zdim;k++)
01638 fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01639 fclose(loutfile);
01640 }
01641
01642 #endif
01643
01644 void ComputePmeMgr::gridCalc1(void) {
01645
01646
01647 #ifdef NAMD_FFTW
01648 for ( int g=0; g<numGrids; ++g ) {
01649 #ifdef NAMD_FFTW_3
01650 fftwf_execute(forward_plan_yz[g]);
01651 #else
01652 rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01653 qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01654 #endif
01655
01656 }
01657 #endif
01658
01659 if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01660 }
01661
01662 void ComputePmeMgr::sendTransBarrier(void) {
01663 sendTransBarrier_received += 1;
01664
01665 if ( sendTransBarrier_received < numGridPes ) return;
01666 sendTransBarrier_received = 0;
01667 for ( int i=0; i<numGridPes; ++i ) {
01668 pmeProxyDir[gridPeMap[i]].sendTrans();
01669 }
01670 }
01671
01672 void ComputePmeMgr::sendTrans(void) {
01673
01674
01675
01676 int zdim = myGrid.dim3;
01677 int nx = localInfo[myGridPe].nx;
01678 int x_start = localInfo[myGridPe].x_start;
01679 int slicelen = myGrid.K2 * zdim;
01680
01681 ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01682
01683 #if CMK_BLUEGENEL
01684 CmiNetworkProgressAfter (0);
01685 #endif
01686
01687 for (int j=0; j<numTransNodes; j++) {
01688 int node = transNodeOrder[j];
01689 int pe = transNodeInfo[node].pe_start;
01690 int npe = transNodeInfo[node].npe;
01691 int totlen = 0;
01692 if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01693 LocalPmeInfo &li = localInfo[pe];
01694 int cpylen = li.ny_after_transpose * zdim;
01695 totlen += cpylen;
01696 }
01697 PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01698 PRIORITY_SIZE) PmeTransMsg;
01699 newmsg->sourceNode = myGridPe;
01700 newmsg->lattice = lattice;
01701 newmsg->x_start = x_start;
01702 newmsg->nx = nx;
01703 for ( int g=0; g<numGrids; ++g ) {
01704 float *qmsg = newmsg->qgrid + nx * totlen * g;
01705 pe = transNodeInfo[node].pe_start;
01706 for (int i=0; i<npe; ++i, ++pe) {
01707 LocalPmeInfo &li = localInfo[pe];
01708 int cpylen = li.ny_after_transpose * zdim;
01709 if ( node == myTransNode ) {
01710 ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01711 qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01712 }
01713 float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01714 for ( int x = 0; x < nx; ++x ) {
01715 CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01716 q += slicelen;
01717 qmsg += cpylen;
01718 }
01719 }
01720 }
01721 newmsg->sequence = grid_sequence;
01722 SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
01723 if ( node == myTransNode ) newmsg->nx = 0;
01724 if ( npe > 1 ) {
01725 if ( node == myTransNode ) fwdSharedTrans(newmsg);
01726 else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
01727 } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
01728 }
01729
01730 untrans_count = numTransPes;
01731
01732 }
01733
01734 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
01735
01736 int pe = transNodeInfo[myTransNode].pe_start;
01737 int npe = transNodeInfo[myTransNode].npe;
01738 CmiNodeLock lock = CmiCreateLock();
01739 int *count = new int; *count = npe;
01740 for (int i=0; i<npe; ++i, ++pe) {
01741 PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
01742 SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
01743 shmsg->msg = msg;
01744 shmsg->count = count;
01745 shmsg->lock = lock;
01746 pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
01747 }
01748 }
01749
01750 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
01751 procTrans(msg->msg);
01752 CmiLock(msg->lock);
01753 int count = --(*msg->count);
01754 CmiUnlock(msg->lock);
01755 if ( count == 0 ) {
01756 CmiDestroyLock(msg->lock);
01757 delete msg->count;
01758 delete msg->msg;
01759 }
01760 delete msg;
01761 }
01762
01763 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01764 procTrans(msg);
01765 delete msg;
01766 }
01767
01768 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
01769
01770 if ( trans_count == numGridPes ) {
01771 lattice = msg->lattice;
01772 grid_sequence = msg->sequence;
01773 }
01774
01775 if ( msg->nx ) {
01776 int zdim = myGrid.dim3;
01777 NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
01778 int first_pe = nodeInfo.pe_start;
01779 int last_pe = first_pe+nodeInfo.npe-1;
01780 int y_skip = localInfo[myTransPe].y_start_after_transpose
01781 - localInfo[first_pe].y_start_after_transpose;
01782 int ny_msg = localInfo[last_pe].y_start_after_transpose
01783 + localInfo[last_pe].ny_after_transpose
01784 - localInfo[first_pe].y_start_after_transpose;
01785 int ny = localInfo[myTransPe].ny_after_transpose;
01786 int x_start = msg->x_start;
01787 int nx = msg->nx;
01788 for ( int g=0; g<numGrids; ++g ) {
01789 CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01790 (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
01791 nx*ny*zdim*sizeof(float));
01792 }
01793 }
01794
01795 --trans_count;
01796
01797 if ( trans_count == 0 ) {
01798 pmeProxyDir[CkMyPe()].gridCalc2();
01799 }
01800 }
01801
01802 void ComputePmeMgr::gridCalc2(void) {
01803
01804
01805 #if CMK_BLUEGENEL
01806 CmiNetworkProgressAfter (0);
01807 #endif
01808
01809 int zdim = myGrid.dim3;
01810
01811 int ny = localInfo[myTransPe].ny_after_transpose;
01812
01813 for ( int g=0; g<numGrids; ++g ) {
01814
01815 #ifdef NAMD_FFTW
01816 #ifdef NAMD_FFTW_3
01817 fftwf_execute(forward_plan_x[g]);
01818 #else
01819 fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01820 ny * zdim / 2, 1, work, 1, 0);
01821 #endif
01822 #endif
01823 }
01824
01825 #ifdef OPENATOM_VERSION
01826 if ( ! simParams -> openatomOn ) {
01827 #endif // OPENATOM_VERSION
01828 gridCalc2R();
01829 #ifdef OPENATOM_VERSION
01830 } else {
01831 gridCalc2Moa();
01832 }
01833 #endif // OPENATOM_VERSION
01834 }
01835
01836 #ifdef OPENATOM_VERSION
01837 void ComputePmeMgr::gridCalc2Moa(void) {
01838
01839 int zdim = myGrid.dim3;
01840
01841 int ny = localInfo[myTransPe].ny_after_transpose;
01842
01843 SimParameters *simParams = Node::Object()->simParameters;
01844
01845 CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01846
01847 for ( int g=0; g<numGrids; ++g ) {
01848 #ifdef OPENATOM_VERSION_DEBUG
01849 CkPrintf("Sending recQ on processor %d \n", CkMyPe());
01850 for ( int i=0; i<=(ny * zdim / 2); ++i)
01851 {
01852 CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
01853 }
01854 #endif // OPENATOM_VERSION_DEBUG
01855
01856 CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
01857 moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
01858 }
01859 }
01860 #endif // OPENATOM_VERSION
01861
01862 void ComputePmeMgr::gridCalc2R(void) {
01863
01864 int zdim = myGrid.dim3;
01865
01866 int ny = localInfo[myTransPe].ny_after_transpose;
01867
01868 for ( int g=0; g<numGrids; ++g ) {
01869
01870 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01871 recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01872 lattice, ewaldcof, &(recip_evir2[g][1]));
01873
01874
01875
01876
01877 #ifdef NAMD_FFTW
01878 #ifdef NAMD_FFTW_3
01879 fftwf_execute(backward_plan_x[g]);
01880 #else
01881 fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01882 ny * zdim / 2, 1, work, 1, 0);
01883 #endif
01884 #endif
01885 }
01886
01887 pmeProxyDir[CkMyPe()].sendUntrans();
01888 }
01889
01890 void ComputePmeMgr::sendUntrans(void) {
01891
01892 int zdim = myGrid.dim3;
01893 int y_start = localInfo[myTransPe].y_start_after_transpose;
01894 int ny = localInfo[myTransPe].ny_after_transpose;
01895 int slicelen = myGrid.K2 * zdim;
01896
01897 ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01898
01899 #if CMK_BLUEGENEL
01900 CmiNetworkProgressAfter (0);
01901 #endif
01902
01903
01904 for (int j=0; j<numGridNodes; j++) {
01905 int node = gridNodeOrder[j];
01906 int pe = gridNodeInfo[node].pe_start;
01907 int npe = gridNodeInfo[node].npe;
01908 int totlen = 0;
01909 if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
01910 LocalPmeInfo &li = localInfo[pe];
01911 int cpylen = li.nx * zdim;
01912 totlen += cpylen;
01913 }
01914 PmeUntransMsg *newmsg = new (numGrids, ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
01915 newmsg->sourceNode = myTransPe;
01916 newmsg->y_start = y_start;
01917 newmsg->ny = ny;
01918 for ( int g=0; g<numGrids; ++g ) {
01919 if ( j == 0 ) {
01920 newmsg->evir[g] = recip_evir2[g];
01921 } else {
01922 newmsg->evir[g] = 0.;
01923 }
01924 float *qmsg = newmsg->qgrid + ny * totlen * g;
01925 pe = gridNodeInfo[node].pe_start;
01926 for (int i=0; i<npe; ++i, ++pe) {
01927 LocalPmeInfo &li = localInfo[pe];
01928 if ( node == myGridNode ) {
01929 ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
01930 qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
01931 float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
01932 int cpylen = ny * zdim;
01933 for ( int x = 0; x < li.nx; ++x ) {
01934 CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01935 q += cpylen;
01936 qmsg += slicelen;
01937 }
01938 } else {
01939 CmiMemcpy((void*)qmsg,
01940 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
01941 li.nx*ny*zdim*sizeof(float));
01942 qmsg += li.nx*ny*zdim;
01943 }
01944 }
01945 }
01946 SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
01947 if ( node == myGridNode ) newmsg->ny = 0;
01948 if ( npe > 1 ) {
01949 if ( node == myGridNode ) fwdSharedUntrans(newmsg);
01950 else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
01951 } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
01952 }
01953
01954 trans_count = numGridPes;
01955 }
01956
01957 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
01958 int pe = gridNodeInfo[myGridNode].pe_start;
01959 int npe = gridNodeInfo[myGridNode].npe;
01960 CmiNodeLock lock = CmiCreateLock();
01961 int *count = new int; *count = npe;
01962 for (int i=0; i<npe; ++i, ++pe) {
01963 PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
01964 shmsg->msg = msg;
01965 shmsg->count = count;
01966 shmsg->lock = lock;
01967 pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
01968 }
01969 }
01970
01971 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
01972 procUntrans(msg->msg);
01973 CmiLock(msg->lock);
01974 int count = --(*msg->count);
01975 CmiUnlock(msg->lock);
01976 if ( count == 0 ) {
01977 CmiDestroyLock(msg->lock);
01978 delete msg->count;
01979 delete msg->msg;
01980 }
01981 delete msg;
01982 }
01983
01984 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01985 procUntrans(msg);
01986 delete msg;
01987 }
01988
01989 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
01990
01991 if ( untrans_count == numTransPes ) {
01992 for ( int g=0; g<numGrids; ++g ) {
01993 recip_evir[g] = 0.;
01994 }
01995 }
01996
01997 #if CMK_BLUEGENEL
01998 CmiNetworkProgressAfter (0);
01999 #endif
02000
02001 NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02002 int first_pe = nodeInfo.pe_start;
02003 int g;
02004 if ( myGridPe == first_pe ) for ( g=0; g<numGrids; ++g ) {
02005 recip_evir[g] += msg->evir[g];
02006 }
02007
02008 if ( msg->ny ) {
02009 int zdim = myGrid.dim3;
02010 int last_pe = first_pe+nodeInfo.npe-1;
02011 int x_skip = localInfo[myGridPe].x_start
02012 - localInfo[first_pe].x_start;
02013 int nx_msg = localInfo[last_pe].x_start
02014 + localInfo[last_pe].nx
02015 - localInfo[first_pe].x_start;
02016 int nx = localInfo[myGridPe].nx;
02017 int y_start = msg->y_start;
02018 int ny = msg->ny;
02019 int slicelen = myGrid.K2 * zdim;
02020 int cpylen = ny * zdim;
02021 for ( g=0; g<numGrids; ++g ) {
02022 float *q = qgrid + qgrid_size * g + y_start * zdim;
02023 float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02024 for ( int x = 0; x < nx; ++x ) {
02025 CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02026 q += slicelen;
02027 qmsg += cpylen;
02028 }
02029 }
02030 }
02031
02032 --untrans_count;
02033
02034 if ( untrans_count == 0 ) {
02035 pmeProxyDir[CkMyPe()].gridCalc3();
02036 }
02037 }
02038
02039 void ComputePmeMgr::gridCalc3(void) {
02040
02041
02042
02043 #ifdef NAMD_FFTW
02044
02045 for ( int g=0; g<numGrids; ++g ) {
02046 #ifdef NAMD_FFTW_3
02047 fftwf_execute(backward_plan_yz[g]);
02048 #else
02049 rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02050 (fftw_complex *) (qgrid + qgrid_size * g),
02051 1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02052 #endif
02053 }
02054
02055 #endif
02056
02057 pmeProxyDir[CkMyPe()].sendUngrid();
02058 }
02059
02060 void ComputePmeMgr::sendUngrid(void) {
02061
02062 for ( int j=0; j<numSources; ++j ) {
02063
02064 PmeGridMsg *newmsg = gridmsg_reuse[j];
02065 int pe = newmsg->sourceNode;
02066 if ( j == 0 ) {
02067 for ( int g=0; g<numGrids; ++g ) {
02068 newmsg->evir[g] = recip_evir[g];
02069 }
02070 } else {
02071 for ( int g=0; g<numGrids; ++g ) {
02072 newmsg->evir[g] = 0.;
02073 }
02074 }
02075 int zdim = myGrid.dim3;
02076 int flen = newmsg->len;
02077 int fstart = newmsg->start;
02078 int zlistlen = newmsg->zlistlen;
02079 int *zlist = newmsg->zlist;
02080 float *qmsg = newmsg->qgrid;
02081 for ( int g=0; g<numGrids; ++g ) {
02082 char *f = newmsg->fgrid + fgrid_len * g;
02083 float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02084 for ( int i=0; i<flen; ++i ) {
02085 if ( f[i] ) {
02086 for ( int k=0; k<zlistlen; ++k ) {
02087 *(qmsg++) = q[zlist[k]];
02088 }
02089 }
02090 q += zdim;
02091 }
02092 }
02093 newmsg->sourceNode = myGridPe;
02094
02095 SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02096 CmiEnableUrgentSend(1);
02097 pmeProxyDir[pe].recvUngrid(newmsg);
02098 CmiEnableUrgentSend(0);
02099 }
02100 grid_count = numSources;
02101 memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02102 }
02103
02104 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
02105
02106 if ( ungrid_count == 0 ) {
02107 NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02108 }
02109
02110 if ( usePencils ) copyPencils(msg);
02111 else copyResults(msg);
02112 delete msg;
02113 recvAck(0);
02114 }
02115
02116 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
02117 if ( msg ) delete msg;
02118 --ungrid_count;
02119
02120 if ( ungrid_count == 0 ) {
02121 pmeProxyDir[CkMyPe()].ungridCalc();
02122 }
02123 }
02124
02125 void ComputePmeMgr::ungridCalc(void) {
02126
02127
02128
02129
02130
02131 for (int i=0; i<q_count; ++i) {
02132 for (int j=0; j<myGrid.order-1; ++j) {
02133 q_list[i][myGrid.K3+j] = q_list[i][j];
02134 }
02135 }
02136
02137 ungridForcesCount = pmeComputes.size();
02138
02139 for ( int i=0; i<pmeComputes.size(); ++i ) {
02140 WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02141
02142 }
02143
02144
02145 ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02146 }
02147
02148
02149 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
02150 {
02151 DebugM(4,"ComputePme created.\n");
02152 basePriority = PME_PRIORITY;
02153 setNumPatches(1);
02154
02155 CProxy_ComputePmeMgr::ckLocalBranch(
02156 CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
02157
02158 SimParameters *simParams = Node::Object()->simParameters;
02159
02160 alchFepOn = simParams->alchFepOn;
02161 alchThermIntOn = simParams->alchThermIntOn;
02162 alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
02163 alchElecLambdaStart = (alchFepOn || alchThermIntOn) ?
02164 simParams->alchElecLambdaStart : 0;
02165
02166 if (alchFepOn || alchThermIntOn) {
02167 numGrids = 2;
02168 if (alchDecouple) numGrids += 2;
02169 if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
02170 }
02171 else numGrids = 1;
02172 lesOn = simParams->lesOn;
02173 if ( lesOn ) {
02174 lesFactor = simParams->lesFactor;
02175 numGrids = lesFactor;
02176 }
02177 selfOn = 0;
02178 pairOn = simParams->pairInteractionOn;
02179 if ( pairOn ) {
02180 selfOn = simParams->pairInteractionSelf;
02181 if ( selfOn ) pairOn = 0;
02182 numGrids = selfOn ? 1 : 3;
02183 }
02184
02185 myGrid.K1 = simParams->PMEGridSizeX;
02186 myGrid.K2 = simParams->PMEGridSizeY;
02187 myGrid.K3 = simParams->PMEGridSizeZ;
02188 myGrid.order = simParams->PMEInterpOrder;
02189 myGrid.dim2 = myGrid.K2;
02190 myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
02191
02192 for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
02193 }
02194
02195 void ComputePme::initialize() {
02196 if (!(patch = PatchMap::Object()->patch(patchID))) {
02197 NAMD_bug("ComputePme used with unknown patch.");
02198 }
02199 positionBox = patch->registerPositionPickup(this);
02200 avgPositionBox = patch->registerAvgPositionPickup(this);
02201 forceBox = patch->registerForceDeposit(this);
02202 }
02203
02204 void ComputePmeMgr::initialize_computes() {
02205
02206 noWorkCount = 0;
02207 doWorkCount = 0;
02208 ungridForcesCount = 0;
02209
02210 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02211
02212 SimParameters *simParams = Node::Object()->simParameters;
02213
02214 strayChargeErrors = 0;
02215
02216 if (simParams->accelMDOn) {
02217 amd_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_AMD);
02218 } else {
02219 amd_reduction = NULL;
02220 }
02221
02222 qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02223 fsize = myGrid.K1 * myGrid.dim2;
02224 q_arr = new float*[fsize*numGrids];
02225 memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02226 q_list = new float*[fsize*numGrids];
02227 memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02228 q_count = 0;
02229 f_arr = new char[fsize*numGrids];
02230
02231
02232 for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02233 fz_arr = new char[myGrid.K3+myGrid.order-1];
02234
02235 for ( int g=0; g<numGrids; ++g ) {
02236 char *f = f_arr + g*fsize;
02237 if ( usePencils ) {
02238 int K1 = myGrid.K1;
02239 int K2 = myGrid.K2;
02240 int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02241 int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02242 int dim2 = myGrid.dim2;
02243 for (int ap=0; ap<numPencilsActive; ++ap) {
02244 int ib = activePencils[ap].i;
02245 int jb = activePencils[ap].j;
02246 int ibegin = ib*block1;
02247 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02248 int jbegin = jb*block2;
02249 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02250 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02251 for ( int i=ibegin; i<iend; ++i ) {
02252 for ( int j=jbegin; j<jend; ++j ) {
02253 f[i*dim2+j] = 0;
02254 }
02255 }
02256 }
02257 } else {
02258 int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02259 bsize = block1 * myGrid.dim2 * myGrid.dim3;
02260 for (int pe=0; pe<numGridPes; pe++) {
02261 if ( ! recipPeDest[pe] ) continue;
02262 int start = pe * bsize;
02263 int len = bsize;
02264 if ( start >= qsize ) { start = 0; len = 0; }
02265 if ( start + len > qsize ) { len = qsize - start; }
02266 int zdim = myGrid.dim3;
02267 int fstart = start / zdim;
02268 int flen = len / zdim;
02269 memset(f + fstart, 0, flen*sizeof(char));
02270 }
02271 }
02272 }
02273
02274 #if USE_PERSISTENT
02275 recvGrid_handle = NULL;
02276 #endif
02277 }
02278
02279 ComputePme::~ComputePme()
02280 {
02281 for ( int g=0; g<numGrids; ++g ) delete myRealSpace[g];
02282 }
02283
02284 #if USE_PERSISTENT
02285 void ComputePmeMgr::setup_recvgrid_persistent()
02286 {
02287 int K1 = myGrid.K1;
02288 int K2 = myGrid.K2;
02289 int dim2 = myGrid.dim2;
02290 int dim3 = myGrid.dim3;
02291 int block1 = myGrid.block1;
02292 int block2 = myGrid.block2;
02293
02294 CkArray *zPencil_local = zPencil.ckLocalBranch();
02295 recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
02296 for (int ap=0; ap<numPencilsActive; ++ap) {
02297 int ib = activePencils[ap].i;
02298 int jb = activePencils[ap].j;
02299 int ibegin = ib*block1;
02300 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02301 int jbegin = jb*block2;
02302 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02303 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02304
02305 int fcount = 0;
02306 for ( int g=0; g<numGrids; ++g ) {
02307 char *f = f_arr + g*fsize;
02308 for ( int i=ibegin; i<iend; ++i ) {
02309 for ( int j=jbegin; j<jend; ++j ) {
02310 fcount += f[i*dim2+j];
02311 }
02312 }
02313 }
02314 int zlistlen = 0;
02315 for ( int i=0; i<myGrid.K3; ++i ) {
02316 if ( fz_arr[i] ) ++zlistlen;
02317 }
02318 int hd = ( fcount? 1 : 0 );
02319 int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
02320 int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
02321 int compress_size = sizeof(float)*hd*fcount*zlistlen;
02322 int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
02323 recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
02324 }
02325 }
02326 #endif
02327
02328 int ComputePme::noWork() {
02329
02330 if ( patch->flags.doFullElectrostatics ) {
02331 if ( ! myMgr->ungridForcesCount ) return 0;
02332 myMgr->heldComputes.add(this);
02333 return 1;
02334 }
02335
02336 positionBox->skip();
02337 forceBox->skip();
02338
02339 if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
02340 myMgr->noWorkCount = 0;
02341 myMgr->reduction->submit();
02342 if (myMgr->amd_reduction) myMgr->amd_reduction->submit();
02343 }
02344
02345 return 1;
02346 }
02347
02348 void ComputePme::doWork()
02349 {
02350 DebugM(4,"Entering ComputePme::doWork().\n");
02351
02352 if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
02353 basePriority = PME_PRIORITY;
02354 ungridForces();
02355 if ( ! --(myMgr->ungridForcesCount) ) myMgr->submitReductions();
02356 return;
02357 }
02358 basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
02359
02360 #ifdef TRACE_COMPUTE_OBJECTS
02361 double traceObjStartTime = CmiWallTimer();
02362 #endif
02363
02364
02365 numLocalAtoms = patch->getNumAtoms();
02366
02367 Lattice &lattice = patch->flags.lattice;
02368
02369 localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
02370 localData = localData_alloc.begin();
02371 localPartition_alloc.resize(numLocalAtoms);
02372 localPartition = localPartition_alloc.begin();
02373
02374 int g;
02375 for ( g=0; g<numGrids; ++g ) {
02376 localGridData[g] = localData + numLocalAtoms*(g+1);
02377 }
02378
02379
02380 PmeParticle * data_ptr = localData;
02381 unsigned char * part_ptr = localPartition;
02382 const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
02383 * ComputeNonbondedUtil::dielectric_1 );
02384
02385 {
02386 CompAtom *x = positionBox->open();
02387
02388 if ( patch->flags.doMolly ) {
02389 positionBox->close(&x);
02390 x = avgPositionBox->open();
02391 }
02392 int numAtoms = patch->getNumAtoms();
02393
02394 for(int i=0; i<numAtoms; ++i)
02395 {
02396 data_ptr->x = x[i].position.x;
02397 data_ptr->y = x[i].position.y;
02398 data_ptr->z = x[i].position.z;
02399 data_ptr->cg = coulomb_sqrt * x[i].charge;
02400 ++data_ptr;
02401 *part_ptr = x[i].partition;
02402 ++part_ptr;
02403 }
02404
02405 if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
02406 else { positionBox->close(&x); }
02407 }
02408
02409
02410 if ( ((alchFepOn || alchThermIntOn) && (!alchDecouple)) || lesOn ) {
02411 for ( g=0; g<numGrids; ++g ) {
02412 PmeParticle *lgd = localGridData[g];
02413 int nga = 0;
02414 for(int i=0; i<numLocalAtoms; ++i) {
02415 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02416
02417
02418
02419 lgd[nga++] = localData[i];
02420 }
02421 }
02422 numGridAtoms[g] = nga;
02423 }
02424 } else if ( (alchFepOn || alchThermIntOn) && alchDecouple) {
02425
02426
02427
02428
02429
02430
02431 for ( g=0; g<2; ++g ) {
02432 PmeParticle *lgd = localGridData[g];
02433 int nga = 0;
02434 for(int i=0; i<numLocalAtoms; ++i) {
02435 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02436 lgd[nga++] = localData[i];
02437 }
02438 }
02439 numGridAtoms[g] = nga;
02440 }
02441 for (g=2 ; g<4 ; ++g ) {
02442 PmeParticle *lgd = localGridData[g];
02443 int nga = 0;
02444 for(int i=0; i<numLocalAtoms; ++i) {
02445 if ( localPartition[i] == (g-1) ) {
02446 lgd[nga++] = localData[i];
02447 }
02448 }
02449 numGridAtoms[g] = nga;
02450 }
02451 for (g=4 ; g<numGrids ; ++g ) {
02452
02453 PmeParticle *lgd = localGridData[g];
02454 int nga = 0;
02455 for(int i=0; i<numLocalAtoms; ++i) {
02456 if ( localPartition[i] == 0 ) {
02457 lgd[nga++] = localData[i];
02458 }
02459 }
02460 numGridAtoms[g] = nga;
02461 }
02462 } else if ( selfOn ) {
02463 if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
02464 g = 0;
02465 PmeParticle *lgd = localGridData[g];
02466 int nga = 0;
02467 for(int i=0; i<numLocalAtoms; ++i) {
02468 if ( localPartition[i] == 1 ) {
02469 lgd[nga++] = localData[i];
02470 }
02471 }
02472 numGridAtoms[g] = nga;
02473 } else if ( pairOn ) {
02474 if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
02475 g = 0;
02476 PmeParticle *lgd = localGridData[g];
02477 int nga = 0;
02478 for(int i=0; i<numLocalAtoms; ++i) {
02479 if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02480 lgd[nga++] = localData[i];
02481 }
02482 }
02483 numGridAtoms[g] = nga;
02484 for ( g=1; g<3; ++g ) {
02485 PmeParticle *lgd = localGridData[g];
02486 int nga = 0;
02487 for(int i=0; i<numLocalAtoms; ++i) {
02488 if ( localPartition[i] == g ) {
02489 lgd[nga++] = localData[i];
02490 }
02491 }
02492 numGridAtoms[g] = nga;
02493 }
02494 } else {
02495 if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
02496 localGridData[0] = localData;
02497 numGridAtoms[0] = numLocalAtoms;
02498 }
02499
02500 if ( ! myMgr->doWorkCount ) {
02501 myMgr->doWorkCount = myMgr->pmeComputes.size();
02502
02503 memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
02504
02505 for (int i=0; i<myMgr->q_count; ++i) {
02506 memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
02507 }
02508
02509 for ( g=0; g<numGrids; ++g ) {
02510 myMgr->evir[g] = 0;
02511 }
02512
02513 myMgr->strayChargeErrors = 0;
02514
02515 myMgr->compute_sequence = sequence();
02516 }
02517
02518 if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
02519
02520 int strayChargeErrors = 0;
02521
02522
02523 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02524 for ( g=0; g<numGrids; ++g ) {
02525 BigReal selfEnergy = 0;
02526 data_ptr = localGridData[g];
02527 int i;
02528 for(i=0; i<numGridAtoms[g]; ++i)
02529 {
02530 selfEnergy += data_ptr->cg * data_ptr->cg;
02531 ++data_ptr;
02532 }
02533 selfEnergy *= -1. * ewaldcof / SQRT_PI;
02534 myMgr->evir[g][0] += selfEnergy;
02535
02536 float **q = myMgr->q_arr + g*myMgr->fsize;
02537 char *f = myMgr->f_arr + g*myMgr->fsize;
02538
02539 myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
02540 scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
02541 myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
02542 }
02543 myMgr->strayChargeErrors += strayChargeErrors;
02544
02545 #ifdef TRACE_COMPUTE_OBJECTS
02546 traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
02547 #endif
02548
02549 if ( --(myMgr->doWorkCount) == 0 ) {
02550 for (int j=0; j<myGrid.order-1; ++j) {
02551 myMgr->fz_arr[j] |= myMgr->fz_arr[myGrid.K3+j];
02552 }
02553 for (int i=0; i<myMgr->q_count; ++i) {
02554 for (int j=0; j<myGrid.order-1; ++j) {
02555 myMgr->q_list[i][j] += myMgr->q_list[i][myGrid.K3+j];
02556 }
02557 }
02558
02559 if ( myMgr->usePencils ) {
02560 myMgr->sendPencils(lattice,sequence());
02561 } else {
02562 myMgr->sendData(lattice,sequence());
02563 }
02564 }
02565
02566 }
02567
02568
02569 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
02570
02571
02572
02573 #if USE_PERSISTENT
02574 if (recvGrid_handle== NULL) setup_recvgrid_persistent();
02575 #endif
02576 int K1 = myGrid.K1;
02577 int K2 = myGrid.K2;
02578 int dim2 = myGrid.dim2;
02579 int dim3 = myGrid.dim3;
02580 int block1 = myGrid.block1;
02581 int block2 = myGrid.block2;
02582
02583
02584 NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
02585
02586 for (int ap=0; ap<numPencilsActive; ++ap) {
02587 int ib = activePencils[ap].i;
02588 int jb = activePencils[ap].j;
02589 int ibegin = ib*block1;
02590 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02591 int jbegin = jb*block2;
02592 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02593 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02594
02595 int fcount = 0;
02596 for ( int g=0; g<numGrids; ++g ) {
02597 char *f = f_arr + g*fsize;
02598 for ( int i=ibegin; i<iend; ++i ) {
02599 for ( int j=jbegin; j<jend; ++j ) {
02600 fcount += f[i*dim2+j];
02601 }
02602 }
02603 }
02604
02605 #ifdef NETWORK_PROGRESS
02606 CmiNetworkProgress();
02607 #endif
02608
02609 if ( ! pencilActive[ib*yBlocks+jb] )
02610 NAMD_bug("PME activePencils list inconsistent");
02611
02612 int zlistlen = 0;
02613 for ( int i=0; i<myGrid.K3; ++i ) {
02614 if ( fz_arr[i] ) ++zlistlen;
02615 }
02616
02617 int hd = ( fcount? 1 : 0 );
02618
02619
02620
02621 PmeGridMsg *msg = new ( hd*numGrids, hd*zlistlen, hd*flen,
02622 hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
02623 msg->sourceNode = CkMyPe();
02624 msg->hasData = hd;
02625 msg->lattice = lattice;
02626 if ( hd ) {
02627 #if 0
02628 msg->start = fstart;
02629 msg->len = flen;
02630 #else
02631 msg->start = -1;
02632 msg->len = -1;
02633 #endif
02634 msg->zlistlen = zlistlen;
02635 int *zlist = msg->zlist;
02636 zlistlen = 0;
02637 for ( int i=0; i<myGrid.K3; ++i ) {
02638 if ( fz_arr[i] ) zlist[zlistlen++] = i;
02639 }
02640 char *fmsg = msg->fgrid;
02641 float *qmsg = msg->qgrid;
02642 for ( int g=0; g<numGrids; ++g ) {
02643 char *f = f_arr + g*fsize;
02644 float **q = q_arr + g*fsize;
02645 for ( int i=ibegin; i<iend; ++i ) {
02646 for ( int j=jbegin; j<jend; ++j ) {
02647 *(fmsg++) = f[i*dim2+j];
02648 if( f[i*dim2+j] ) {
02649 for ( int k=0; k<zlistlen; ++k ) {
02650 *(qmsg++) = q[i*dim2+j][zlist[k]];
02651 }
02652 }
02653 }
02654 }
02655 }
02656 }
02657
02658 msg->sequence = compute_sequence;
02659 SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
02660 CmiEnableUrgentSend(1);
02661 #if USE_NODE_PAR_RECEIVE
02662 msg->destElem=CkArrayIndex3D(ib,jb,0);
02663 CProxy_PmePencilMap lzm = npMgr->zm;
02664 int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
02665 int destnode = CmiNodeOf(destproc);
02666
02667 #if 0
02668 CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
02669 #endif
02670 pmeNodeProxy[destnode].recvZGrid(msg);
02671 #if 0
02672 CmiUsePersistentHandle(NULL, 0);
02673 #endif
02674 #else
02675 #if 0
02676 CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
02677 #endif
02678 zPencil(ib,jb,0).recvGrid(msg);
02679 #if 0
02680 CmiUsePersistentHandle(NULL, 0);
02681 #endif
02682 #endif
02683 CmiEnableUrgentSend(0);
02684 }
02685
02686
02687 if ( strayChargeErrors ) {
02688 strayChargeErrors = 0;
02689 iout << iERROR << "Stray PME grid charges detected: "
02690 << CkMyPe() << " sending to (x,y)";
02691 for (int ib=0; ib<xBlocks; ++ib) {
02692 for (int jb=0; jb<yBlocks; ++jb) {
02693 int ibegin = ib*block1;
02694 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02695 int jbegin = jb*block2;
02696 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02697 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02698
02699 for ( int g=0; g<numGrids; ++g ) {
02700 char *f = f_arr + g*fsize;
02701 if ( ! pencilActive[ib*yBlocks+jb] ) {
02702 for ( int i=ibegin; i<iend; ++i ) {
02703 for ( int j=jbegin; j<jend; ++j ) {
02704 if ( f[i*dim2+j] == 3 ) {
02705 f[i*dim2+j] = 2;
02706 iout << " (" << i << "," << j << ")";
02707 }
02708 }
02709 }
02710 }
02711 }
02712 }
02713 }
02714 iout << "\n" << endi;
02715 }
02716
02717
02718
02719
02720
02721 }
02722
02723
02724 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
02725
02726 int K1 = myGrid.K1;
02727 int K2 = myGrid.K2;
02728 int dim2 = myGrid.dim2;
02729 int dim3 = myGrid.dim3;
02730 int block1 = myGrid.block1;
02731 int block2 = myGrid.block2;
02732
02733
02734 int ib = msg->sourceNode / yBlocks;
02735 int jb = msg->sourceNode % yBlocks;
02736
02737 int ibegin = ib*block1;
02738 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02739 int jbegin = jb*block2;
02740 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02741
02742 int zlistlen = msg->zlistlen;
02743 int *zlist = msg->zlist;
02744 float *qmsg = msg->qgrid;
02745 int g;
02746 for ( g=0; g<numGrids; ++g ) {
02747 evir[g] += msg->evir[g];
02748 char *f = f_arr + g*fsize;
02749 float **q = q_arr + g*fsize;
02750 for ( int i=ibegin; i<iend; ++i ) {
02751 for ( int j=jbegin; j<jend; ++j ) {
02752 if( f[i*dim2+j] ) {
02753 f[i*dim2+j] = 0;
02754 for ( int k=0; k<zlistlen; ++k ) {
02755 q[i*dim2+j][zlist[k]] = *(qmsg++);
02756 }
02757 }
02758 }
02759 }
02760 }
02761 }
02762
02763
02764 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
02765
02766
02767
02768 bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
02769
02770 CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02771 for (int j=0; j<numGridPes; j++) {
02772 int pe = gridPeOrder[j];
02773 if ( ! recipPeDest[pe] && ! strayChargeErrors ) continue;
02774 int start = pe * bsize;
02775 int len = bsize;
02776 if ( start >= qsize ) { start = 0; len = 0; }
02777 if ( start + len > qsize ) { len = qsize - start; }
02778 int zdim = myGrid.dim3;
02779 int fstart = start / zdim;
02780 int flen = len / zdim;
02781 int fcount = 0;
02782 int i;
02783
02784 int g;
02785 for ( g=0; g<numGrids; ++g ) {
02786 char *f = f_arr + fstart + g*fsize;
02787 for ( i=0; i<flen; ++i ) {
02788 fcount += f[i];
02789 }
02790 if ( ! recipPeDest[pe] ) {
02791 int errfound = 0;
02792 for ( i=0; i<flen; ++i ) {
02793 if ( f[i] == 3 ) {
02794 errfound = 1;
02795 break;
02796 }
02797 }
02798 if ( errfound ) {
02799 iout << iERROR << "Stray PME grid charges detected: "
02800 << CkMyPe() << " sending to " << gridPeMap[pe] << " for planes";
02801 int iz = -1;
02802 for ( i=0; i<flen; ++i ) {
02803 if ( f[i] == 3 ) {
02804 f[i] = 2;
02805 int jz = (i+fstart)/myGrid.K2;
02806 if ( iz != jz ) { iout << " " << jz; iz = jz; }
02807 }
02808 }
02809 iout << "\n" << endi;
02810 }
02811 }
02812 }
02813
02814 #ifdef NETWORK_PROGRESS
02815 CmiNetworkProgress();
02816 #endif
02817
02818 if ( ! recipPeDest[pe] ) continue;
02819
02820 int zlistlen = 0;
02821 for ( i=0; i<myGrid.K3; ++i ) {
02822 if ( fz_arr[i] ) ++zlistlen;
02823 }
02824
02825 PmeGridMsg *msg = new (numGrids, zlistlen, flen*numGrids,
02826 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
02827
02828 msg->sourceNode = CkMyPe();
02829 msg->lattice = lattice;
02830 msg->start = fstart;
02831 msg->len = flen;
02832 msg->zlistlen = zlistlen;
02833 int *zlist = msg->zlist;
02834 zlistlen = 0;
02835 for ( i=0; i<myGrid.K3; ++i ) {
02836 if ( fz_arr[i] ) zlist[zlistlen++] = i;
02837 }
02838 float *qmsg = msg->qgrid;
02839 for ( g=0; g<numGrids; ++g ) {
02840 char *f = f_arr + fstart + g*fsize;
02841 CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
02842 float **q = q_arr + fstart + g*fsize;
02843 for ( i=0; i<flen; ++i ) {
02844 if ( f[i] ) {
02845 for ( int k=0; k<zlistlen; ++k ) {
02846 *(qmsg++) = q[i][zlist[k]];
02847 }
02848 }
02849 }
02850 }
02851
02852 msg->sequence = compute_sequence;
02853 SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
02854 pmeProxy[gridPeMap[pe]].recvGrid(msg);
02855 }
02856
02857 strayChargeErrors = 0;
02858
02859 }
02860
02861 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
02862
02863 int zdim = myGrid.dim3;
02864 int flen = msg->len;
02865 int fstart = msg->start;
02866 int zlistlen = msg->zlistlen;
02867 int *zlist = msg->zlist;
02868 float *qmsg = msg->qgrid;
02869 int g;
02870 for ( g=0; g<numGrids; ++g ) {
02871 evir[g] += msg->evir[g];
02872 char *f = msg->fgrid + g*flen;
02873 float **q = q_arr + fstart + g*fsize;
02874 for ( int i=0; i<flen; ++i ) {
02875 if ( f[i] ) {
02876 f[i] = 0;
02877 for ( int k=0; k<zlistlen; ++k ) {
02878 q[i][zlist[k]] = *(qmsg++);
02879 }
02880 }
02881 }
02882 }
02883 }
02884
02885 void ComputePme::ungridForces() {
02886
02887 if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
02888
02889 SimParameters *simParams = Node::Object()->simParameters;
02890
02891 localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
02892 Vector *localResults = localResults_alloc.begin();
02893 Vector *gridResults;
02894 if ( alchFepOn || alchThermIntOn || lesOn || selfOn || pairOn ) {
02895 for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
02896 gridResults = localResults + numLocalAtoms;
02897 } else {
02898 gridResults = localResults;
02899 }
02900
02901 Vector pairForce = 0.;
02902 Lattice &lattice = patch->flags.lattice;
02903 int g = 0;
02904 if(!simParams->commOnly) {
02905 for ( g=0; g<numGrids; ++g ) {
02906 #ifdef NETWORK_PROGRESS
02907 CmiNetworkProgress();
02908 #endif
02909
02910 myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
02911 scale_forces(gridResults, numGridAtoms[g], lattice);
02912
02913 if ( alchFepOn || alchThermIntOn ) {
02914 float scale = 1.;
02915 if(simParams->alchFepWhamOn) {
02916 if(simParams->alchFepElecOn) {
02917 elecLambdaUp = simParams->alchElecLambda;
02918 elecLambdaDown = 1.0 - simParams->alchElecLambda;
02919 }
02920 else {
02921 elecLambdaUp = 0.0;
02922 elecLambdaDown = 1.0;
02923 }
02924 }
02925 else {
02926 elecLambdaUp = (simParams->alchLambda <= alchElecLambdaStart)? 0. : \
02927 (simParams->alchLambda - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02928 elecLambdaDown = ((1-simParams->alchLambda) <= alchElecLambdaStart)? 0. : ((1-simParams->alchLambda) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02929 }
02930
02931 if ( g == 0 ) scale = elecLambdaUp;
02932 else if ( g == 1 ) scale = elecLambdaDown;
02933 else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02934 if (alchDecouple) {
02935 if ( g == 2 ) scale = 1 - elecLambdaUp;
02936 else if ( g == 3 ) scale = 1 - elecLambdaDown;
02937 else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02938 }
02939 int nga = 0;
02940 if (!alchDecouple) {
02941 for(int i=0; i<numLocalAtoms; ++i) {
02942 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02943
02944 localResults[i] += gridResults[nga++] * scale;
02945 }
02946 }
02947 }
02948 else {
02949 if ( g < 2 ) {
02950 for(int i=0; i<numLocalAtoms; ++i) {
02951 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02952
02953
02954 localResults[i] += gridResults[nga++] * scale;
02955 }
02956 }
02957 }
02958 else {
02959 for(int i=0; i<numLocalAtoms; ++i) {
02960 if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
02961
02962
02963
02964 localResults[i] += gridResults[nga++] * scale;
02965 }
02966 }
02967 }
02968 }
02969 } else if ( lesOn ) {
02970 float scale = 1.;
02971 if ( alchFepOn ) {
02972 if(simParams->alchFepWhamOn) {
02973 if(simParams->alchFepElecOn) {
02974 if ( g == 0 ) scale = simParams->alchElecLambda;
02975 else if ( g == 1 ) scale = 1. - simParams->alchElecLambda;
02976 }
02977 else {
02978 if ( g == 0 ) scale = 0.0;
02979 else if ( g == 1 ) scale = 1.0;
02980 }
02981 }
02982 else {
02983 if ( g == 0 ) scale = simParams->alchLambda;
02984 else if ( g == 1 ) scale = 1. - simParams->alchLambda;
02985 }
02986 } else if ( lesOn ) {
02987 scale = 1.0 / (float)lesFactor;
02988 }
02989 int nga = 0;
02990 for(int i=0; i<numLocalAtoms; ++i) {
02991 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02992 localResults[i] += gridResults[nga++] * scale;
02993 }
02994 }
02995 } else if ( selfOn ) {
02996 PmeParticle *lgd = localGridData[g];
02997 int nga = 0;
02998 for(int i=0; i<numLocalAtoms; ++i) {
02999 if ( localPartition[i] == 1 ) {
03000 pairForce += gridResults[nga];
03001 localResults[i] += gridResults[nga++];
03002 }
03003 }
03004 } else if ( pairOn ) {
03005 if ( g == 0 ) {
03006 int nga = 0;
03007 for(int i=0; i<numLocalAtoms; ++i) {
03008 if ( localPartition[i] == 1 ) {
03009 pairForce += gridResults[nga];
03010 }
03011 if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
03012 localResults[i] += gridResults[nga++];
03013 }
03014 }
03015 } else if ( g == 1 ) {
03016 int nga = 0;
03017 for(int i=0; i<numLocalAtoms; ++i) {
03018 if ( localPartition[i] == g ) {
03019 pairForce -= gridResults[nga];
03020 localResults[i] -= gridResults[nga++];
03021 }
03022 }
03023 } else {
03024 int nga = 0;
03025 for(int i=0; i<numLocalAtoms; ++i) {
03026 if ( localPartition[i] == g ) {
03027 localResults[i] -= gridResults[nga++];
03028 }
03029 }
03030 }
03031 }
03032 }
03033 }
03034
03035 Vector *results_ptr = localResults;
03036
03037
03038 {
03039 Results *r = forceBox->open();
03040 Force *f = r->f[Results::slow];
03041 int numAtoms = patch->getNumAtoms();
03042
03043 if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
03044 for(int i=0; i<numAtoms; ++i) {
03045 f[i].x += results_ptr->x;
03046 f[i].y += results_ptr->y;
03047 f[i].z += results_ptr->z;
03048 ++results_ptr;
03049 }
03050 }
03051 forceBox->close(&r);
03052 }
03053
03054 if ( pairOn || selfOn ) {
03055 ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
03056 }
03057
03058 }
03059
03060 void ComputePmeMgr::submitReductions() {
03061
03062 for ( int g=0; g<numGrids; ++g ) {
03063 float scale = 1.;
03064 if ( alchFepOn || alchThermIntOn ) {
03065 if ( g == 0 ) scale = elecLambdaUp;
03066 else if ( g == 1 ) scale = elecLambdaDown;
03067 else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03068 if (alchDecouple) {
03069 if ( g == 2 ) scale = 1-elecLambdaUp;
03070 else if ( g == 3 ) scale = 1-elecLambdaDown;
03071 else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03072 }
03073 } else if ( lesOn ) {
03074 scale = 1.0 / lesFactor;
03075 } else if ( pairOn ) {
03076 scale = ( g == 0 ? 1. : -1. );
03077 }
03078 reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03079 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03080 reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03081 reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03082 reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03083 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03084 reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03085 reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03086 reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03087 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03088
03089 if (amd_reduction) {
03090 amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03091 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03092 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03093 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03094 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03095 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03096 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03097 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03098 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03099 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03100 }
03101
03102 float scale2 = 0.;
03103
03104
03105 SimParameters *simParams = Node::Object()->simParameters;
03106
03107 if (alchFepOn) {
03108 BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
03109 if(simParams->alchFepWhamOn) {
03110 if(simParams->alchFepElecOn) {
03111 elecLambda2Up = simParams->alchElecLambda;
03112 elecLambda2Down = 1.0 - simParams->alchElecLambda;
03113 }
03114 else {
03115 elecLambda2Up = 0.0;
03116 elecLambda2Down = 1.0;
03117 }
03118 }
03119 else {
03120 elecLambda2Up = (simParams->alchLambda2 <= alchElecLambdaStart)? 0. : \
03121 (simParams->alchLambda2 - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03122 elecLambda2Down = ((1-simParams->alchLambda2) <= alchElecLambdaStart)? 0. : \
03123 ((1-simParams->alchLambda2) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03124 }
03125
03126 if ( g == 0 ) scale2 = elecLambda2Up;
03127 else if ( g == 1 ) scale2 = elecLambda2Down;
03128 else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03129 if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
03130 else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
03131 else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03132 }
03133 if(simParams->alchFepWhamOn && simParams->alchFepElecOn) {
03134 if( g==0 ) scale2 = scale + 1.0;
03135 else if( g==1 ) scale2 = scale - 1.0;
03136 else if( g==2 ) scale2 = scale - 1.0;
03137 else if( g==3 ) scale2 = scale + 1.0;
03138 }
03139 reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03140 if (amd_reduction) amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03141
03142 if (alchThermIntOn) {
03143
03144
03145
03146
03147
03148
03149
03150
03151
03152
03153
03154
03155
03156
03157
03158
03159
03160
03161
03162
03163
03164
03165
03166
03167
03168
03169
03170
03171 if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
03172 if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
03173 if (!alchDecouple) {
03174 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03175 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03176 }
03177 else {
03178 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03179 if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03180 if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03181 if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03182 }
03183 }
03184 }
03185 reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
03186 reduction->submit();
03187 if (amd_reduction) amd_reduction->submit();
03188
03189
03190 for ( int i=0; i<heldComputes.size(); ++i ) {
03191 WorkDistrib::messageEnqueueWork(heldComputes[i]);
03192 }
03193 heldComputes.resize(0);
03194 }
03195
03196 #if USE_TOPOMAP
03197
03198 #define NPRIMES 8
03199 const static unsigned int NAMDPrimes[] = {
03200 3,
03201 5,
03202 7,
03203 11,
03204 13,
03205 17,
03206 19,
03207 23,
03208 29,
03209 31,
03210 37,
03211 59,
03212 73,
03213 93,
03214 113,
03215 157,
03216 307,
03217 617,
03218 1217
03219 };
03220
03221 #include "RecBisection.h"
03222
03223
03224
03225
03226
03227
03228
03229 bool generateBGLORBPmePeList(int *pemap, int numPes,
03230 int *block_pes, int nbpes) {
03231
03232 PatchMap *pmap = PatchMap::Object();
03233 int *pmemap = new int [CkNumPes()];
03234
03235 if (pemap == NULL)
03236 return false;
03237
03238 TopoManager tmgr;
03239
03240 memset(pmemap, 0, sizeof(int) * CkNumPes());
03241
03242 for(int count = 0; count < CkNumPes(); count++) {
03243 if(count < nbpes)
03244 pmemap[block_pes[count]] = 1;
03245
03246 if(pmap->numPatchesOnNode(count)) {
03247 pmemap[count] = 1;
03248
03249
03250 if(tmgr.hasMultipleProcsPerNode()) {
03251 pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
03252 }
03253 }
03254 }
03255
03256 if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
03257
03258 return false;
03259
03260 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03261 Node *node = nd.ckLocalBranch();
03262 SimParameters *simParams = node->simParameters;
03263
03264
03265
03266 int xsize = 0, ysize = 0, zsize = 0;
03267
03268 xsize = tmgr.getDimNX();
03269 ysize = tmgr.getDimNY();
03270 zsize = tmgr.getDimNZ();
03271
03272 int nx = xsize, ny = ysize, nz = zsize;
03273 DimensionMap dm;
03274
03275 dm.x = 0;
03276 dm.y = 1;
03277 dm.z = 2;
03278
03279 findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
03280
03281
03282 int group_size = numPes/nx;
03283 if(numPes % nx)
03284 group_size ++;
03285
03286 int my_prime = NAMDPrimes[0];
03287 int density = (ny * nz)/group_size + 1;
03288 int count = 0;
03289
03290
03291 for(count = 0; count < NPRIMES; count ++) {
03292
03293 if(density < NAMDPrimes[count]) {
03294 my_prime = NAMDPrimes[count];
03295 break;
03296 }
03297 }
03298
03299 if(count == NPRIMES)
03300 my_prime = NAMDPrimes[NPRIMES-1];
03301
03302
03303 int gcount = 0;
03304 int npme_pes = 0;
03305
03306 int coord[3];
03307
03308 for(int x = 0; x < nx; x++) {
03309 coord[0] = (x + nx/2)%nx;
03310
03311 for(count=0; count < group_size && npme_pes < numPes; count++) {
03312 int dest = (count + 1) * my_prime;
03313 dest = dest % (ny * nz);
03314
03315 coord[2] = dest / ny;
03316 coord[1] = dest - coord[2] * ny;
03317
03318
03319 int destPe = coord[dm.x] + coord[dm.y] * xsize +
03320 coord[dm.z] * xsize* ysize;
03321
03322 if(pmemap[destPe] == 0) {
03323 pemap[gcount++] = destPe;
03324 pmemap[destPe] = 1;
03325
03326 if(tmgr.hasMultipleProcsPerNode())
03327 pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
03328
03329 npme_pes ++;
03330 }
03331 else {
03332 for(int pos = 1; pos < ny * nz; pos++) {
03333
03334 coord[2] += pos / ny;
03335 coord[1] += pos % ny;
03336
03337 coord[2] = coord[2] % nz;
03338 coord[1] = coord[1] % ny;
03339
03340 int newdest = coord[dm.x] + coord[dm.y] * xsize +
03341 coord[dm.z] * xsize * ysize;
03342
03343 if(pmemap[newdest] == 0) {
03344 pemap[gcount++] = newdest;
03345 pmemap[newdest] = 1;
03346
03347 if(tmgr.hasMultipleProcsPerNode())
03348 pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
03349
03350 npme_pes ++;
03351 break;
03352 }
03353 }
03354 }
03355 }
03356
03357 if(gcount == numPes)
03358 gcount = 0;
03359
03360 if(npme_pes >= numPes)
03361 break;
03362 }
03363
03364 delete [] pmemap;
03365
03366 if(npme_pes != numPes)
03367
03368 return false;
03369
03370 return true;
03371 }
03372
03373 #endif
03374
03375 template <class T> class PmePencil : public T {
03376 public:
03377 PmePencil() {
03378 data = 0;
03379 work = 0;
03380 send_order = 0;
03381 needs_reply = 0;
03382 #if USE_PERSISTENT
03383 trans_handle = untrans_handle = ungrid_handle = NULL;
03384 #endif
03385 }
03386 ~PmePencil() {
03387 #ifdef NAMD_FFTW
03388 fftwf_free(data);
03389 #endif
03390 delete [] work;
03391 delete [] send_order;
03392 delete [] needs_reply;
03393 }
03394 void base_init(PmePencilInitMsg *msg) {
03395 imsg=0;
03396 imsgb=0;
03397 hasData=0;
03398 initdata = msg->data;
03399 }
03400 void order_init(int nBlocks) {
03401 send_order = new int[nBlocks];
03402 for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
03403 if ( Node::Object()->simParameters->PMESendOrder ) {
03404 std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
03405 } else {
03406 Random rand(CkMyPe());
03407 rand.reorder(send_order,nBlocks);
03408 }
03409 needs_reply = new int[nBlocks];
03410 }
03411 PmePencilInitMsgData initdata;
03412 Lattice lattice;
03413 PmeReduction evir;
03414 int sequence;
03415 int imsg;
03416 int imsgb;
03417 int hasData;
03418 float *data;
03419 float *work;
03420 int *send_order;
03421 int *needs_reply;
03422 #if USE_PERSISTENT
03423 PersistentHandle *trans_handle;
03424 PersistentHandle *untrans_handle;
03425 PersistentHandle *ungrid_handle;
03426 #endif
03427 };
03428
03429 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
03430 public:
03431 PmeZPencil_SDAG_CODE
03432 PmeZPencil() { __sdag_init(); setMigratable(false); }
03433 PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
03434 ~PmeZPencil() {
03435 #ifdef NAMD_FFTW
03436 #ifdef NAMD_FFTW_3
03437 delete [] forward_plans;
03438 delete [] backward_plans;
03439 #endif
03440 #endif
03441 }
03442 void fft_init();
03443 void recv_grid(const PmeGridMsg *);
03444 void forward_fft();
03445 void send_trans();
03446 void send_subset_trans(int fromIdx, int toIdx);
03447 void recv_untrans(const PmeUntransMsg *);
03448 void node_process_untrans(PmeUntransMsg *);
03449 void node_process_grid(PmeGridMsg *);
03450 void backward_fft();
03451 void send_ungrid(PmeGridMsg *);
03452 void send_all_ungrid();
03453 void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
03454 private:
03455 ResizeArray<PmeGridMsg *> grid_msgs;
03456 #ifdef NAMD_FFTW
03457 #ifdef NAMD_FFTW_3
03458 fftwf_plan forward_plan, backward_plan;
03459
03460
03461 int numPlans;
03462 fftwf_plan *forward_plans, *backward_plans;
03463 #else
03464 rfftwnd_plan forward_plan, backward_plan;
03465 #endif
03466 #endif
03467
03468 int nx, ny;
03469 #if USE_PERSISTENT
03470 void setup_persistent() {
03471 int hd = 1;
03472 int zBlocks = initdata.zBlocks;
03473 int block3 = initdata.grid.block3;
03474 int dim3 = initdata.grid.dim3;
03475 CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03476 CmiAssert(yPencil_local);
03477 trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
03478 for ( int isend=0; isend<zBlocks; ++isend ) {
03479 int kb = send_order[isend];
03480 int nz1 = block3;
03481 if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
03482 int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
03483 int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8;
03484 int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
03485 int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
03486 trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03487 }
03488 }
03489
03490 void setup_ungrid_persistent()
03491 {
03492 ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
03493 for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
03494 int peer = grid_msgs[imsg]->sourceNode;
03495
03496 }
03497 }
03498 #endif
03499 };
03500
03501 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
03502 public:
03503 PmeYPencil_SDAG_CODE
03504 PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
03505 PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
03506 void fft_init();
03507 void recv_trans(const PmeTransMsg *);
03508 void forward_fft();
03509 void forward_subset_fft(int fromIdx, int toIdx);
03510 void send_trans();
03511 void send_subset_trans(int fromIdx, int toIdx);
03512 void recv_untrans(const PmeUntransMsg *);
03513 void node_process_trans(PmeTransMsg *);
03514 void node_process_untrans(PmeUntransMsg *);
03515 void backward_fft();
03516 void backward_subset_fft(int fromIdx, int toIdx);
03517 void send_untrans();
03518 void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03519 private:
03520 #ifdef NAMD_FFTW
03521 #ifdef NAMD_FFTW_3
03522 fftwf_plan forward_plan, backward_plan;
03523 #else
03524 fftw_plan forward_plan, backward_plan;
03525 #endif
03526 #endif
03527
03528 int nx, nz;
03529 #if USE_PERSISTENT
03530 void setup_persistent() {
03531 int yBlocks = initdata.yBlocks;
03532 int block2 = initdata.grid.block2;
03533 int K2 = initdata.grid.K2;
03534 int hd = 1;
03535 CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
03536 trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03537 for ( int isend=0; isend<yBlocks; ++isend ) {
03538 int jb = send_order[isend];
03539 int ny1 = block2;
03540 if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
03541 int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
03542 int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8;
03543 int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
03544 int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
03545 trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03546 }
03547
03548 CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
03549 untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03550 int send_evir = 1;
03551 for ( int isend=0; isend<yBlocks; ++isend ) {
03552 int jb = send_order[isend];
03553 int ny1 = block2;
03554 if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
03555 int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
03556 int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof(PmeReduction)*send_evir +sizeof( envelope) + PRIORITY_SIZE/8;
03557 int compress_start = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof( envelope);
03558 int compress_size = sizeof(float)*nx*ny1*nz*2;
03559 untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03560 if ( send_evir ) {
03561 send_evir = 0;
03562 }
03563 }
03564 }
03565 #endif
03566 };
03567
03568 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
03569 public:
03570 PmeXPencil_SDAG_CODE
03571 PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; }
03572 PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
03573 ~PmeXPencil() {
03574 #ifdef NAMD_FFTW
03575 #ifdef NAMD_FFTW_3
03576 delete [] forward_plans;
03577 delete [] backward_plans;
03578 #endif
03579 #endif
03580 }
03581 void fft_init();
03582 void recv_trans(const PmeTransMsg *);
03583 void forward_fft();
03584 void pme_kspace();
03585 void backward_fft();
03586 void send_untrans();
03587 void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03588 void node_process_trans(PmeTransMsg *);
03589 #ifdef NAMD_FFTW
03590 #ifdef NAMD_FFTW_3
03591 fftwf_plan forward_plan, backward_plan;
03592
03593 int numPlans;
03594 fftwf_plan *forward_plans, *backward_plans;
03595 #else
03596 fftw_plan forward_plan, backward_plan;
03597 #endif
03598 #endif
03599 int ny, nz;
03600 PmeKSpace *myKSpace;
03601 #if USE_PERSISTENT
03602 void setup_persistent() {
03603 int xBlocks = initdata.xBlocks;
03604 int block1 = initdata.grid.block1;
03605 int K1 = initdata.grid.K1;
03606 CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03607 untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
03608 int send_evir = 1;
03609 for ( int isend=0; isend<xBlocks; ++isend ) {
03610 int ib = send_order[isend];
03611 int nx1 = block1;
03612 if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
03613 int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
03614 int size = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir +
03615 sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8;
03616 int compress_start = sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof( envelope);
03617 int compress_size = sizeof(float)*nx1*ny*nz*2;
03618 untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
03619 if ( send_evir ) {
03620 send_evir = 0;
03621 }
03622
03623 }
03624 }
03625 #endif
03626
03627 };
03628
03629 void PmeZPencil::fft_init() {
03630 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03631 Node *node = nd.ckLocalBranch();
03632 SimParameters *simParams = node->simParameters;
03633
03634 #if USE_NODE_PAR_RECEIVE
03635 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
03636 #endif
03637
03638 int K1 = initdata.grid.K1;
03639 int K2 = initdata.grid.K2;
03640 int K3 = initdata.grid.K3;
03641 int dim3 = initdata.grid.dim3;
03642 int block1 = initdata.grid.block1;
03643 int block2 = initdata.grid.block2;
03644
03645 nx = block1;
03646 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03647 ny = block2;
03648 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03649
03650 #ifdef NAMD_FFTW
03651 CmiLock(ComputePmeMgr::fftw_plan_lock);
03652
03653 data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
03654 work = new float[dim3];
03655
03656 order_init(initdata.zBlocks);
03657
03658 #ifdef NAMD_FFTW_3
03659
03660
03661 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03662 int sizeLines=nx*ny;
03663 int planLineSizes[1];
03664 planLineSizes[0]=K3;
03665 int ndim=initdata.grid.dim3;
03666 int ndimHalf=ndim/2;
03667 forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
03668 (float *) data, NULL, 1,
03669 ndim,
03670 (fftwf_complex *) data, NULL, 1,
03671 ndimHalf,
03672 fftwFlags);
03673
03674 backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
03675 (fftwf_complex *) data, NULL, 1,
03676 ndimHalf,
03677 (float *) data, NULL, 1,
03678 ndim,
03679 fftwFlags);
03680 #if USE_CKLOOP
03681 if(simParams->useCkLoop) {
03682
03683
03684 numPlans = (nx<=ny?nx:ny);
03685 int howmany = sizeLines/numPlans;
03686 forward_plans = new fftwf_plan[numPlans];
03687 backward_plans = new fftwf_plan[numPlans];
03688 for(int i=0; i<numPlans; i++) {
03689 int dimStride = i*ndim*howmany;
03690 int dimHalfStride = i*ndimHalf*howmany;
03691 forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
03692 ((float *)data)+dimStride, NULL, 1,
03693 ndim,
03694 ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03695 ndimHalf,
03696 fftwFlags);
03697
03698 backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
03699 ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03700 ndimHalf,
03701 ((float *)data)+dimStride, NULL, 1,
03702 ndim,
03703 fftwFlags);
03704 }
03705 }else
03706 #endif
03707 {
03708 forward_plans = NULL;
03709 backward_plans = NULL;
03710 }
03711 #else
03712 forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
03713 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03714 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03715 backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
03716 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03717 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03718 #endif
03719 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03720 #else
03721 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03722 #endif
03723
03724 #if USE_NODE_PAR_RECEIVE
03725 evir = 0.;
03726 memset(data, 0, sizeof(float) * nx*ny*dim3);
03727 #endif
03728 }
03729
03730 void PmeYPencil::fft_init() {
03731 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03732 Node *node = nd.ckLocalBranch();
03733 SimParameters *simParams = node->simParameters;
03734
03735 #if USE_NODE_PAR_RECEIVE
03736 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
03737 #endif
03738
03739 int K1 = initdata.grid.K1;
03740 int K2 = initdata.grid.K2;
03741 int dim2 = initdata.grid.dim2;
03742 int dim3 = initdata.grid.dim3;
03743 int block1 = initdata.grid.block1;
03744 int block3 = initdata.grid.block3;
03745
03746 nx = block1;
03747 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03748 nz = block3;
03749 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03750
03751 #ifdef NAMD_FFTW
03752 CmiLock(ComputePmeMgr::fftw_plan_lock);
03753
03754 data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
03755 work = new float[2*K2];
03756
03757 order_init(initdata.yBlocks);
03758
03759 #ifdef NAMD_FFTW_3
03760
03761
03762
03763
03764 int sizeLines=nz;
03765 int planLineSizes[1];
03766 planLineSizes[0]=K2;
03767 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03768 forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03769 (fftwf_complex *) data, NULL, sizeLines, 1,
03770 (fftwf_complex *) data, NULL, sizeLines, 1,
03771 FFTW_FORWARD,
03772 fftwFlags);
03773 backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03774 (fftwf_complex *) data, NULL, sizeLines, 1,
03775 (fftwf_complex *) data, NULL, sizeLines, 1,
03776 FFTW_BACKWARD,
03777 fftwFlags);
03778 CkAssert(forward_plan != NULL);
03779 CkAssert(backward_plan != NULL);
03780 #else
03781 forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
03782 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03783 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03784 nz, (fftw_complex *) work, 1);
03785 backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
03786 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03787 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03788 nz, (fftw_complex *) work, 1);
03789 #endif
03790 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03791 #else
03792 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03793 #endif
03794
03795 #if USE_NODE_PAR_RECEIVE
03796 evir = 0;
03797 CmiMemoryWriteFence();
03798 #endif
03799 }
03800
03801 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
03802 {
03803 if ( msg->hasData ) hasData = 1;
03804 needs_reply[msg->sourceNode] = msg->hasData;
03805 recv_trans(msg);
03806 int limsg;
03807 CmiMemoryAtomicFetchAndInc(imsg,limsg);
03808 if(limsg+1 == initdata.yBlocks)
03809 {
03810 if ( hasData ) {
03811 forward_fft();
03812 }
03813 send_trans();
03814 if( ! hasData)
03815 {
03816 send_untrans();
03817 }
03818 imsg=0;
03819 CmiMemoryWriteFence();
03820 }
03821 }
03822
03823 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
03824 {
03825 recv_untrans(msg);
03826 int limsg;
03827 CmiMemoryAtomicFetchAndInc(imsgb,limsg);
03828 if(limsg+1 == initdata.yBlocks)
03829 {
03830 backward_fft();
03831 send_untrans();
03832 imsgb=0;
03833 CmiMemoryWriteFence();
03834 }
03835 }
03836
03837 #define DEBUG_NODE_PAR_RECV 0
03838
03839 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
03840
03841 PmeXPencil *target=xPencilObj.get(msg->destElem);
03842 #if DEBUG_NODE_PAR_RECV
03843 if(target == NULL)
03844 CkAbort("xpencil in recvXTrans not found, debug registeration");
03845 #endif
03846 target->node_process_trans(msg);
03847 delete msg;
03848 }
03849
03850
03851 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
03852
03853 PmeYPencil *target=yPencilObj.get(msg->destElem);
03854 #if DEBUG_NODE_PAR_RECV
03855 if(target == NULL)
03856 CkAbort("ypencil in recvYTrans not found, debug registeration");
03857 #endif
03858 target->node_process_trans(msg);
03859 delete msg;
03860 }
03861 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
03862
03863 PmeYPencil *target=yPencilObj.get(msg->destElem);
03864 #if DEBUG_NODE_PAR_RECV
03865 if(target == NULL)
03866 CkAbort("ypencil in recvYUntrans not found, debug registeration");
03867 #endif
03868 target->node_process_untrans(msg);
03869 delete msg;
03870 }
03871 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
03872
03873 PmeZPencil *target=zPencilObj.get(msg->destElem);
03874 #if DEBUG_NODE_PAR_RECV
03875 if(target == NULL)
03876 CkAbort("zpencil in recvZUntrans not found, debug registeration");
03877 #endif
03878 target->node_process_untrans(msg);
03879 delete msg;
03880 }
03881
03882 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
03883
03884 PmeZPencil *target=zPencilObj.get(msg->destElem);
03885 #if DEBUG_NODE_PAR_RECV
03886 if(target == NULL){
03887 CkAbort("zpencil in recvZGrid not found, debug registeration");
03888 }
03889 #endif
03890 target->node_process_grid(msg);
03891 }
03892
03893 void PmeXPencil::fft_init() {
03894 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03895 Node *node = nd.ckLocalBranch();
03896 SimParameters *simParams = node->simParameters;
03897 #if USE_NODE_PAR_RECEIVE
03898 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
03899 #endif
03900
03901 int K1 = initdata.grid.K1;
03902 int K2 = initdata.grid.K2;
03903 int dim3 = initdata.grid.dim3;
03904 int block2 = initdata.grid.block2;
03905 int block3 = initdata.grid.block3;
03906
03907 ny = block2;
03908 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03909 nz = block3;
03910 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03911
03912 #ifdef NAMD_FFTW
03913 CmiLock(ComputePmeMgr::fftw_plan_lock);
03914
03915 data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
03916 work = new float[2*K1];
03917
03918 order_init(initdata.xBlocks);
03919
03920 #ifdef NAMD_FFTW_3
03921
03922 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03923 int sizeLines=ny*nz;
03924 int planLineSizes[1];
03925 planLineSizes[0]=K1;
03926 forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03927 (fftwf_complex *) data, NULL, sizeLines, 1,
03928 (fftwf_complex *) data, NULL, sizeLines, 1,
03929 FFTW_FORWARD,
03930 fftwFlags);
03931 backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03932 (fftwf_complex *) data, NULL, sizeLines, 1,
03933 (fftwf_complex *) data, NULL, sizeLines, 1,
03934 FFTW_BACKWARD,
03935 fftwFlags);
03936
03937 #if USE_CKLOOP
03938 if(simParams->useCkLoop) {
03939
03940
03941 numPlans = (ny<=nz?ny:nz);
03942 int howmany = sizeLines/numPlans;
03943 forward_plans = new fftwf_plan[numPlans];
03944 backward_plans = new fftwf_plan[numPlans];
03945 for(int i=0; i<numPlans; i++) {
03946 int curStride = i*howmany;
03947 forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03948 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03949 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03950 FFTW_FORWARD,
03951 fftwFlags);
03952
03953 backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03954 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03955 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03956 FFTW_BACKWARD,
03957 fftwFlags);
03958 }
03959 }else
03960 #endif
03961 {
03962 forward_plans = NULL;
03963 backward_plans = NULL;
03964 }
03965 #else
03966 forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
03967 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03968 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03969 ny*nz, (fftw_complex *) work, 1);
03970 backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
03971 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03972 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03973 ny*nz, (fftw_complex *) work, 1);
03974 #endif
03975 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03976 #else
03977 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03978 #endif
03979
03980 myKSpace = new PmeKSpace(initdata.grid,
03981 thisIndex.y*block2, thisIndex.y*block2 + ny,
03982 thisIndex.z*block3, thisIndex.z*block3 + nz);
03983
03984 }
03985
03986
03987
03988
03989 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
03990
03991 int dim3 = initdata.grid.dim3;
03992 if ( imsg == 0 ) {
03993 lattice = msg->lattice;
03994 sequence = msg->sequence;
03995 #if ! USE_NODE_PAR_RECEIVE
03996 memset(data, 0, sizeof(float)*nx*ny*dim3);
03997 #endif
03998 }
03999
04000 if ( ! msg->hasData ) return;
04001
04002 int zlistlen = msg->zlistlen;
04003 int *zlist = msg->zlist;
04004 char *fmsg = msg->fgrid;
04005 float *qmsg = msg->qgrid;
04006 float *d = data;
04007 int numGrids = 1;
04008 for ( int g=0; g<numGrids; ++g ) {
04009 for ( int i=0; i<nx; ++i ) {
04010 for ( int j=0; j<ny; ++j, d += dim3 ) {
04011 if( *(fmsg++) ) {
04012 for ( int k=0; k<zlistlen; ++k ) {
04013 d[zlist[k]] += *(qmsg++);
04014 }
04015 }
04016 }
04017 }
04018 }
04019 }
04020
04021 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
04022 #ifdef NAMD_FFTW
04023 #ifdef NAMD_FFTW_3
04024 fftwf_plan *plans = (fftwf_plan *)param;
04025 for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
04026 #endif
04027 #endif
04028 }
04029
04030 void PmeZPencil::forward_fft() {
04031 evir = 0.;
04032 #ifdef FFTCHECK
04033 int dim3 = initdata.grid.dim3;
04034 int K3 = initdata.grid.K3;
04035 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
04036 float *d = data;
04037 for ( int i=0; i<nx; ++i ) {
04038 for ( int j=0; j<ny; ++j, d += dim3 ) {
04039 for ( int k=0; k<dim3; ++k ) {
04040 d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
04041 }
04042 }
04043 }
04044 #endif
04045 #ifdef NAMD_FFTW
04046 #ifdef MANUAL_DEBUG_FFTW3
04047 dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
04048 #endif
04049 #ifdef NAMD_FFTW_3
04050 #if USE_CKLOOP
04051 int useCkLoop = Node::Object()->simParameters->useCkLoop;
04052 if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04053
04054
04055 CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
04056 return;
04057 }
04058 #endif
04059 fftwf_execute(forward_plan);
04060 #else
04061 rfftwnd_real_to_complex(forward_plan, nx*ny,
04062 data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
04063 #endif
04064 #ifdef MANUAL_DEBUG_FFTW3
04065 dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
04066 #endif
04067
04068 #endif
04069 #ifdef ZEROCHECK
04070 int dim3 = initdata.grid.dim3;
04071 int K3 = initdata.grid.K3;
04072 float *d = data;
04073 for ( int i=0; i<nx; ++i ) {
04074 for ( int j=0; j<ny; ++j, d += dim3 ) {
04075 for ( int k=0; k<dim3; ++k ) {
04076 if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
04077 thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
04078 }
04079 }
04080 }
04081 #endif
04082 }
04083
04084
04085 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04086 PmeZPencil *zpencil = (PmeZPencil *)param;
04087 zpencil->send_subset_trans(first, last);
04088 }
04089
04090 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
04091 int zBlocks = initdata.zBlocks;
04092 int block3 = initdata.grid.block3;
04093 int dim3 = initdata.grid.dim3;
04094 for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04095 int kb = send_order[isend];
04096 int nz = block3;
04097 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04098 int hd = ( hasData ? 1 : 0 );
04099 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04100 msg->lattice = lattice;
04101 msg->sourceNode = thisIndex.y;
04102 msg->hasData = hasData;
04103 msg->nx = ny;
04104 if ( hasData ) {
04105 float *md = msg->qgrid;
04106 const float *d = data;
04107 for ( int i=0; i<nx; ++i ) {
04108 for ( int j=0; j<ny; ++j, d += dim3 ) {
04109 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04110 *(md++) = d[2*k];
04111 *(md++) = d[2*k+1];
04112 }
04113 }
04114 }
04115 }
04116 msg->sequence = sequence;
04117 SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04118
04119 CmiEnableUrgentSend(1);
04120 #if USE_NODE_PAR_RECEIVE
04121 msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04122 #if Y_PERSIST
04123 CmiUsePersistentHandle(&trans_handle[isend], 1);
04124 #endif
04125 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04126 #if Y_PERSIST
04127 CmiUsePersistentHandle(NULL, 0);
04128 #endif
04129 #else
04130 #if Y_PERSIST
04131 CmiUsePersistentHandle(&trans_handle[isend], 1);
04132 #endif
04133 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04134 #if Y_PERSIST
04135 CmiUsePersistentHandle(NULL, 0);
04136 #endif
04137 #endif
04138 CmiEnableUrgentSend(0);
04139 }
04140 }
04141
04142 void PmeZPencil::send_trans() {
04143 #if USE_PERSISTENT
04144 if (trans_handle == NULL) setup_persistent();
04145 #endif
04146 #if USE_CKLOOP
04147 Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04148 if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS) {
04155
04156 CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1);
04157 return;
04158 }
04159 #endif
04160 int zBlocks = initdata.zBlocks;
04161 int block3 = initdata.grid.block3;
04162 int dim3 = initdata.grid.dim3;
04163 for ( int isend=0; isend<zBlocks; ++isend ) {
04164 int kb = send_order[isend];
04165 int nz = block3;
04166 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04167 int hd = ( hasData ? 1 : 0 );
04168 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04169 msg->lattice = lattice;
04170 msg->sourceNode = thisIndex.y;
04171 msg->hasData = hasData;
04172 msg->nx = ny;
04173 if ( hasData ) {
04174 float *md = msg->qgrid;
04175 const float *d = data;
04176 for ( int i=0; i<nx; ++i ) {
04177 for ( int j=0; j<ny; ++j, d += dim3 ) {
04178 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04179 *(md++) = d[2*k];
04180 *(md++) = d[2*k+1];
04181 }
04182 }
04183 }
04184 }
04185 msg->sequence = sequence;
04186 SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04187
04188 CmiEnableUrgentSend(1);
04189 #if USE_NODE_PAR_RECEIVE
04190 msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04191 #if Y_PERSIST
04192 CmiUsePersistentHandle(&trans_handle[isend], 1);
04193 #endif
04194 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04195 #if Y_PERSIST
04196 CmiUsePersistentHandle(NULL, 0);
04197 #endif
04198 #else
04199 #if Y_PERSIST
04200 CmiUsePersistentHandle(&trans_handle[isend], 1);
04201 #endif
04202 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04203 #if Y_PERSIST
04204 CmiUsePersistentHandle(NULL, 0);
04205 #endif
04206 #endif
04207 CmiEnableUrgentSend(0);
04208 }
04209 }
04210
04211 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
04212 if ( imsg == 0 ) {
04213 lattice = msg->lattice;
04214 sequence = msg->sequence;
04215 }
04216 int block2 = initdata.grid.block2;
04217 int K2 = initdata.grid.K2;
04218 int jb = msg->sourceNode;
04219 int ny = msg->nx;
04220 if ( msg->hasData ) {
04221 const float *md = msg->qgrid;
04222 float *d = data;
04223 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04224 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04225 for ( int k=0; k<nz; ++k ) {
04226 #ifdef ZEROCHECK
04227 if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
04228 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04229 #endif
04230 d[2*(j*nz+k)] = *(md++);
04231 d[2*(j*nz+k)+1] = *(md++);
04232 }
04233 }
04234 }
04235 } else {
04236 float *d = data;
04237 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04238 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04239 for ( int k=0; k<nz; ++k ) {
04240 d[2*(j*nz+k)] = 0;
04241 d[2*(j*nz+k)+1] = 0;
04242 }
04243 }
04244 }
04245 }
04246 }
04247
04248 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
04249 PmeYPencil *ypencil = (PmeYPencil *)param;
04250 ypencil->forward_subset_fft(first, last);
04251 }
04252 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
04253 #ifdef NAMD_FFTW
04254 #ifdef NAMD_FFTW_3
04255 for(int i=fromIdx; i<=toIdx; i++){
04256 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
04257 * nz * initdata.grid.K2,
04258 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04259 }
04260 #endif
04261 #endif
04262 }
04263
04264 void PmeYPencil::forward_fft() {
04265 evir = 0.;
04266 #ifdef NAMD_FFTW
04267 #ifdef MANUAL_DEBUG_FFTW3
04268 dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04269 #endif
04270
04271 #ifdef NAMD_FFTW_3
04272 #if USE_CKLOOP
04273 int useCkLoop = Node::Object()->simParameters->useCkLoop;
04274 if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04275 CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
04276 return;
04277 }
04278 #endif
04279
04280 for ( int i=0; i<nx; ++i ) {
04281 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
04282 * nz * initdata.grid.K2,
04283 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04284 }
04285 #else
04286 for ( int i=0; i<nx; ++i ) {
04287 fftw(forward_plan, nz,
04288 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04289 nz, 1, (fftw_complex *) work, 1, 0);
04290 }
04291 #endif
04292 #ifdef MANUAL_DEBUG_FFTW3
04293 dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04294 #endif
04295
04296 #endif
04297 }
04298
04299 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04300 PmeYPencil *ypencil = (PmeYPencil *)param;
04301 ypencil->send_subset_trans(first, last);
04302 }
04303
04304 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
04305 int yBlocks = initdata.yBlocks;
04306 int block2 = initdata.grid.block2;
04307 int K2 = initdata.grid.K2;
04308 for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04309 int jb = send_order[isend];
04310 int ny = block2;
04311 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04312 int hd = ( hasData ? 1 : 0 );
04313 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04314 msg->lattice = lattice;
04315 msg->sourceNode = thisIndex.x;
04316 msg->hasData = hasData;
04317 msg->nx = nx;
04318 if ( hasData ) {
04319 float *md = msg->qgrid;
04320 const float *d = data;
04321 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04322 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04323 for ( int k=0; k<nz; ++k ) {
04324 *(md++) = d[2*(j*nz+k)];
04325 *(md++) = d[2*(j*nz+k)+1];
04326 #ifdef ZEROCHECK
04327 if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04328 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04329 #endif
04330 }
04331 }
04332 }
04333 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04334 thisIndex.x, jb, thisIndex.z);
04335 }
04336 msg->sequence = sequence;
04337 SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04338 CmiEnableUrgentSend(1);
04339 #if USE_NODE_PAR_RECEIVE
04340 msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04341 #if X_PERSIST
04342 CmiUsePersistentHandle(&trans_handle[isend], 1);
04343 #endif
04344 initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
04345 #if X_PERSIST
04346 CmiUsePersistentHandle(NULL, 0);
04347 #endif
04348 #else
04349 #if X_PERSIST
04350 CmiUsePersistentHandle(&trans_handle[isend], 1);
04351 #endif
04352 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04353 #if X_PERSIST
04354 CmiUsePersistentHandle(NULL, 0);
04355 #endif
04356 #endif
04357 CmiEnableUrgentSend(0);
04358 }
04359 }
04360
04361 void PmeYPencil::send_trans() {
04362 #if USE_PERSISTENT
04363 if (trans_handle == NULL) setup_persistent();
04364 #endif
04365 #if USE_CKLOOP
04366 Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04367 if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS) {
04374
04375 CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1);
04376 return;
04377 }
04378 #endif
04379 int yBlocks = initdata.yBlocks;
04380 int block2 = initdata.grid.block2;
04381 int K2 = initdata.grid.K2;
04382 for ( int isend=0; isend<yBlocks; ++isend ) {
04383 int jb = send_order[isend];
04384 int ny = block2;
04385 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04386 int hd = ( hasData ? 1 : 0 );
04387 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04388 msg->lattice = lattice;
04389 msg->sourceNode = thisIndex.x;
04390 msg->hasData = hasData;
04391 msg->nx = nx;
04392 if ( hasData ) {
04393 float *md = msg->qgrid;
04394 const float *d = data;
04395 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04396 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04397 for ( int k=0; k<nz; ++k ) {
04398 *(md++) = d[2*(j*nz+k)];
04399 *(md++) = d[2*(j*nz+k)+1];
04400 #ifdef ZEROCHECK
04401 if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04402 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04403 #endif
04404 }
04405 }
04406 }
04407 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04408 thisIndex.x, jb, thisIndex.z);
04409 }
04410 msg->sequence = sequence;
04411 SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04412 CmiEnableUrgentSend(1);
04413 #if USE_NODE_PAR_RECEIVE
04414 msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04415 #if X_PERSIST
04416 CmiUsePersistentHandle(&trans_handle[isend], 1);
04417 #endif
04418 initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
04419 #if X_PERSIST
04420 CmiUsePersistentHandle(NULL, 0);
04421 #endif
04422 #else
04423 #if X_PERSIST
04424 CmiUsePersistentHandle(&trans_handle[isend], 1);
04425 #endif
04426 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04427 #if X_PERSIST
04428 CmiUsePersistentHandle(NULL, 0);
04429 #endif
04430
04431 #endif
04432 CmiEnableUrgentSend(0);
04433 }
04434 }
04435
04436 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
04437 {
04438 if(msg->hasData) hasData=1;
04439 needs_reply[msg->sourceNode] = msg->hasData;
04440 recv_trans(msg);
04441 int limsg;
04442 CmiMemoryAtomicFetchAndInc(imsg,limsg);
04443 if(limsg+1 == initdata.xBlocks)
04444 {
04445 if(hasData){
04446 forward_fft();
04447 pme_kspace();
04448 backward_fft();
04449 }
04450 send_untrans();
04451 imsg=0;
04452 CmiMemoryWriteFence();
04453 }
04454 }
04455
04456 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
04457 if ( imsg == 0 ) {
04458 lattice = msg->lattice;
04459 sequence = msg->sequence;
04460 }
04461 int block1 = initdata.grid.block1;
04462 int K1 = initdata.grid.K1;
04463 int ib = msg->sourceNode;
04464 int nx = msg->nx;
04465 if ( msg->hasData ) {
04466 const float *md = msg->qgrid;
04467 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04468 float *d = data + i*ny*nz*2;
04469 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04470 for ( int k=0; k<nz; ++k ) {
04471 #ifdef ZEROCHECK
04472 if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
04473 ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
04474 #endif
04475 d[2*k] = *(md++);
04476 d[2*k+1] = *(md++);
04477 }
04478 }
04479 }
04480 } else {
04481 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04482 float *d = data + i*ny*nz*2;
04483 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04484 for ( int k=0; k<nz; ++k ) {
04485 d[2*k] = 0;
04486 d[2*k+1] = 0;
04487 }
04488 }
04489 }
04490 }
04491 }
04492
04493 void PmeXPencil::forward_fft() {
04494 #ifdef NAMD_FFTW
04495
04496 #ifdef MANUAL_DEBUG_FFTW3
04497 dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04498 #endif
04499
04500 #ifdef NAMD_FFTW_3
04501 #if USE_CKLOOP
04502 int useCkLoop = Node::Object()->simParameters->useCkLoop;
04503 if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT) {
04504
04505
04506 CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
04507 return;
04508 }
04509 #endif
04510 fftwf_execute(forward_plan);
04511 #else
04512 fftw(forward_plan, ny*nz,
04513 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04514 #endif
04515 #ifdef MANUAL_DEBUG_FFTW3
04516 dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04517 #endif
04518
04519 #endif
04520 }
04521
04522 void PmeXPencil::pme_kspace() {
04523
04524 evir = 0.;
04525
04526 #ifdef FFTCHECK
04527 return;
04528 #endif
04529
04530 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
04531
04532 int numGrids = 1;
04533 for ( int g=0; g<numGrids; ++g ) {
04534 evir[0] = myKSpace->compute_energy(data+0*g,
04535 lattice, ewaldcof, &(evir[1]));
04536 }
04537
04538 #if USE_NODE_PAR_RECEIVE
04539 CmiMemoryWriteFence();
04540 #endif
04541 }
04542
04543 void PmeXPencil::backward_fft() {
04544 #ifdef NAMD_FFTW
04545 #ifdef MANUAL_DEBUG_FFTW3
04546 dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04547 #endif
04548
04549 #ifdef NAMD_FFTW_3
04550 #if USE_CKLOOP
04551 int useCkLoop = Node::Object()->simParameters->useCkLoop;
04552 if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
04553
04554
04555 CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
04556 return;
04557 }
04558 #endif
04559 fftwf_execute(backward_plan);
04560 #else
04561 fftw(backward_plan, ny*nz,
04562 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04563 #endif
04564 #ifdef MANUAL_DEBUG_FFTW3
04565 dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04566 #endif
04567 #endif
04568 }
04569
04570 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04571 int evirIdx = paraNum;
04572 PmeXPencil *xpencil = (PmeXPencil *)param;
04573 xpencil->send_subset_untrans(first, last, evirIdx);
04574 }
04575
04576 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04577 int xBlocks = initdata.xBlocks;
04578 int block1 = initdata.grid.block1;
04579 int K1 = initdata.grid.K1;
04580
04581 int ackL=0, ackH=-1;
04582 int unL=0, unH=-1;
04583 int send_evir=0;
04584 if(fromIdx >= evirIdx+1) {
04585
04586 unL = fromIdx;
04587 unH = toIdx;
04588 } else if(toIdx <= evirIdx-1) {
04589
04590 ackL=fromIdx;
04591 ackH=toIdx;
04592 } else {
04593
04594 ackL=fromIdx;
04595 ackH=evirIdx-1;
04596 send_evir=1;
04597 unL=evirIdx+1;
04598 unH=toIdx;
04599 }
04600
04601 for(int isend=ackL; isend<=ackH; isend++) {
04602
04603 CmiEnableUrgentSend(1);
04604 int ib = send_order[isend];
04605 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04606 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04607 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04608 CmiEnableUrgentSend(0);
04609 }
04610
04611 CmiEnableUrgentSend(1);
04612
04613 if(send_evir) {
04614 int ib = send_order[evirIdx];
04615 int nx = block1;
04616 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04617 PmeUntransMsg *msg = new (1,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04618 msg->evir[0] = evir;
04619 msg->has_evir = 1;
04620 msg->sourceNode = thisIndex.y;
04621 msg->ny = ny;
04622 float *md = msg->qgrid;
04623 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04624 float *d = data + i*ny*nz*2;
04625 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04626 for ( int k=0; k<nz; ++k ) {
04627 *(md++) = d[2*k];
04628 *(md++) = d[2*k+1];
04629 }
04630 }
04631 }
04632 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04633 #if USE_NODE_PAR_RECEIVE
04634 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04635 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04636 #else
04637 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04638 #endif
04639 }
04640 CmiEnableUrgentSend(0);
04641
04642
04643 for(int isend=unL; isend<=unH; isend++) {
04644 int ib = send_order[isend];
04645 int nx = block1;
04646 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04647 PmeUntransMsg *msg = new (0,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04648 msg->has_evir = 0;
04649 msg->sourceNode = thisIndex.y;
04650 msg->ny = ny;
04651 float *md = msg->qgrid;
04652 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04653 float *d = data + i*ny*nz*2;
04654 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04655 for ( int k=0; k<nz; ++k ) {
04656 *(md++) = d[2*k];
04657 *(md++) = d[2*k+1];
04658 }
04659 }
04660 }
04661 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04662 CmiEnableUrgentSend(1);
04663 #if USE_NODE_PAR_RECEIVE
04664 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04665 #if Y_PERSIST
04666 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04667 #endif
04668 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04669 #if Y_PERSIST
04670 CmiUsePersistentHandle(NULL, 0);
04671 #endif
04672 #else
04673 #if Y_PERSIST
04674
04675 #endif
04676 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04677 #if Y_PERSIST
04678
04679 #endif
04680 #endif
04681 CmiEnableUrgentSend(0);
04682 }
04683 }
04684
04685 void PmeXPencil::send_untrans() {
04686 #if USE_PERSISTENT
04687 if (untrans_handle == NULL) setup_persistent();
04688 #endif
04689 #if USE_CKLOOP
04690 Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04691 if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
04692 int xBlocks = initdata.xBlocks;
04693 int evirIdx = 0;
04694 for ( int isend=0; isend<xBlocks; ++isend ) {
04695 int ib = send_order[isend];
04696 if (needs_reply[ib]) {
04697 evirIdx = isend;
04698 break;
04699 }
04700 }
04701
04702
04703
04704
04705
04706
04707 #if USE_NODE_PAR_RECEIVE
04708
04709 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1);
04710 #else
04711
04712 CkLoop_Parallelize(PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0);
04713 #endif
04714 return;
04715 }
04716 #endif
04717 int xBlocks = initdata.xBlocks;
04718 int block1 = initdata.grid.block1;
04719 int K1 = initdata.grid.K1;
04720 int send_evir = 1;
04721 for ( int isend=0; isend<xBlocks; ++isend ) {
04722 int ib = send_order[isend];
04723 if ( ! needs_reply[ib] ) {
04724 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04725 CmiEnableUrgentSend(1);
04726 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04727 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04728 CmiEnableUrgentSend(0);
04729 continue;
04730 }
04731 int nx = block1;
04732 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04733 PmeUntransMsg *msg = new (send_evir, nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04734 if ( send_evir ) {
04735 msg->evir[0] = evir;
04736 msg->has_evir = 1;
04737 send_evir = 0;
04738 } else {
04739 msg->has_evir = 0;
04740 }
04741 msg->sourceNode = thisIndex.y;
04742 msg->ny = ny;
04743 float *md = msg->qgrid;
04744 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04745 float *d = data + i*ny*nz*2;
04746 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04747 for ( int k=0; k<nz; ++k ) {
04748 *(md++) = d[2*k];
04749 *(md++) = d[2*k+1];
04750 }
04751 }
04752 }
04753 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04754
04755 CmiEnableUrgentSend(1);
04756 #if USE_NODE_PAR_RECEIVE
04757 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04758 #if Y_PERSIST
04759 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04760 #endif
04761 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04762 #if Y_PERSIST
04763 CmiUsePersistentHandle(NULL, 0);
04764 #endif
04765 #else
04766 #if Y_PERSIST
04767 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04768 #endif
04769 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04770 #if Y_PERSIST
04771 CmiUsePersistentHandle(NULL, 0);
04772 #endif
04773 #endif
04774 CmiEnableUrgentSend(0);
04775 }
04776 }
04777
04778 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
04779 if ( msg->has_evir ) evir += msg->evir[0];
04780 int block2 = initdata.grid.block2;
04781 int K2 = initdata.grid.K2;
04782 int jb = msg->sourceNode;
04783 int ny = msg->ny;
04784 const float *md = msg->qgrid;
04785 float *d = data;
04786 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04787 #if CMK_BLUEGENEL
04788 CmiNetworkProgress();
04789 #endif
04790 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04791 for ( int k=0; k<nz; ++k ) {
04792 #ifdef ZEROCHECK
04793 if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
04794 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04795 #endif
04796 d[2*(j*nz+k)] = *(md++);
04797 d[2*(j*nz+k)+1] = *(md++);
04798 }
04799 }
04800 }
04801 }
04802
04803 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
04804 PmeYPencil *ypencil = (PmeYPencil *)param;
04805 ypencil->backward_subset_fft(first, last);
04806 }
04807
04808 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
04809 #ifdef NAMD_FFTW
04810 #ifdef NAMD_FFTW_3
04811 for(int i=fromIdx; i<=toIdx; i++){
04812 fftwf_execute_dft(backward_plan,
04813 ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04814 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04815 }
04816 #endif
04817 #endif
04818 }
04819
04820 void PmeYPencil::backward_fft() {
04821 #ifdef NAMD_FFTW
04822 #ifdef MANUAL_DEBUG_FFTW3
04823 dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04824 #endif
04825
04826 #ifdef NAMD_FFTW_3
04827 #if USE_CKLOOP
04828 int useCkLoop = Node::Object()->simParameters->useCkLoop;
04829 if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
04830 CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
04831 return;
04832 }
04833 #endif
04834
04835 for ( int i=0; i<nx; ++i ) {
04836 #if CMK_BLUEGENEL
04837 CmiNetworkProgress();
04838 #endif
04839 fftwf_execute_dft(backward_plan,
04840 ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04841 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04842 }
04843 #else
04844 for ( int i=0; i<nx; ++i ) {
04845 #if CMK_BLUEGENEL
04846 CmiNetworkProgress();
04847 #endif
04848 fftw(backward_plan, nz,
04849 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04850 nz, 1, (fftw_complex *) work, 1, 0);
04851 }
04852 #endif
04853
04854 #ifdef MANUAL_DEBUG_FFTW3
04855 dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04856 #endif
04857
04858 #endif
04859 }
04860
04861 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04862 int evirIdx = paraNum;
04863 PmeYPencil *ypencil = (PmeYPencil *)param;
04864 ypencil->send_subset_untrans(first, last, evirIdx);
04865 }
04866
04867 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04868 int yBlocks = initdata.yBlocks;
04869 int block2 = initdata.grid.block2;
04870 int K2 = initdata.grid.K2;
04871
04872 int ackL=0, ackH=-1;
04873 int unL=0, unH=-1;
04874 int send_evir=0;
04875 if(fromIdx >= evirIdx+1) {
04876
04877 unL = fromIdx;
04878 unH = toIdx;
04879 } else if(toIdx <= evirIdx-1) {
04880
04881 ackL=fromIdx;
04882 ackH=toIdx;
04883 } else {
04884
04885 ackL=fromIdx;
04886 ackH=evirIdx-1;
04887 send_evir=1;
04888 unL=evirIdx+1;
04889 unH=toIdx;
04890 }
04891
04892 for(int isend=ackL; isend<=ackH; isend++) {
04893
04894 CmiEnableUrgentSend(1);
04895 int jb = send_order[isend];
04896 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04897 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04898 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04899 CmiEnableUrgentSend(0);
04900 }
04901
04902 CmiEnableUrgentSend(1);
04903
04904 if(send_evir) {
04905 int jb = send_order[evirIdx];
04906 int ny = block2;
04907 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04908 PmeUntransMsg *msg = new (1,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04909 msg->evir[0] = evir;
04910 msg->has_evir = 1;
04911 msg->sourceNode = thisIndex.z;
04912 msg->ny = nz;
04913 float *md = msg->qgrid;
04914 const float *d = data;
04915 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04916 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04917 for ( int k=0; k<nz; ++k ) {
04918 *(md++) = d[2*(j*nz+k)];
04919 *(md++) = d[2*(j*nz+k)+1];
04920 }
04921 }
04922 }
04923 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04924 #if USE_NODE_PAR_RECEIVE
04925 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04926
04927 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04928 #else
04929 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04930 #endif
04931 }
04932
04933 CmiEnableUrgentSend(0);
04934
04935 for(int isend=unL; isend<=unH; isend++) {
04936 int jb = send_order[isend];
04937 int ny = block2;
04938 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04939 PmeUntransMsg *msg = new (0,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
04940 msg->has_evir = 0;
04941 msg->sourceNode = thisIndex.z;
04942 msg->ny = nz;
04943 float *md = msg->qgrid;
04944 const float *d = data;
04945 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04946 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04947 for ( int k=0; k<nz; ++k ) {
04948 *(md++) = d[2*(j*nz+k)];
04949 *(md++) = d[2*(j*nz+k)+1];
04950 }
04951 }
04952 }
04953 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04954 CmiEnableUrgentSend(1);
04955 #if USE_NODE_PAR_RECEIVE
04956 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04957
04958 #if Z_PERSIST
04959 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04960 #endif
04961 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04962 #if Z_PERSIST
04963 CmiUsePersistentHandle(NULL, 0);
04964 #endif
04965 #else
04966 #if Z_PERSIST
04967 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04968 #endif
04969 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04970 #if Z_PERSIST
04971 CmiUsePersistentHandle(NULL, 0);
04972 #endif
04973 #endif
04974 CmiEnableUrgentSend(0);
04975 }
04976 }
04977
04978 void PmeYPencil::send_untrans() {
04979 #if USE_PERSISTENT
04980 if (untrans_handle == NULL) setup_persistent();
04981 #endif
04982 #if USE_CKLOOP
04983 Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
04984 if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
04985 int yBlocks = initdata.yBlocks;
04986 int evirIdx = 0;
04987 for ( int isend=0; isend<yBlocks; ++isend ) {
04988 int jb = send_order[isend];
04989 if (needs_reply[jb]) {
04990 evirIdx = isend;
04991 break;
04992 }
04993 }
04994
04995
04996
04997
04998
04999
05000 #if USE_NODE_PAR_RECEIVE
05001
05002 CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
05003 evir = 0.;
05004 CmiMemoryWriteFence();
05005 #else
05006
05007 CkLoop_Parallelize(PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0);
05008 #endif
05009 return;
05010 }
05011 #endif
05012 int yBlocks = initdata.yBlocks;
05013 int block2 = initdata.grid.block2;
05014 int K2 = initdata.grid.K2;
05015 int send_evir = 1;
05016 for ( int isend=0; isend<yBlocks; ++isend ) {
05017 int jb = send_order[isend];
05018 if ( ! needs_reply[jb] ) {
05019 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
05020 CmiEnableUrgentSend(1);
05021 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
05022 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
05023 CmiEnableUrgentSend(0);
05024 continue;
05025 }
05026 int ny = block2;
05027 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
05028 PmeUntransMsg *msg = new (send_evir,nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
05029 if ( send_evir ) {
05030 msg->evir[0] = evir;
05031 msg->has_evir = 1;
05032 send_evir = 0;
05033 } else {
05034 msg->has_evir = 0;
05035 }
05036 msg->sourceNode = thisIndex.z;
05037 msg->ny = nz;
05038 float *md = msg->qgrid;
05039 const float *d = data;
05040 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
05041 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
05042 for ( int k=0; k<nz; ++k ) {
05043 *(md++) = d[2*(j*nz+k)];
05044 *(md++) = d[2*(j*nz+k)+1];
05045 }
05046 }
05047 }
05048 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
05049
05050 CmiEnableUrgentSend(1);
05051 #if USE_NODE_PAR_RECEIVE
05052 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
05053
05054 #if Z_PERSIST
05055 CmiUsePersistentHandle(&untrans_handle[isend], 1);
05056 #endif
05057 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
05058 #if Z_PERSIST
05059 CmiUsePersistentHandle(NULL, 0);
05060 #endif
05061 #else
05062 #if Z_PERSIST
05063 CmiUsePersistentHandle(&untrans_handle[isend], 1);
05064 #endif
05065 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
05066 #if Z_PERSIST
05067 CmiUsePersistentHandle(NULL, 0);
05068 #endif
05069 #endif
05070 CmiEnableUrgentSend(0);
05071 }
05072
05073 #if USE_NODE_PAR_RECEIVE
05074 evir = 0.;
05075 CmiMemoryWriteFence();
05076 #endif
05077 }
05078
05079 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
05080 #if ! USE_NODE_PAR_RECEIVE
05081 if(imsg==0) evir=0.;
05082 #endif
05083
05084 if ( msg->has_evir ) evir += msg->evir[0];
05085 int block3 = initdata.grid.block3;
05086 int dim3 = initdata.grid.dim3;
05087 int kb = msg->sourceNode;
05088 int nz = msg->ny;
05089 const float *md = msg->qgrid;
05090 float *d = data;
05091 for ( int i=0; i<nx; ++i ) {
05092 #if CMK_BLUEGENEL
05093 CmiNetworkProgress();
05094 #endif
05095 for ( int j=0; j<ny; ++j, d += dim3 ) {
05096 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
05097 #ifdef ZEROCHECK
05098 if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
05099 thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
05100 #endif
05101 d[2*k] = *(md++);
05102 d[2*k+1] = *(md++);
05103 }
05104 }
05105 }
05106 }
05107
05108 void PmeZPencil::backward_fft() {
05109 #ifdef NAMD_FFTW
05110 #ifdef MANUAL_DEBUG_FFTW3
05111 dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05112 #endif
05113 #ifdef NAMD_FFTW_3
05114 #if USE_CKLOOP
05115 int useCkLoop = Node::Object()->simParameters->useCkLoop;
05116 if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT) {
05117
05118
05119 CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
05120 return;
05121 }
05122 #endif
05123 fftwf_execute(backward_plan);
05124 #else
05125 rfftwnd_complex_to_real(backward_plan, nx*ny,
05126 (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
05127 #endif
05128 #ifdef MANUAL_DEBUG_FFTW3
05129 dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05130 #endif
05131
05132 #endif
05133
05134 #if CMK_BLUEGENEL
05135 CmiNetworkProgress();
05136 #endif
05137
05138 #ifdef FFTCHECK
05139 int dim3 = initdata.grid.dim3;
05140 int K1 = initdata.grid.K1;
05141 int K2 = initdata.grid.K2;
05142 int K3 = initdata.grid.K3;
05143 float scale = 1. / (1. * K1 * K2 * K3);
05144 float maxerr = 0.;
05145 float maxstd = 0.;
05146 int mi, mj, mk; mi = mj = mk = -1;
05147 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05148 const float *d = data;
05149 for ( int i=0; i<nx; ++i ) {
05150 for ( int j=0; j<ny; ++j, d += dim3 ) {
05151 for ( int k=0; k<K3; ++k ) {
05152 float std = 10. * (10. * (10. * std_base + i) + j) + k;
05153 float err = scale * d[k] - std;
05154 if ( fabsf(err) > fabsf(maxerr) ) {
05155 maxerr = err;
05156 maxstd = std;
05157 mi = i; mj = j; mk = k;
05158 }
05159 }
05160 }
05161 }
05162 CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
05163 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
05164 #endif
05165
05166 }
05167
05168 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
05169
05170
05171 int specialIdx = paraNum;
05172 PmeZPencil *zpencil = (PmeZPencil *)param;
05173 zpencil->send_subset_ungrid(first, last, specialIdx);
05174 }
05175
05176 void PmeZPencil::send_all_ungrid() {
05177
05178
05179
05180
05181
05182
05183
05184
05185
05186
05187
05188
05189
05190
05191
05192
05193
05194 int evirIdx = 0;
05195 for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
05196 if(grid_msgs[imsg]->hasData) {
05197 evirIdx = imsg;
05198 break;
05199 }
05200 }
05201
05202 #if USE_CKLOOP
05203 Bool useCkLoop = Node::Object()->simParameters->useCkLoop;
05204 if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS) {
05205
05206 #if USE_NODE_PAR_RECEIVE
05207
05208 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1);
05209 #else
05210
05211 CkLoop_Parallelize(PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0);
05212 #endif
05213 return;
05214 }
05215 #endif
05216 send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
05217 }
05218
05219 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
05220 for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
05221 PmeGridMsg *msg = grid_msgs[imsg];
05222 if ( msg->hasData) {
05223 if (imsg == specialIdx) {
05224 msg->evir[0] = evir;
05225 } else {
05226 msg->evir[0] = 0.;
05227 }
05228 }
05229 send_ungrid(msg);
05230 }
05231 }
05232
05233 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
05234 int pe = msg->sourceNode;
05235 if ( ! msg->hasData ) {
05236 delete msg;
05237 PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
05238 SET_PRIORITY(ackmsg,sequence,PME_UNGRID_PRIORITY)
05239 CmiEnableUrgentSend(1);
05240 initdata.pmeProxy[pe].recvAck(ackmsg);
05241 CmiEnableUrgentSend(0);
05242 return;
05243 }
05244 msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
05245 int dim3 = initdata.grid.dim3;
05246 int zlistlen = msg->zlistlen;
05247 int *zlist = msg->zlist;
05248 char *fmsg = msg->fgrid;
05249 float *qmsg = msg->qgrid;
05250 float *d = data;
05251 int numGrids = 1;
05252 for ( int g=0; g<numGrids; ++g ) {
05253 #if CMK_BLUEGENEL
05254 CmiNetworkProgress();
05255 #endif
05256 for ( int i=0; i<nx; ++i ) {
05257 for ( int j=0; j<ny; ++j, d += dim3 ) {
05258 if( *(fmsg++) ) {
05259 for ( int k=0; k<zlistlen; ++k ) {
05260 *(qmsg++) = d[zlist[k]];
05261 }
05262 }
05263 }
05264 }
05265 }
05266 SET_PRIORITY(msg,sequence,PME_UNGRID_PRIORITY)
05267 CmiEnableUrgentSend(1);
05268 initdata.pmeProxy[pe].recvUngrid(msg);
05269 CmiEnableUrgentSend(0);
05270 }
05271
05272 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
05273 {
05274 #if USE_NODE_PAR_RECEIVE
05275 CmiLock(ComputePmeMgr::fftw_plan_lock);
05276 CmiMemoryReadFence();
05277 #endif
05278 recv_grid(msg);
05279 if(msg->hasData) hasData=msg->hasData;
05280 int limsg;
05281 CmiMemoryAtomicFetchAndInc(imsg,limsg);
05282 grid_msgs[limsg] = msg;
05283
05284 if(limsg+1 == grid_msgs.size())
05285 {
05286
05287 if (hasData)
05288 {
05289 forward_fft();
05290 }
05291 send_trans();
05292 imsg=0;
05293 CmiMemoryWriteFence();
05294
05295 }
05296 #if USE_NODE_PAR_RECEIVE
05297 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05298 CmiMemoryWriteFence();
05299 #endif
05300 }
05301
05302 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
05303 {
05304 #if USE_NODE_PAR_RECEIVE
05305 CmiLock(ComputePmeMgr::fftw_plan_lock);
05306 CmiMemoryReadFence();
05307 #endif
05308 recv_untrans(msg);
05309 int limsg;
05310 CmiMemoryAtomicFetchAndInc(imsgb,limsg);
05311 if(limsg+1 == initdata.zBlocks)
05312 {
05313 if(hasData)
05314 {
05315 backward_fft();
05316 }
05317
05318 send_all_ungrid();
05319
05320
05321
05322
05323
05324
05325
05326
05327
05328
05329
05330
05331
05332
05333 imsgb=0;
05334 evir = 0;
05335 memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3);
05336 CmiMemoryWriteFence();
05337
05338 }
05339 #if USE_NODE_PAR_RECEIVE
05340 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05341 CmiMemoryWriteFence();
05342 #endif
05343 }
05344
05345
05346 #include "ComputePmeMgr.def.h"
05347