00001
00007 #ifdef NAMD_FFTW
00008
00009 #ifdef NAMD_FFTW_3
00010 #include <fftw3.h>
00011 #else
00012
00013 #define fftwf_malloc fftw_malloc
00014 #define fftwf_free fftw_free
00015 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
00016 #include <fftw.h>
00017 #include <rfftw.h>
00018 #else
00019 #include <sfftw.h>
00020 #include <srfftw.h>
00021 #endif
00022 #endif
00023 #endif
00024
00025 #include <vector>
00026 using namespace std;
00027
00028 #include "InfoStream.h"
00029 #include "Node.h"
00030 #include "PatchMap.h"
00031 #include "PatchMap.inl"
00032 #include "AtomMap.h"
00033 #include "ComputePme.h"
00034 #include "ComputePmeMgr.decl.h"
00035 #include "PmeRealSpace.h"
00036 #include "PmeKSpace.h"
00037 #include "ComputeNonbondedUtil.h"
00038 #include "PatchMgr.h"
00039 #include "Molecule.h"
00040 #include "ReductionMgr.h"
00041 #include "ComputeMgr.h"
00042 #include "ComputeMgr.decl.h"
00043
00044 #define MIN_DEBUG_LEVEL 3
00045 #include "Debug.h"
00046 #include "SimParameters.h"
00047 #include "WorkDistrib.h"
00048 #include "varsizemsg.h"
00049 #include "Random.h"
00050 #include "ckhashtable.h"
00051 #include "Priorities.h"
00052
00053 #include "ComputeMoa.h"
00054 #include "ComputeMoaMgr.decl.h"
00055
00056
00057
00058
00059
00060 #include "TopoManager.h"
00061
00062 #ifndef SQRT_PI
00063 #define SQRT_PI 1.7724538509055160273
00064 #endif
00065
00066 #if CMK_PERSISTENT_COMM
00067 #define USE_PERSISTENT 1
00068 #endif
00069
00070
00071
00072 char *pencilPMEProcessors;
00073
00074 class PmeAckMsg : public CMessage_PmeAckMsg {
00075 };
00076
00077 class PmeGridMsg : public CMessage_PmeGridMsg {
00078 public:
00079
00080 int sourceNode;
00081 int sequence;
00082 int hasData;
00083 Lattice lattice;
00084 PmeReduction *evir;
00085 int start;
00086 int len;
00087 int zlistlen;
00088 int *zlist;
00089 char *fgrid;
00090 float *qgrid;
00091 CkArrayIndex3D destElem;
00092 };
00093
00094 class PmeTransMsg : public CMessage_PmeTransMsg {
00095 public:
00096
00097 int sourceNode;
00098 int sequence;
00099 int hasData;
00100 Lattice lattice;
00101 int x_start;
00102 int nx;
00103 float *qgrid;
00104 CkArrayIndex3D destElem;
00105 };
00106
00107 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
00108 public:
00109 PmeTransMsg *msg;
00110 int *count;
00111 CmiNodeLock lock;
00112 };
00113
00114 class PmeUntransMsg : public CMessage_PmeUntransMsg {
00115 public:
00116
00117 int sourceNode;
00118 int has_evir;
00119 PmeReduction *evir;
00120 int y_start;
00121 int ny;
00122 float *qgrid;
00123 CkArrayIndex3D destElem;
00124 };
00125
00126 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
00127 public:
00128 PmeUntransMsg *msg;
00129 int *count;
00130 CmiNodeLock lock;
00131 };
00132
00133 class PmePencilMap : public CBase_PmePencilMap {
00134 public:
00135 PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
00136 : ia(i_a), ib(i_b), nb(n_b),
00137 size(n), data(newcopyint(n,d)) {
00138 }
00139 virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
00140
00141 return 0;
00142 }
00143 virtual int procNum(int, const CkArrayIndex &i) {
00144
00145 return data[ i.data()[ia] * nb + i.data()[ib] ];
00146 }
00147 virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
00148 int mype = CkMyPe();
00149 for ( int i=0; i < size; ++i ) {
00150 if ( data[i] == mype ) {
00151 CkArrayIndex3D ai(0,0,0);
00152 ai.data()[ia] = i / nb;
00153 ai.data()[ib] = i % nb;
00154 if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
00155 if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
00156 mgr->insertInitial(ai,msg);
00157 msg = 0;
00158 }
00159 }
00160 mgr->doneInserting();
00161 if ( msg ) CkFreeMsg(msg);
00162 }
00163 private:
00164 const int ia, ib, nb, size;
00165 const int* const data;
00166 static int* newcopyint(int n, int *d) {
00167 int *newd = new int[n];
00168 memcpy(newd, d, n*sizeof(int));
00169 return newd;
00170 }
00171 };
00172
00173
00174 struct PmePencilInitMsgData {
00175 PmeGrid grid;
00176 int xBlocks, yBlocks, zBlocks;
00177 CProxy_PmeXPencil xPencil;
00178 CProxy_PmeYPencil yPencil;
00179 CProxy_PmeZPencil zPencil;
00180 CProxy_ComputePmeMgr pmeProxy;
00181 CProxy_NodePmeMgr pmeNodeProxy;
00182 CProxy_PmePencilMap xm;
00183 CProxy_PmePencilMap ym;
00184 CProxy_PmePencilMap zm;
00185 };
00186
00187 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
00188 public:
00189 PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
00190 PmePencilInitMsgData data;
00191 };
00192
00193
00194 struct LocalPmeInfo {
00195 int nx, x_start;
00196 int ny_after_transpose, y_start_after_transpose;
00197 };
00198
00199 struct NodePmeInfo {
00200 int npe, pe_start, real_node;
00201 };
00202
00203
00204
00205 void generatePmePeList(int *peMap, int numPes){
00206
00207 int i;
00208 int ncpus = CkNumPes();
00209
00210
00211 int npow2 = 1; int nbits = 0;
00212 while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00213
00214
00215 SortableResizeArray<int> seq(ncpus);
00216 i = 0;
00217 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00218 int ri;
00219 for ( ri = ncpus; ri >= ncpus; ++i ) {
00220 ri = 0;
00221 int pow2 = 1;
00222 int rpow2 = npow2 / 2;
00223 for ( int j=0; j<nbits; ++j ) {
00224 ri += rpow2 * ( ( i / pow2 ) % 2 );
00225 pow2 *= 2; rpow2 /= 2;
00226 }
00227 }
00228 seq[icpu] = ri;
00229 }
00230
00231
00232 for ( i=0; i<numPes; ++i ) {
00233 seq[i] = seq[ncpus - numPes + i];
00234 }
00235 seq.resize(numPes);
00236 seq.sort();
00237
00238 for ( i=0; i<numPes; ++i )
00239 peMap[i] = seq[i];
00240
00241
00242 }
00243
00244
00245 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
00246
00247 int i;
00248 int ncpus = CkNumPes();
00249
00250
00251 int npow2 = 1; int nbits = 0;
00252 while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00253
00254
00255 SortableResizeArray<int> seq(ncpus);
00256 SortableResizeArray<int> seq2(ncpus);
00257 i = 0;
00258 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00259 int ri;
00260 for ( ri = ncpus; ri >= ncpus; ++i ) {
00261 ri = 0;
00262 int pow2 = 1;
00263 int rpow2 = npow2 / 2;
00264 for ( int j=0; j<nbits; ++j ) {
00265 ri += rpow2 * ( ( i / pow2 ) % 2 );
00266 pow2 *= 2; rpow2 /= 2;
00267 }
00268 }
00269 seq[icpu] = ri;
00270 seq2[icpu] = ri;
00271 }
00272
00273
00274 for ( i=0; i<numGridPes; ++i ) {
00275 seq[i] = seq[ncpus - numGridPes + i];
00276 }
00277 seq.resize(numGridPes);
00278 seq.sort();
00279 int firstTransPe = ncpus - numGridPes - numTransPes;
00280 if ( firstTransPe < 0 ) {
00281 firstTransPe = 0;
00282
00283 if ( ncpus > numTransPes ) firstTransPe = 1;
00284 }
00285 for ( i=0; i<numTransPes; ++i ) {
00286 seq2[i] = seq2[firstTransPe + i];
00287 }
00288 seq2.resize(numTransPes);
00289 seq2.sort();
00290
00291 for ( i=0; i<numGridPes; ++i )
00292 gridPeMap[i] = seq[i];
00293
00294 for ( i=0; i<numTransPes; ++i )
00295 transPeMap[i] = seq2[i];
00296 }
00297
00298 #if USE_TOPOMAP
00299
00300 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
00301 int nbpes=0);
00302 #endif
00303
00304 struct ijpair {
00305 int i,j;
00306 ijpair() {;}
00307 ijpair(int I, int J) : i(I), j(J) {;}
00308 };
00309
00310 class ComputePmeMgr : public BOCclass {
00311 public:
00312 friend class ComputePme;
00313 friend class NodePmeMgr;
00314 ComputePmeMgr();
00315 ~ComputePmeMgr();
00316
00317 void initialize(CkQdMsg*);
00318 void initialize_pencils(CkQdMsg*);
00319 void activate_pencils(CkQdMsg*);
00320 void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
00321
00322 void sendGrid(void);
00323 void recvGrid(PmeGridMsg *);
00324 void gridCalc1(void);
00325 void sendTransBarrier(void);
00326 void sendTrans(void);
00327 void fwdSharedTrans(PmeTransMsg *);
00328 void recvSharedTrans(PmeSharedTransMsg *);
00329 void recvTrans(PmeTransMsg *);
00330 void procTrans(PmeTransMsg *);
00331 void gridCalc2(void);
00332 #ifdef OPENATOM_VERSION
00333 void gridCalc2Moa(void);
00334 #endif // OPENATOM_VERSION
00335 void gridCalc2R(void);
00336 void fwdSharedUntrans(PmeUntransMsg *);
00337 void recvSharedUntrans(PmeSharedUntransMsg *);
00338 void sendUntrans(void);
00339 void recvUntrans(PmeUntransMsg *);
00340 void procUntrans(PmeUntransMsg *);
00341 void gridCalc3(void);
00342 void sendUngrid(void);
00343 void recvUngrid(PmeGridMsg *);
00344 void recvAck(PmeAckMsg *);
00345 void ungridCalc(void);
00346
00347 void setCompute(ComputePme *c) { pmeCompute = c; c->setMgr(this); }
00348
00349
00350 int isPmeProcessor(int p);
00351
00352 #ifdef NAMD_FFTW
00353 static CmiNodeLock fftw_plan_lock;
00354 #endif
00355
00356 private:
00357 CProxy_ComputePmeMgr pmeProxy;
00358 CProxy_ComputePmeMgr pmeProxyDir;
00359 CProxy_NodePmeMgr pmeNodeProxy;
00360
00361 ComputePme *pmeCompute;
00362 PmeGrid myGrid;
00363 Lattice lattice;
00364 PmeKSpace *myKSpace;
00365 float *qgrid;
00366 float *kgrid;
00367
00368 #ifdef NAMD_FFTW
00369 #ifdef NAMD_FFTW_3
00370 fftwf_plan *forward_plan_x, *backward_plan_x;
00371 fftwf_plan *forward_plan_yz, *backward_plan_yz;
00372 fftwf_complex *work;
00373 #else
00374 fftw_plan forward_plan_x, backward_plan_x;
00375 rfftwnd_plan forward_plan_yz, backward_plan_yz;
00376 fftw_complex *work;
00377 #endif
00378 #else
00379 float *work;
00380 #endif
00381
00382 int alchFepOn, alchThermIntOn, lesOn, lesFactor, pairOn, selfOn, numGrids;
00383 int alchDecouple;
00384 BigReal alchElecLambdaStart;
00385 BigReal elecLambdaUp;
00386 BigReal elecLambdaDown;
00387
00388
00389 LocalPmeInfo *localInfo;
00390 NodePmeInfo *gridNodeInfo;
00391 NodePmeInfo *transNodeInfo;
00392 int qgrid_size;
00393 int qgrid_start;
00394 int qgrid_len;
00395 int fgrid_start;
00396 int fgrid_len;
00397
00398 int numSources;
00399 int numGridPes;
00400 int numTransPes;
00401 int numGridNodes;
00402 int numTransNodes;
00403 int numDestRecipPes;
00404 int myGridPe, myGridNode;
00405 int myTransPe, myTransNode;
00406 int *gridPeMap;
00407 int *transPeMap;
00408 int *recipPeDest;
00409 int *gridPeOrder;
00410 int *gridNodeOrder;
00411 int *transNodeOrder;
00412 char *isPmeFlag;
00413 int grid_count;
00414 int trans_count;
00415 int untrans_count;
00416 int ungrid_count;
00417 PmeGridMsg **gridmsg_reuse;
00418 PmeReduction recip_evir[PME_MAX_EVALS];
00419 PmeReduction recip_evir2[PME_MAX_EVALS];
00420
00421 int sequence;
00422 int useBarrier;
00423 int sendTransBarrier_received;
00424
00425 int usePencils;
00426 int xBlocks, yBlocks, zBlocks;
00427 CProxy_PmeXPencil xPencil;
00428 CProxy_PmeYPencil yPencil;
00429 CProxy_PmeZPencil zPencil;
00430 char *pencilActive;
00431 ijpair *activePencils;
00432 int numPencilsActive;
00433 };
00434
00435 #ifdef NAMD_FFTW
00436 CmiNodeLock ComputePmeMgr::fftw_plan_lock;
00437 #endif
00438
00439 int isPmeProcessor(int p){
00440 return CProxy_ComputePmeMgr::ckLocalBranch(CkpvAccess(BOCclass_group).computePmeMgr)->isPmeProcessor(p);
00441 }
00442
00443 int ComputePmeMgr::isPmeProcessor(int p){
00444 return ( usePencils ? pencilPMEProcessors[p] : isPmeFlag[p] );
00445 }
00446
00447 class NodePmeMgr : public CBase_NodePmeMgr {
00448 public:
00449 friend class ComputePmeMgr;
00450 NodePmeMgr();
00451 ~NodePmeMgr();
00452 void initialize();
00453 void recvTrans(PmeTransMsg *);
00454 void recvUntrans(PmeUntransMsg *);
00455 void registerXPencil(CkArrayIndex3D, PmeXPencil *);
00456 void registerYPencil(CkArrayIndex3D, PmeYPencil *);
00457 void registerZPencil(CkArrayIndex3D, PmeZPencil *);
00458 void recvXTrans(PmeTransMsg *);
00459 void recvYTrans(PmeTransMsg *);
00460 void recvYUntrans(PmeUntransMsg *);
00461 void recvZGrid(PmeGridMsg *);
00462 void recvZUntrans(PmeUntransMsg *);
00463
00464 void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
00465 xm=_xm; ym=_ym; zm=_zm;
00466 }
00467 CProxy_PmePencilMap xm;
00468 CProxy_PmePencilMap ym;
00469 CProxy_PmePencilMap zm;
00470
00471 private:
00472 CProxy_ComputePmeMgr mgrProxy;
00473 ComputePmeMgr *mgrObject;
00474 ComputePmeMgr **mgrObjects;
00475 CProxy_PmeXPencil xPencil;
00476 CProxy_PmeYPencil yPencil;
00477 CProxy_PmeZPencil zPencil;
00478 CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
00479 CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
00480 CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
00481 };
00482
00483 NodePmeMgr::NodePmeMgr() {
00484 mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
00485 }
00486
00487 NodePmeMgr::~NodePmeMgr() {
00488 delete [] mgrObjects;
00489 }
00490
00491 void NodePmeMgr::initialize() {
00492 CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
00493 mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
00494 if ( CkMyRank() == 0 ) {
00495 mgrProxy = proxy;
00496 mgrObject = proxy.ckLocalBranch();
00497 }
00498 }
00499
00500 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
00501 mgrObject->fwdSharedTrans(msg);
00502 }
00503
00504 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
00505 mgrObject->fwdSharedUntrans(msg);
00506 }
00507
00508 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
00509 {
00510 CmiLock(ComputePmeMgr::fftw_plan_lock);
00511 xPencilObj.put(idx)=obj;
00512 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00513 }
00514 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
00515 {
00516 CmiLock(ComputePmeMgr::fftw_plan_lock);
00517 yPencilObj.put(idx)=obj;
00518 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00519 }
00520 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
00521 {
00522 CmiLock(ComputePmeMgr::fftw_plan_lock);
00523 zPencilObj.put(idx)=obj;
00524 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
00525 }
00526
00527 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
00528 pmeProxyDir(thisgroup), pmeCompute(0) {
00529
00530 CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00531 pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00532
00533 pmeNodeProxy.ckLocalBranch()->initialize();
00534
00535 #ifdef NAMD_FFTW
00536 if ( CmiMyRank() == 0 ) {
00537 fftw_plan_lock = CmiCreateLock();
00538 }
00539 #endif
00540
00541 myKSpace = 0;
00542 kgrid = 0;
00543 work = 0;
00544 grid_count = 0;
00545 trans_count = 0;
00546 untrans_count = 0;
00547 ungrid_count = 0;
00548 gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00549 useBarrier = 0;
00550 sendTransBarrier_received = 0;
00551 usePencils = 0;
00552 }
00553
00554
00555 void ComputePmeMgr::recvArrays(
00556 CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
00557 xPencil = x; yPencil = y; zPencil = z;
00558
00559 if(CmiMyRank()==0)
00560 {
00561 pmeNodeProxy.ckLocalBranch()->xPencil=x;
00562 pmeNodeProxy.ckLocalBranch()->yPencil=y;
00563 pmeNodeProxy.ckLocalBranch()->zPencil=z;
00564 }
00565 }
00566
00567 #if USE_TOPO_SFC
00568 struct Coord
00569 {
00570 int x, y, z;
00571 Coord(): x(0), y(0), z(0) {}
00572 Coord(int a, int b, int c): x(a), y(b), z(c) {}
00573 };
00574 extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
00575
00576 void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
00577 {
00578 SortableResizeArray<int> newprocs(procs.size());
00579 int num = 0;
00580 for (int i=0; i<result.size(); i++) {
00581 Coord &c = result[i];
00582 for (int j=0; j<procs.size(); j++) {
00583 int pe = procs[j];
00584 int x,y,z,t;
00585 tmgr.rankToCoordinates(pe, x, y, z, t);
00586 if (x==c.x && y==c.y && z==c.z)
00587 newprocs[num++] = pe;
00588 }
00589 }
00590 CmiAssert(newprocs.size() == procs.size());
00591 procs = newprocs;
00592 }
00593
00594 int find_level_grid(int x)
00595 {
00596 int a = sqrt(x);
00597 int b;
00598 for (; a>0; a--) {
00599 if (x%a == 0) break;
00600 }
00601 if (a==1) a = x;
00602 b = x/a;
00603
00604 return b;
00605 }
00606 CmiNodeLock tmgr_lock;
00607 #endif
00608
00609 void Pme_init()
00610 {
00611 #if USE_TOPO_SFC
00612 if (CkMyRank() == 0)
00613 tmgr_lock = CmiCreateLock();
00614 #endif
00615 }
00616
00617 void ComputePmeMgr::initialize(CkQdMsg *msg) {
00618 delete msg;
00619
00620 localInfo = new LocalPmeInfo[CkNumPes()];
00621 gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00622 transNodeInfo = new NodePmeInfo[CkNumNodes()];
00623 gridPeMap = new int[CkNumPes()];
00624 transPeMap = new int[CkNumPes()];
00625 recipPeDest = new int[CkNumPes()];
00626 gridPeOrder = new int[CkNumPes()];
00627 gridNodeOrder = new int[CkNumNodes()];
00628 transNodeOrder = new int[CkNumNodes()];
00629 isPmeFlag = new char[CkNumPes()];
00630
00631 SimParameters *simParams = Node::Object()->simParameters;
00632 PatchMap *patchMap = PatchMap::Object();
00633
00634 alchFepOn = simParams->alchFepOn;
00635 alchThermIntOn = simParams->alchThermIntOn;
00636 alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
00637 alchElecLambdaStart = (alchFepOn || alchThermIntOn) ?
00638 simParams->alchElecLambdaStart : 0;
00639 if (alchFepOn || alchThermIntOn) {
00640 numGrids = 2;
00641 if (alchDecouple) numGrids += 2;
00642 if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00643 }
00644 else numGrids = 1;
00645 lesOn = simParams->lesOn;
00646 useBarrier = simParams->PMEBarrier;
00647 if ( lesOn ) {
00648 lesFactor = simParams->lesFactor;
00649 numGrids = lesFactor;
00650 }
00651 selfOn = 0;
00652 pairOn = simParams->pairInteractionOn;
00653 if ( pairOn ) {
00654 selfOn = simParams->pairInteractionSelf;
00655 if ( selfOn ) pairOn = 0;
00656 numGrids = selfOn ? 1 : 3;
00657 }
00658
00659 if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00660 else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00661 else {
00662 int nrps = simParams->PMEProcessors;
00663 if ( nrps <= 0 ) nrps = CkNumPes();
00664 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00665 int dimx = simParams->PMEGridSizeX;
00666 int dimy = simParams->PMEGridSizeY;
00667 int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00668 if ( maxslabs > nrps ) maxslabs = nrps;
00669 int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00670 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00671 if ( maxpencils > nrps ) maxpencils = nrps;
00672 if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00673 else usePencils = 0;
00674 }
00675
00676 if ( usePencils ) {
00677 int nrps = simParams->PMEProcessors;
00678 if ( nrps <= 0 ) nrps = CkNumPes();
00679 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00680 if ( simParams->PMEPencils > 1 &&
00681 simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00682 xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00683 } else {
00684 int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00685 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00686 if ( nb2 > nrps ) nb2 = nrps;
00687 int nb = (int) sqrt((float)nb2);
00688 xBlocks = zBlocks = nb;
00689 yBlocks = nb2 / nb;
00690 }
00691
00692 int dimx = simParams->PMEGridSizeX;
00693 int bx = 1 + ( dimx - 1 ) / xBlocks;
00694 xBlocks = 1 + ( dimx - 1 ) / bx;
00695
00696 int dimy = simParams->PMEGridSizeY;
00697 int by = 1 + ( dimy - 1 ) / yBlocks;
00698 yBlocks = 1 + ( dimy - 1 ) / by;
00699
00700 int dimz = simParams->PMEGridSizeZ / 2 + 1;
00701 int bz = 1 + ( dimz - 1 ) / zBlocks;
00702 zBlocks = 1 + ( dimz - 1 ) / bz;
00703
00704 if ( ! CkMyPe() ) {
00705 iout << iINFO << "PME using " << xBlocks << " x " <<
00706 yBlocks << " x " << zBlocks <<
00707 " pencil grid for FFT and reciprocal sum.\n" << endi;
00708 }
00709 } else {
00710
00711 {
00712
00713
00714 int minslices = simParams->PMEMinSlices;
00715 int dimx = simParams->PMEGridSizeX;
00716 int nrpx = ( dimx + minslices - 1 ) / minslices;
00717 int dimy = simParams->PMEGridSizeY;
00718 int nrpy = ( dimy + minslices - 1 ) / minslices;
00719
00720
00721 int nrpp = CkNumPes();
00722
00723 if ( nrpp < nrpx ) nrpx = nrpp;
00724 if ( nrpp < nrpy ) nrpy = nrpp;
00725
00726
00727 int nrps = simParams->PMEProcessors;
00728 if ( nrps > CkNumPes() ) nrps = CkNumPes();
00729 if ( nrps > 0 ) nrpx = nrps;
00730 if ( nrps > 0 ) nrpy = nrps;
00731
00732
00733 int bx = ( dimx + nrpx - 1 ) / nrpx;
00734 nrpx = ( dimx + bx - 1 ) / bx;
00735 int by = ( dimy + nrpy - 1 ) / nrpy;
00736 nrpy = ( dimy + by - 1 ) / by;
00737 if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00738 NAMD_bug("Error in selecting number of PME processors.");
00739 if ( by != ( dimy + nrpy - 1 ) / nrpy )
00740 NAMD_bug("Error in selecting number of PME processors.");
00741
00742 numGridPes = nrpx;
00743 numTransPes = nrpy;
00744 }
00745 if ( ! CkMyPe() ) {
00746 iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
00747 " processors for FFT and reciprocal sum.\n" << endi;
00748 }
00749
00750 int sum_npes = numTransPes + numGridPes;
00751 int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
00752
00753 #if 0 // USE_TOPOMAP
00754
00755 PatchMap * pmap = PatchMap::Object();
00756
00757 int patch_pes = pmap->numNodesWithPatches();
00758 TopoManager tmgr;
00759 if(tmgr.hasMultipleProcsPerNode())
00760 patch_pes *= 2;
00761
00762 bool done = false;
00763 if(CkNumPes() > 2*sum_npes + patch_pes) {
00764 done = generateBGLORBPmePeList(transPeMap, numTransPes);
00765 done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
00766 }
00767 else
00768 if(CkNumPes() > 2 *max_npes + patch_pes) {
00769 done = generateBGLORBPmePeList(transPeMap, max_npes);
00770 gridPeMap = transPeMap;
00771 }
00772
00773 if (!done)
00774 #endif
00775 {
00776
00777
00778 generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
00779 }
00780
00781 myGridPe = -1;
00782 myGridNode = -1;
00783 int i = 0;
00784 for ( i=0; i<CkNumPes(); ++i )
00785 isPmeFlag[i] = 0;
00786 int node = -1;
00787 int real_node = -1;
00788 for ( i=0; i<numGridPes; ++i ) {
00789 if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
00790 isPmeFlag[gridPeMap[i]] |= 1;
00791 int real_node_i = CkNodeOf(gridPeMap[i]);
00792 if ( real_node_i == real_node ) {
00793 gridNodeInfo[node].npe += 1;
00794 } else {
00795 real_node = real_node_i;
00796 ++node;
00797 gridNodeInfo[node].real_node = real_node;
00798 gridNodeInfo[node].pe_start = i;
00799 gridNodeInfo[node].npe = 1;
00800 }
00801 if ( CkMyNode() == real_node_i ) myGridNode = node;
00802 }
00803 numGridNodes = node + 1;
00804 myTransPe = -1;
00805 myTransNode = -1;
00806 node = -1;
00807 real_node = -1;
00808 for ( i=0; i<numTransPes; ++i ) {
00809 if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
00810 isPmeFlag[transPeMap[i]] |= 2;
00811 int real_node_i = CkNodeOf(transPeMap[i]);
00812 if ( real_node_i == real_node ) {
00813 transNodeInfo[node].npe += 1;
00814 } else {
00815 real_node = real_node_i;
00816 ++node;
00817 transNodeInfo[node].real_node = real_node;
00818 transNodeInfo[node].pe_start = i;
00819 transNodeInfo[node].npe = 1;
00820 }
00821 if ( CkMyNode() == real_node_i ) myTransNode = node;
00822 }
00823 numTransNodes = node + 1;
00824
00825 if ( ! CkMyPe() ) {
00826 iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
00827 << numTransNodes << " TRANS NODES\n" << endi;
00828 iout << iINFO << "PME GRID LOCATIONS:";
00829 int i;
00830 for ( i=0; i<numGridPes && i<10; ++i ) {
00831 iout << " " << gridPeMap[i];
00832 }
00833 if ( i < numGridPes ) iout << " ...";
00834 iout << "\n" << endi;
00835 iout << iINFO << "PME TRANS LOCATIONS:";
00836 for ( i=0; i<numTransPes && i<10; ++i ) {
00837 iout << " " << transPeMap[i];
00838 }
00839 if ( i < numTransPes ) iout << " ...";
00840 iout << "\n" << endi;
00841 }
00842
00843 {
00844 int i;
00845 for ( i = 0; i < numGridPes; ++i ) {
00846 gridPeOrder[i] = i;
00847 }
00848 Random rand(CkMyPe());
00849 if ( myGridPe < 0 ) {
00850 rand.reorder(gridPeOrder,numGridPes);
00851 } else {
00852 gridPeOrder[myGridPe] = numGridPes-1;
00853 gridPeOrder[numGridPes-1] = myGridPe;
00854 rand.reorder(gridPeOrder,numGridPes-1);
00855 }
00856 for ( i = 0; i < numGridNodes; ++i ) {
00857 gridNodeOrder[i] = i;
00858 }
00859 if ( myGridNode < 0 ) {
00860 rand.reorder(gridNodeOrder,numGridNodes);
00861 } else {
00862 gridNodeOrder[myGridNode] = numGridNodes-1;
00863 gridNodeOrder[numGridNodes-1] = myGridNode;
00864 rand.reorder(gridNodeOrder,numGridNodes-1);
00865 }
00866 for ( i = 0; i < numTransNodes; ++i ) {
00867 transNodeOrder[i] = i;
00868 }
00869 if ( myTransNode < 0 ) {
00870 rand.reorder(transNodeOrder,numTransNodes);
00871 } else {
00872 transNodeOrder[myTransNode] = numTransNodes-1;
00873 transNodeOrder[numTransNodes-1] = myTransNode;
00874 rand.reorder(transNodeOrder,numTransNodes-1);
00875 }
00876 }
00877
00878 }
00879
00880 myGrid.K1 = simParams->PMEGridSizeX;
00881 myGrid.K2 = simParams->PMEGridSizeY;
00882 myGrid.K3 = simParams->PMEGridSizeZ;
00883 myGrid.order = simParams->PMEInterpOrder;
00884 myGrid.dim2 = myGrid.K2;
00885 myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
00886
00887 if ( ! usePencils ) {
00888 myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
00889 myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
00890 myGrid.block3 = myGrid.dim3 / 2;
00891 }
00892
00893 if ( usePencils ) {
00894 myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
00895 myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
00896 myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;
00897
00898
00899 int pe = 0;
00900 int x,y,z;
00901
00902 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
00903 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
00904 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
00905 {
00906 int basepe = 0; int npe = CkNumPes();
00907 if ( npe > xBlocks*yBlocks &&
00908 npe > xBlocks*zBlocks &&
00909 npe > yBlocks*zBlocks ) {
00910
00911 ++basepe;
00912 --npe;
00913 }
00914
00915
00916
00917 int i;
00918 int ncpus = CkNumPes();
00919
00920
00921 int npow2 = 1; int nbits = 0;
00922 while ( npow2 < ncpus ) { npow2 *= 2; nbits += 1; }
00923
00924
00925 SortableResizeArray<int> patches, nopatches, pmeprocs;
00926 PatchMap *pmap = PatchMap::Object();
00927 i = 0;
00928 for ( int icpu=0; icpu<ncpus; ++icpu ) {
00929 int ri;
00930 for ( ri = ncpus; ri >= ncpus; ++i ) {
00931 ri = 0;
00932 int pow2 = 1;
00933 int rpow2 = npow2 / 2;
00934 for ( int j=0; j<nbits; ++j ) {
00935 ri += rpow2 * ( ( i / pow2 ) % 2 );
00936 pow2 *= 2; rpow2 /= 2;
00937 }
00938 }
00939
00940 if ( ri ) {
00941 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
00942 else nopatches.add(ri);
00943 }
00944 }
00945
00946 #if USE_RANDOM_TOPO
00947 Random rand(CkMyPe());
00948 int *tmp = new int[patches.size()];
00949 int nn = patches.size();
00950 for (i=0;i<nn;i++) tmp[i] = patches[i];
00951 rand.reorder(tmp, nn);
00952 patches.resize(0);
00953 for (i=0;i<nn;i++) patches.add(tmp[i]);
00954 delete [] tmp;
00955 tmp = new int[nopatches.size()];
00956 nn = nopatches.size();
00957 for (i=0;i<nn;i++) tmp[i] = nopatches[i];
00958 rand.reorder(tmp, nn);
00959 nopatches.resize(0);
00960 for (i=0;i<nn;i++) nopatches.add(tmp[i]);
00961 delete [] tmp;
00962 #endif
00963
00964
00965 int useZero = 0;
00966 int npens = xBlocks*yBlocks;
00967 if ( npens % ncpus == 0 ) useZero = 1;
00968 if ( npens == nopatches.size() + 1 ) useZero = 1;
00969 npens += xBlocks*zBlocks;
00970 if ( npens % ncpus == 0 ) useZero = 1;
00971 if ( npens == nopatches.size() + 1 ) useZero = 1;
00972 npens += yBlocks*zBlocks;
00973 if ( npens % ncpus == 0 ) useZero = 1;
00974 if ( npens == nopatches.size() + 1 ) useZero = 1;
00975
00976
00977 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
00978 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00979 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
00980 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
00981
00982 int npes = pmeprocs.size();
00983 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
00984 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
00985 #if !USE_RANDOM_TOPO
00986 zprocs.sort();
00987 #endif
00988 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
00989 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
00990 #if !USE_RANDOM_TOPO
00991 yprocs.sort();
00992 #endif
00993 for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
00994 if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
00995 #if !USE_RANDOM_TOPO
00996 xprocs.sort();
00997 #endif
00998
00999 #if USE_TOPO_SFC
01000 CmiLock(tmgr_lock);
01001
01002 TopoManager tmgr;
01003 int xdim = tmgr.getDimNX();
01004 int ydim = tmgr.getDimNY();
01005 int zdim = tmgr.getDimNZ();
01006 int xdim1 = find_level_grid(xdim);
01007 int ydim1 = find_level_grid(ydim);
01008 int zdim1 = find_level_grid(zdim);
01009 if(CkMyPe() == 0)
01010 printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01011
01012 vector<Coord> result;
01013 SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01014 sort_sfc(xprocs, tmgr, result);
01015 sort_sfc(yprocs, tmgr, result);
01016 sort_sfc(zprocs, tmgr, result);
01017
01018 CmiUnlock(tmgr_lock);
01019 #endif
01020
01021 pencilPMEProcessors = new char [CkNumPes()];
01022 memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
01023
01024
01025 if(CkMyPe() == 0){
01026 iout << iINFO << "PME Z PENCIL LOCATIONS:";
01027 for ( i=0; i<zprocs.size() && i<10; ++i ) {
01028 #if USE_TOPO_SFC
01029 int x,y,z,t;
01030 tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01031 iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01032 #else
01033 iout << " " << zprocs[i];
01034 #endif
01035 }
01036 if ( i < zprocs.size() ) iout << " ...";
01037 iout << "\n" << endi;
01038 }
01039
01040 for (pe=0, x = 0; x < xBlocks; ++x)
01041 for (y = 0; y < yBlocks; ++y, ++pe ) {
01042 pencilPMEProcessors[zprocs[pe]] = 1;
01043 }
01044
01045 if(CkMyPe() == 0){
01046 iout << iINFO << "PME Y PENCIL LOCATIONS:";
01047 for ( i=0; i<yprocs.size() && i<10; ++i ) {
01048 #if USE_TOPO_SFC
01049 int x,y,z,t;
01050 tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01051 iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01052 #else
01053 iout << " " << yprocs[i];
01054 #endif
01055 }
01056 if ( i < yprocs.size() ) iout << " ...";
01057 iout << "\n" << endi;
01058 }
01059
01060 for (pe=0, z = 0; z < zBlocks; ++z )
01061 for (x = 0; x < xBlocks; ++x, ++pe ) {
01062 pencilPMEProcessors[yprocs[pe]] = 1;
01063 }
01064
01065 if(CkMyPe() == 0){
01066 iout << iINFO << "PME X PENCIL LOCATIONS:";
01067 for ( i=0; i<xprocs.size() && i<10; ++i ) {
01068 #if USE_TOPO_SFC
01069 int x,y,z,t;
01070 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01071 iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
01072 #else
01073 iout << " " << xprocs[i];
01074 #endif
01075 }
01076 if ( i < xprocs.size() ) iout << " ...";
01077 iout << "\n" << endi;
01078 }
01079
01080 for (pe=0, y = 0; y < yBlocks; ++y )
01081 for (z = 0; z < zBlocks; ++z, ++pe ) {
01082 pencilPMEProcessors[xprocs[pe]] = 1;
01083 }
01084
01085 }
01086
01087
01088
01089 if ( CkMyPe() == 0 ){
01090 #if 1
01091 CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01092 CProxy_PmePencilMap ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());
01093 CProxy_PmePencilMap xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());
01094 pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01095 CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
01096 CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
01097 CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
01098 #if CHARM_VERSION > 60301
01099 zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
01100 yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
01101 xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
01102 #endif
01103 zPencil = CProxy_PmeZPencil::ckNew(zo);
01104 yPencil = CProxy_PmeYPencil::ckNew(yo);
01105 xPencil = CProxy_PmeXPencil::ckNew(xo);
01106 #else
01107 zPencil = CProxy_PmeZPencil::ckNew();
01108 yPencil = CProxy_PmeYPencil::ckNew();
01109 xPencil = CProxy_PmeXPencil::ckNew();
01110
01111 for (pe=0, x = 0; x < xBlocks; ++x)
01112 for (y = 0; y < yBlocks; ++y, ++pe ) {
01113 zPencil(x,y,0).insert(zprocs[pe]);
01114 }
01115 zPencil.doneInserting();
01116
01117 for (pe=0, z = 0; z < zBlocks; ++z )
01118 for (x = 0; x < xBlocks; ++x, ++pe ) {
01119 yPencil(x,0,z).insert(yprocs[pe]);
01120 }
01121 yPencil.doneInserting();
01122
01123
01124 for (pe=0, y = 0; y < yBlocks; ++y )
01125 for (z = 0; z < zBlocks; ++z, ++pe ) {
01126 xPencil(0,y,z).insert(xprocs[pe]);
01127 }
01128 xPencil.doneInserting();
01129 #endif
01130
01131 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01132 PmePencilInitMsgData msgdata;
01133 msgdata.grid = myGrid;
01134 msgdata.xBlocks = xBlocks;
01135 msgdata.yBlocks = yBlocks;
01136 msgdata.zBlocks = zBlocks;
01137 msgdata.xPencil = xPencil;
01138 msgdata.yPencil = yPencil;
01139 msgdata.zPencil = zPencil;
01140 msgdata.pmeProxy = pmeProxyDir;
01141 msgdata.pmeNodeProxy = pmeNodeProxy;
01142 msgdata.xm = xm;
01143 msgdata.ym = ym;
01144 msgdata.zm = zm;
01145 xPencil.init(new PmePencilInitMsg(msgdata));
01146 yPencil.init(new PmePencilInitMsg(msgdata));
01147 zPencil.init(new PmePencilInitMsg(msgdata));
01148 }
01149
01150 return;
01151 }
01152
01153
01154 int pe;
01155 int nx = 0;
01156 for ( pe = 0; pe < numGridPes; ++pe ) {
01157 localInfo[pe].x_start = nx;
01158 nx += myGrid.block1;
01159 if ( nx > myGrid.K1 ) nx = myGrid.K1;
01160 localInfo[pe].nx = nx - localInfo[pe].x_start;
01161 }
01162 int ny = 0;
01163 for ( pe = 0; pe < numTransPes; ++pe ) {
01164 localInfo[pe].y_start_after_transpose = ny;
01165 ny += myGrid.block2;
01166 if ( ny > myGrid.K2 ) ny = myGrid.K2;
01167 localInfo[pe].ny_after_transpose =
01168 ny - localInfo[pe].y_start_after_transpose;
01169 }
01170
01171 {
01172
01173 PatchMap *patchMap = PatchMap::Object();
01174 Lattice lattice = simParams->lattice;
01175 BigReal sysdima = lattice.a_r().unit() * lattice.a();
01176 BigReal cutoff = simParams->cutoff;
01177 BigReal patchdim = simParams->patchDimension;
01178 int numPatches = patchMap->numPatches();
01179 int numNodes = CkNumPes();
01180 int *source_flags = new int[numNodes];
01181 int node;
01182 for ( node=0; node<numNodes; ++node ) {
01183 source_flags[node] = 0;
01184 recipPeDest[node] = 0;
01185 }
01186
01187
01188
01189
01190
01191
01192
01193 for ( int pid=0; pid < numPatches; ++pid ) {
01194 int pnode = patchMap->node(pid);
01195 BigReal minx = patchMap->min_a(pid);
01196 BigReal maxx = patchMap->max_a(pid);
01197 BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01198
01199 int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
01200 int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
01201 for ( int i=min1; i<=max1; ++i ) {
01202 int ix = i;
01203 while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01204 while ( ix < 0 ) ix += myGrid.K1;
01205
01206 if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01207 ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01208 source_flags[pnode] = 1;
01209 }
01210
01211 if ( pnode == CkMyPe() ) {
01212 recipPeDest[ix / myGrid.block1] = 1;
01213 }
01214 }
01215 }
01216
01217 numSources = 0;
01218 numDestRecipPes = 0;
01219 for ( node=0; node<numNodes; ++node ) {
01220 if ( source_flags[node] ) ++numSources;
01221 if ( recipPeDest[node] ) ++numDestRecipPes;
01222 }
01223
01224 #if 0
01225 if ( numSources ) {
01226 iout << iINFO << "PME " << CkMyPe() << " sources:";
01227 for ( node=0; node<numNodes; ++node ) {
01228 if ( source_flags[node] ) iout << " " << node;
01229 }
01230 iout << "\n" << endi;
01231 }
01232 #endif
01233
01234 delete [] source_flags;
01235
01236
01237
01238
01239 }
01240
01241 ungrid_count = numDestRecipPes;
01242
01243 sendTransBarrier_received = 0;
01244
01245 if ( myGridPe < 0 && myTransPe < 0 ) return;
01246
01247
01248 if ( myTransPe >= 0 ) {
01249 int k2_start = localInfo[myTransPe].y_start_after_transpose;
01250 int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01251 #ifdef OPENATOM_VERSION
01252 if ( simParams->openatomOn ) {
01253 CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01254 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01255 } else {
01256 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01257 }
01258 #else // OPENATOM_VERSION
01259 myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01260 #endif // OPENATOM_VERSION
01261 }
01262
01263 int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01264 int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01265 if ( local_size < local_size_2 ) local_size = local_size_2;
01266 qgrid = new float[local_size*numGrids];
01267 if ( numGridPes > 1 || numTransPes > 1 ) {
01268 kgrid = new float[local_size*numGrids];
01269 } else {
01270 kgrid = qgrid;
01271 }
01272 qgrid_size = local_size;
01273
01274 if ( myGridPe >= 0 ) {
01275 qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01276 qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01277 fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01278 fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01279 }
01280
01281 int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01282 #ifdef NAMD_FFTW
01283 CmiLock(fftw_plan_lock);
01284 #ifdef NAMD_FFTW_3
01285 work = new fftwf_complex[n[0]];
01286 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
01287 if ( myGridPe >= 0 ) {
01288 forward_plan_yz=new fftwf_plan[numGrids];
01289 backward_plan_yz=new fftwf_plan[numGrids];
01290 }
01291 if ( myTransPe >= 0 ) {
01292 forward_plan_x=new fftwf_plan[numGrids];
01293 backward_plan_x=new fftwf_plan[numGrids];
01294 }
01295
01296 if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
01297 if ( myGridPe >= 0 ) {
01298 for( int g=0; g<numGrids; g++)
01299 {
01300 forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
01301 localInfo[myGridPe].nx,
01302 qgrid + qgrid_size * g,
01303 NULL,
01304 1,
01305 myGrid.dim2 * myGrid.dim3,
01306 (fftwf_complex *)
01307 qgrid + qgrid_size * g,
01308 NULL,
01309 1,
01310 myGrid.dim2 * (myGrid.dim3/2),
01311 fftwFlags);
01312 }
01313 }
01314 int zdim = myGrid.dim3;
01315 int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01316 if ( ! CkMyPe() ) iout << " 2..." << endi;
01317 if ( myTransPe >= 0 ) {
01318 for( int g=0; g<numGrids; g++)
01319 {
01320
01321 forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01322 (fftwf_complex *)
01323 (kgrid+qgrid_size*g),
01324 NULL,
01325 xStride,
01326 1,
01327 (fftwf_complex *)
01328 (kgrid+qgrid_size*g),
01329 NULL,
01330 xStride,
01331 1,
01332 FFTW_FORWARD,fftwFlags);
01333
01334 }
01335 }
01336 if ( ! CkMyPe() ) iout << " 3..." << endi;
01337 if ( myTransPe >= 0 ) {
01338 for( int g=0; g<numGrids; g++)
01339 {
01340 backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01341 (fftwf_complex *)
01342 (kgrid+qgrid_size*g),
01343 NULL,
01344 xStride,
01345 1,
01346 (fftwf_complex *)
01347 (kgrid+qgrid_size*g),
01348 NULL,
01349 xStride,
01350 1,
01351 FFTW_BACKWARD, fftwFlags);
01352
01353 }
01354 }
01355 if ( ! CkMyPe() ) iout << " 4..." << endi;
01356 if ( myGridPe >= 0 ) {
01357 for( int g=0; g<numGrids; g++)
01358 {
01359 backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
01360 localInfo[myGridPe].nx,
01361 (fftwf_complex *)
01362 qgrid + qgrid_size * g,
01363 NULL,
01364 1,
01365 myGrid.dim2*(myGrid.dim3/2),
01366 qgrid + qgrid_size * g,
01367 NULL,
01368 1,
01369 myGrid.dim2 * myGrid.dim3,
01370 fftwFlags);
01371 }
01372 }
01373 if ( ! CkMyPe() ) iout << " Done.\n" << endi;
01374
01375 #else
01376 work = new fftw_complex[n[0]];
01377
01378 if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
01379 if ( myGridPe >= 0 ) {
01380 forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01381 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01382 | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01383 }
01384 if ( ! CkMyPe() ) iout << " 2..." << endi;
01385 if ( myTransPe >= 0 ) {
01386 forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01387 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01388 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01389 localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01390 }
01391 if ( ! CkMyPe() ) iout << " 3..." << endi;
01392 if ( myTransPe >= 0 ) {
01393 backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01394 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01395 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01396 localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01397 }
01398 if ( ! CkMyPe() ) iout << " 4..." << endi;
01399 if ( myGridPe >= 0 ) {
01400 backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01401 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01402 | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01403 }
01404 if ( ! CkMyPe() ) iout << " Done.\n" << endi;
01405 #endif
01406 CmiUnlock(fftw_plan_lock);
01407 #else
01408 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01409 #endif
01410
01411 if ( myGridPe >= 0 && numSources == 0 )
01412 NAMD_bug("PME grid elements exist without sources.");
01413 grid_count = numSources;
01414 memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01415 trans_count = numGridPes;
01416 }
01417
01418
01419
01420 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
01421 delete msg;
01422 if ( ! usePencils ) return;
01423
01424 SimParameters *simParams = Node::Object()->simParameters;
01425
01426 PatchMap *patchMap = PatchMap::Object();
01427 Lattice lattice = simParams->lattice;
01428 BigReal sysdima = lattice.a_r().unit() * lattice.a();
01429 BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01430 BigReal cutoff = simParams->cutoff;
01431 BigReal patchdim = simParams->patchDimension;
01432 int numPatches = patchMap->numPatches();
01433
01434 pencilActive = new char[xBlocks*yBlocks];
01435 for ( int i=0; i<xBlocks; ++i ) {
01436 for ( int j=0; j<yBlocks; ++j ) {
01437 pencilActive[i*yBlocks+j] = 0;
01438 }
01439 }
01440
01441 for ( int pid=0; pid < numPatches; ++pid ) {
01442 int pnode = patchMap->node(pid);
01443 if ( pnode != CkMyPe() ) continue;
01444
01445 BigReal minx = patchMap->min_a(pid);
01446 BigReal maxx = patchMap->max_a(pid);
01447 BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01448
01449 int min1 = ((int) floor(myGrid.K1 * (minx - margina))) - myGrid.order + 1;
01450 int max1 = ((int) floor(myGrid.K1 * (maxx + margina)));
01451
01452 BigReal miny = patchMap->min_b(pid);
01453 BigReal maxy = patchMap->max_b(pid);
01454 BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01455
01456 int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) - myGrid.order + 1;
01457 int max2 = ((int) floor(myGrid.K2 * (maxy + marginb)));
01458
01459 for ( int i=min1; i<=max1; ++i ) {
01460 int ix = i;
01461 while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01462 while ( ix < 0 ) ix += myGrid.K1;
01463 for ( int j=min2; j<=max2; ++j ) {
01464 int jy = j;
01465 while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01466 while ( jy < 0 ) jy += myGrid.K2;
01467 pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01468 }
01469 }
01470 }
01471
01472 numPencilsActive = 0;
01473 for ( int i=0; i<xBlocks; ++i ) {
01474 for ( int j=0; j<yBlocks; ++j ) {
01475 if ( pencilActive[i*yBlocks+j] ) {
01476 ++numPencilsActive;
01477 zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01478 }
01479 }
01480 }
01481 activePencils = new ijpair[numPencilsActive];
01482 numPencilsActive = 0;
01483 for ( int i=0; i<xBlocks; ++i ) {
01484 for ( int j=0; j<yBlocks; ++j ) {
01485 if ( pencilActive[i*yBlocks+j] ) {
01486 activePencils[numPencilsActive++] = ijpair(i,j);
01487 }
01488 }
01489 }
01490 Random rand(CkMyPe());
01491 rand.reorder(activePencils,numPencilsActive);
01492
01493
01494
01495
01496 ungrid_count = numPencilsActive;
01497 }
01498
01499
01500 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
01501 if ( ! usePencils ) return;
01502 if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01503 }
01504
01505
01506 ComputePmeMgr::~ComputePmeMgr() {
01507
01508 #ifdef NAMD_FFTW
01509 if ( CmiMyRank() == 0 ) {
01510 CmiDestroyLock(fftw_plan_lock);
01511 }
01512 #endif
01513
01514 delete myKSpace;
01515 delete [] localInfo;
01516 delete [] gridNodeInfo;
01517 delete [] transNodeInfo;
01518 delete [] gridPeMap;
01519 delete [] transPeMap;
01520 delete [] recipPeDest;
01521 delete [] gridPeOrder;
01522 delete [] gridNodeOrder;
01523 delete [] transNodeOrder;
01524 delete [] isPmeFlag;
01525 delete [] qgrid;
01526 if ( kgrid != qgrid ) delete [] kgrid;
01527 delete [] work;
01528 delete [] gridmsg_reuse;
01529 }
01530
01531 void ComputePmeMgr::sendGrid(void) {
01532 pmeCompute->sendData(numGridPes,gridPeOrder,recipPeDest,gridPeMap);
01533 }
01534
01535 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
01536
01537 if ( grid_count == 0 ) {
01538 NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01539 }
01540 if ( grid_count == numSources ) {
01541 lattice = msg->lattice;
01542 sequence = msg->sequence;
01543 }
01544
01545 int zdim = myGrid.dim3;
01546 int zlistlen = msg->zlistlen;
01547 int *zlist = msg->zlist;
01548 float *qmsg = msg->qgrid;
01549 for ( int g=0; g<numGrids; ++g ) {
01550 char *f = msg->fgrid + fgrid_len * g;
01551 float *q = qgrid + qgrid_size * g;
01552 for ( int i=0; i<fgrid_len; ++i ) {
01553 if ( f[i] ) {
01554 for ( int k=0; k<zlistlen; ++k ) {
01555 q[zlist[k]] += *(qmsg++);
01556 }
01557 }
01558 q += zdim;
01559 }
01560 }
01561
01562 gridmsg_reuse[numSources-grid_count] = msg;
01563 --grid_count;
01564
01565 if ( grid_count == 0 ) {
01566 pmeProxyDir[CkMyPe()].gridCalc1();
01567 if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01568 }
01569 }
01570 #ifdef MANUAL_DEBUG_FFTW3
01571
01572
01573 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
01574 {
01575
01576 char fmt[1000];
01577 char filename[1000];
01578 strncpy(fmt,infilename,999);
01579 strncat(fmt,"_%d.out",999);
01580 sprintf(filename,fmt, pe);
01581 FILE *loutfile = fopen(filename, "w");
01582 #ifdef PAIRCALC_TEST_DUMP
01583 fprintf(loutfile,"%d\n",ydim);
01584 #endif
01585 fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01586 for(int i=0;i<xdim;i++)
01587 for(int j=0;j<ydim;j++)
01588 for(int k=0;k<zdim;k++)
01589 fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01590 fclose(loutfile);
01591
01592 }
01593
01594 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
01595 {
01596 char fmt[1000];
01597 char filename[1000];
01598 strncpy(fmt,infilename,999);
01599 strncat(fmt,"_%d_%d_%d.out",999);
01600 sprintf(filename,fmt, x,y,z);
01601 FILE *loutfile = fopen(filename, "w");
01602 CkAssert(loutfile!=NULL);
01603 CkPrintf("opened %s for dump\n",filename);
01604 fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
01605 for(int i=0;i<xdim;i++)
01606 for(int j=0;j<ydim;j++)
01607 for(int k=0;k<zdim;k++)
01608 fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
01609 fclose(loutfile);
01610 }
01611
01612 #endif
01613
01614 void ComputePmeMgr::gridCalc1(void) {
01615
01616
01617 #ifdef NAMD_FFTW
01618 for ( int g=0; g<numGrids; ++g ) {
01619 #ifdef NAMD_FFTW_3
01620 fftwf_execute(forward_plan_yz[g]);
01621 #else
01622 rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01623 qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01624 #endif
01625
01626 }
01627 #endif
01628
01629 if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01630 }
01631
01632 void ComputePmeMgr::sendTransBarrier(void) {
01633 sendTransBarrier_received += 1;
01634
01635 if ( sendTransBarrier_received < numGridPes ) return;
01636 sendTransBarrier_received = 0;
01637 for ( int i=0; i<numGridPes; ++i ) {
01638 pmeProxyDir[gridPeMap[i]].sendTrans();
01639 }
01640 }
01641
01642 void ComputePmeMgr::sendTrans(void) {
01643
01644
01645
01646 int zdim = myGrid.dim3;
01647 int nx = localInfo[myGridPe].nx;
01648 int x_start = localInfo[myGridPe].x_start;
01649 int slicelen = myGrid.K2 * zdim;
01650
01651 ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01652
01653 #if CMK_BLUEGENEL
01654 CmiNetworkProgressAfter (0);
01655 #endif
01656
01657 for (int j=0; j<numTransNodes; j++) {
01658 int node = transNodeOrder[j];
01659 int pe = transNodeInfo[node].pe_start;
01660 int npe = transNodeInfo[node].npe;
01661 int totlen = 0;
01662 if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01663 LocalPmeInfo &li = localInfo[pe];
01664 int cpylen = li.ny_after_transpose * zdim;
01665 totlen += cpylen;
01666 }
01667 PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01668 PRIORITY_SIZE) PmeTransMsg;
01669 newmsg->sourceNode = myGridPe;
01670 newmsg->lattice = lattice;
01671 newmsg->x_start = x_start;
01672 newmsg->nx = nx;
01673 for ( int g=0; g<numGrids; ++g ) {
01674 float *qmsg = newmsg->qgrid + nx * totlen * g;
01675 pe = transNodeInfo[node].pe_start;
01676 for (int i=0; i<npe; ++i, ++pe) {
01677 LocalPmeInfo &li = localInfo[pe];
01678 int cpylen = li.ny_after_transpose * zdim;
01679 if ( node == myTransNode ) {
01680 ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01681 qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01682 }
01683 float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01684 for ( int x = 0; x < nx; ++x ) {
01685 CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01686 q += slicelen;
01687 qmsg += cpylen;
01688 }
01689 }
01690 }
01691 newmsg->sequence = sequence;
01692 SET_PRIORITY(newmsg,sequence,PME_TRANS_PRIORITY)
01693 if ( node == myTransNode ) newmsg->nx = 0;
01694 if ( npe > 1 ) {
01695 if ( node == myTransNode ) fwdSharedTrans(newmsg);
01696 else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
01697 } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
01698 }
01699
01700 untrans_count = numTransPes;
01701
01702 }
01703
01704 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
01705
01706 int pe = transNodeInfo[myTransNode].pe_start;
01707 int npe = transNodeInfo[myTransNode].npe;
01708 CmiNodeLock lock = CmiCreateLock();
01709 int *count = new int; *count = npe;
01710 for (int i=0; i<npe; ++i, ++pe) {
01711 PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
01712 SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
01713 shmsg->msg = msg;
01714 shmsg->count = count;
01715 shmsg->lock = lock;
01716 pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
01717 }
01718 }
01719
01720 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
01721 procTrans(msg->msg);
01722 CmiLock(msg->lock);
01723 int count = --(*msg->count);
01724 CmiUnlock(msg->lock);
01725 if ( count == 0 ) {
01726 CmiDestroyLock(msg->lock);
01727 delete msg->count;
01728 delete msg->msg;
01729 }
01730 delete msg;
01731 }
01732
01733 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
01734 procTrans(msg);
01735 delete msg;
01736 }
01737
01738 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
01739
01740 if ( trans_count == numGridPes ) {
01741 lattice = msg->lattice;
01742 sequence = msg->sequence;
01743 }
01744
01745 if ( msg->nx ) {
01746 int zdim = myGrid.dim3;
01747 NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
01748 int first_pe = nodeInfo.pe_start;
01749 int last_pe = first_pe+nodeInfo.npe-1;
01750 int y_skip = localInfo[myTransPe].y_start_after_transpose
01751 - localInfo[first_pe].y_start_after_transpose;
01752 int ny_msg = localInfo[last_pe].y_start_after_transpose
01753 + localInfo[last_pe].ny_after_transpose
01754 - localInfo[first_pe].y_start_after_transpose;
01755 int ny = localInfo[myTransPe].ny_after_transpose;
01756 int x_start = msg->x_start;
01757 int nx = msg->nx;
01758 for ( int g=0; g<numGrids; ++g ) {
01759 CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
01760 (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
01761 nx*ny*zdim*sizeof(float));
01762 }
01763 }
01764
01765 --trans_count;
01766
01767 if ( trans_count == 0 ) {
01768 pmeProxyDir[CkMyPe()].gridCalc2();
01769 }
01770 }
01771
01772 void ComputePmeMgr::gridCalc2(void) {
01773
01774
01775 #if CMK_BLUEGENEL
01776 CmiNetworkProgressAfter (0);
01777 #endif
01778
01779 int zdim = myGrid.dim3;
01780
01781 int ny = localInfo[myTransPe].ny_after_transpose;
01782
01783 for ( int g=0; g<numGrids; ++g ) {
01784
01785 #ifdef NAMD_FFTW
01786 #ifdef NAMD_FFTW_3
01787 fftwf_execute(forward_plan_x[g]);
01788 #else
01789 fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01790 ny * zdim / 2, 1, work, 1, 0);
01791 #endif
01792 #endif
01793 }
01794
01795 #ifdef OPENATOM_VERSION
01796 if ( ! simParams -> openatomOn ) {
01797 #endif // OPENATOM_VERSION
01798 gridCalc2R();
01799 #ifdef OPENATOM_VERSION
01800 } else {
01801 gridCalc2Moa();
01802 }
01803 #endif // OPENATOM_VERSION
01804 }
01805
01806 #ifdef OPENATOM_VERSION
01807 void ComputePmeMgr::gridCalc2Moa(void) {
01808
01809 int zdim = myGrid.dim3;
01810
01811 int ny = localInfo[myTransPe].ny_after_transpose;
01812
01813 SimParameters *simParams = Node::Object()->simParameters;
01814
01815 CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01816
01817 for ( int g=0; g<numGrids; ++g ) {
01818 #ifdef OPENATOM_VERSION_DEBUG
01819 CkPrintf("Sending recQ on processor %d \n", CkMyPe());
01820 for ( int i=0; i<=(ny * zdim / 2); ++i)
01821 {
01822 CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
01823 }
01824 #endif // OPENATOM_VERSION_DEBUG
01825
01826 CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
01827 moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
01828 }
01829 }
01830 #endif // OPENATOM_VERSION
01831
01832 void ComputePmeMgr::gridCalc2R(void) {
01833
01834 int zdim = myGrid.dim3;
01835
01836 int ny = localInfo[myTransPe].ny_after_transpose;
01837
01838 for ( int g=0; g<numGrids; ++g ) {
01839
01840 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
01841 recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
01842 lattice, ewaldcof, &(recip_evir2[g][1]));
01843
01844
01845
01846
01847 #ifdef NAMD_FFTW
01848 #ifdef NAMD_FFTW_3
01849 fftwf_execute(backward_plan_x[g]);
01850 #else
01851 fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
01852 ny * zdim / 2, 1, work, 1, 0);
01853 #endif
01854 #endif
01855 }
01856
01857 pmeProxyDir[CkMyPe()].sendUntrans();
01858 }
01859
01860 void ComputePmeMgr::sendUntrans(void) {
01861
01862 int zdim = myGrid.dim3;
01863 int y_start = localInfo[myTransPe].y_start_after_transpose;
01864 int ny = localInfo[myTransPe].ny_after_transpose;
01865 int slicelen = myGrid.K2 * zdim;
01866
01867 ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01868
01869 #if CMK_BLUEGENEL
01870 CmiNetworkProgressAfter (0);
01871 #endif
01872
01873
01874 for (int j=0; j<numGridNodes; j++) {
01875 int node = gridNodeOrder[j];
01876 int pe = gridNodeInfo[node].pe_start;
01877 int npe = gridNodeInfo[node].npe;
01878 int totlen = 0;
01879 if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
01880 LocalPmeInfo &li = localInfo[pe];
01881 int cpylen = li.nx * zdim;
01882 totlen += cpylen;
01883 }
01884 PmeUntransMsg *newmsg = new (ny * totlen * numGrids,numGrids,
01885 PRIORITY_SIZE) PmeUntransMsg;
01886 newmsg->sourceNode = myTransPe;
01887 newmsg->y_start = y_start;
01888 newmsg->ny = ny;
01889 for ( int g=0; g<numGrids; ++g ) {
01890 if ( j == 0 ) {
01891 newmsg->evir[g] = recip_evir2[g];
01892 } else {
01893 newmsg->evir[g] = 0.;
01894 }
01895 float *qmsg = newmsg->qgrid + ny * totlen * g;
01896 pe = gridNodeInfo[node].pe_start;
01897 for (int i=0; i<npe; ++i, ++pe) {
01898 LocalPmeInfo &li = localInfo[pe];
01899 if ( node == myGridNode ) {
01900 ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
01901 qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
01902 float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
01903 int cpylen = ny * zdim;
01904 for ( int x = 0; x < li.nx; ++x ) {
01905 CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01906 q += cpylen;
01907 qmsg += slicelen;
01908 }
01909 } else {
01910 CmiMemcpy((void*)qmsg,
01911 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
01912 li.nx*ny*zdim*sizeof(float));
01913 qmsg += li.nx*ny*zdim;
01914 }
01915 }
01916 }
01917 SET_PRIORITY(newmsg,sequence,PME_UNTRANS_PRIORITY)
01918 if ( node == myGridNode ) newmsg->ny = 0;
01919 if ( npe > 1 ) {
01920 if ( node == myGridNode ) fwdSharedUntrans(newmsg);
01921 else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
01922 } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
01923 }
01924
01925 trans_count = numGridPes;
01926 }
01927
01928 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
01929 int pe = gridNodeInfo[myGridNode].pe_start;
01930 int npe = gridNodeInfo[myGridNode].npe;
01931 CmiNodeLock lock = CmiCreateLock();
01932 int *count = new int; *count = npe;
01933 for (int i=0; i<npe; ++i, ++pe) {
01934 PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
01935 shmsg->msg = msg;
01936 shmsg->count = count;
01937 shmsg->lock = lock;
01938 pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
01939 }
01940 }
01941
01942 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
01943 procUntrans(msg->msg);
01944 CmiLock(msg->lock);
01945 int count = --(*msg->count);
01946 CmiUnlock(msg->lock);
01947 if ( count == 0 ) {
01948 CmiDestroyLock(msg->lock);
01949 delete msg->count;
01950 delete msg->msg;
01951 }
01952 delete msg;
01953 }
01954
01955 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
01956 procUntrans(msg);
01957 delete msg;
01958 }
01959
01960 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
01961
01962 if ( untrans_count == numTransPes ) {
01963 for ( int g=0; g<numGrids; ++g ) {
01964 recip_evir[g] = 0.;
01965 }
01966 }
01967
01968 #if CMK_BLUEGENEL
01969 CmiNetworkProgressAfter (0);
01970 #endif
01971
01972 NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
01973 int first_pe = nodeInfo.pe_start;
01974 int g;
01975 if ( myGridPe == first_pe ) for ( g=0; g<numGrids; ++g ) {
01976 recip_evir[g] += msg->evir[g];
01977 }
01978
01979 if ( msg->ny ) {
01980 int zdim = myGrid.dim3;
01981 int last_pe = first_pe+nodeInfo.npe-1;
01982 int x_skip = localInfo[myGridPe].x_start
01983 - localInfo[first_pe].x_start;
01984 int nx_msg = localInfo[last_pe].x_start
01985 + localInfo[last_pe].nx
01986 - localInfo[first_pe].x_start;
01987 int nx = localInfo[myGridPe].nx;
01988 int y_start = msg->y_start;
01989 int ny = msg->ny;
01990 int slicelen = myGrid.K2 * zdim;
01991 int cpylen = ny * zdim;
01992 for ( g=0; g<numGrids; ++g ) {
01993 float *q = qgrid + qgrid_size * g + y_start * zdim;
01994 float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
01995 for ( int x = 0; x < nx; ++x ) {
01996 CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
01997 q += slicelen;
01998 qmsg += cpylen;
01999 }
02000 }
02001 }
02002
02003 --untrans_count;
02004
02005 if ( untrans_count == 0 ) {
02006 pmeProxyDir[CkMyPe()].gridCalc3();
02007 }
02008 }
02009
02010 void ComputePmeMgr::gridCalc3(void) {
02011
02012
02013
02014 #ifdef NAMD_FFTW
02015
02016 for ( int g=0; g<numGrids; ++g ) {
02017 #ifdef NAMD_FFTW_3
02018 fftwf_execute(backward_plan_yz[g]);
02019 #else
02020 rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02021 (fftw_complex *) (qgrid + qgrid_size * g),
02022 1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02023 #endif
02024 }
02025
02026 #endif
02027
02028 pmeProxyDir[CkMyPe()].sendUngrid();
02029 }
02030
02031 void ComputePmeMgr::sendUngrid(void) {
02032
02033 for ( int j=0; j<numSources; ++j ) {
02034
02035 PmeGridMsg *newmsg = gridmsg_reuse[j];
02036 int pe = newmsg->sourceNode;
02037 if ( j == 0 ) {
02038 for ( int g=0; g<numGrids; ++g ) {
02039 newmsg->evir[g] = recip_evir[g];
02040 }
02041 } else {
02042 for ( int g=0; g<numGrids; ++g ) {
02043 newmsg->evir[g] = 0.;
02044 }
02045 }
02046 int zdim = myGrid.dim3;
02047 int flen = newmsg->len;
02048 int fstart = newmsg->start;
02049 int zlistlen = newmsg->zlistlen;
02050 int *zlist = newmsg->zlist;
02051 float *qmsg = newmsg->qgrid;
02052 for ( int g=0; g<numGrids; ++g ) {
02053 char *f = newmsg->fgrid + fgrid_len * g;
02054 float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02055 for ( int i=0; i<flen; ++i ) {
02056 if ( f[i] ) {
02057 for ( int k=0; k<zlistlen; ++k ) {
02058 *(qmsg++) = q[zlist[k]];
02059 }
02060 }
02061 q += zdim;
02062 }
02063 }
02064 newmsg->sourceNode = myGridPe;
02065
02066 SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
02067 CmiEnableUrgentSend(1);
02068 pmeProxyDir[pe].recvUngrid(newmsg);
02069 CmiEnableUrgentSend(0);
02070 }
02071 grid_count = numSources;
02072 memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02073 }
02074
02075 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
02076
02077 if ( ungrid_count == 0 ) {
02078 NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02079 }
02080
02081 if ( usePencils ) pmeCompute->copyPencils(msg);
02082 else pmeCompute->copyResults(msg);
02083 delete msg;
02084 recvAck(0);
02085 }
02086
02087 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
02088 if ( msg ) delete msg;
02089 --ungrid_count;
02090
02091 if ( ungrid_count == 0 ) {
02092 pmeProxyDir[CkMyPe()].ungridCalc();
02093 }
02094 }
02095
02096 void ComputePmeMgr::ungridCalc(void) {
02097
02098
02099 pmeCompute->ungridForces();
02100
02101 ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02102 }
02103
02104
02105 ComputePme::ComputePme(ComputeID c) :
02106 ComputeHomePatches(c)
02107 {
02108 DebugM(4,"ComputePme created.\n");
02109 basePriority = PME_PRIORITY;
02110
02111 CProxy_ComputePmeMgr::ckLocalBranch(
02112 CkpvAccess(BOCclass_group).computePmeMgr)->setCompute(this);
02113
02114 useAvgPositions = 1;
02115
02116 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02117
02118 SimParameters *simParams = Node::Object()->simParameters;
02119
02120 if (simParams->accelMDOn) {
02121 amd_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_AMD);
02122 } else {
02123 amd_reduction = NULL;
02124 }
02125
02126 alchFepOn = simParams->alchFepOn;
02127 alchThermIntOn = simParams->alchThermIntOn;
02128 alchDecouple = (alchFepOn || alchThermIntOn) && (simParams->alchDecouple);
02129 alchElecLambdaStart = (alchFepOn || alchThermIntOn) ?
02130 simParams->alchElecLambdaStart : 0;
02131
02132 if (alchFepOn || alchThermIntOn) {
02133 numGrids = 2;
02134 if (alchDecouple) numGrids += 2;
02135 if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
02136 }
02137 else numGrids = 1;
02138 lesOn = simParams->lesOn;
02139 if ( lesOn ) {
02140 lesFactor = simParams->lesFactor;
02141 numGrids = lesFactor;
02142 }
02143 selfOn = 0;
02144 pairOn = simParams->pairInteractionOn;
02145 if ( pairOn ) {
02146 selfOn = simParams->pairInteractionSelf;
02147 if ( selfOn ) pairOn = 0;
02148 numGrids = selfOn ? 1 : 3;
02149 }
02150
02151 myGrid.K1 = simParams->PMEGridSizeX;
02152 myGrid.K2 = simParams->PMEGridSizeY;
02153 myGrid.K3 = simParams->PMEGridSizeZ;
02154 myGrid.order = simParams->PMEInterpOrder;
02155 myGrid.dim2 = myGrid.K2;
02156 myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
02157 qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02158 fsize = myGrid.K1 * myGrid.dim2;
02159 q_arr = new double*[fsize*numGrids];
02160 memset( (void*) q_arr, 0, fsize*numGrids * sizeof(double*) );
02161 q_list = new double*[fsize*numGrids];
02162 memset( (void*) q_list, 0, fsize*numGrids * sizeof(double*) );
02163 q_count = 0;
02164 f_arr = new char[fsize*numGrids];
02165 memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02166 fz_arr = new char[myGrid.K3];
02167
02168 for ( int g=0; g<numGrids; ++g ) {
02169 char *f = f_arr + g*fsize;
02170 if ( myMgr->usePencils ) {
02171 int xBlocks = myMgr->xBlocks;
02172 int yBlocks = myMgr->yBlocks;
02173 int K1 = myGrid.K1;
02174 int K2 = myGrid.K2;
02175 int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02176 int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02177 int dim2 = myGrid.dim2;
02178 const ijpair *activePencils = myMgr->activePencils;
02179 const int numPencilsActive = myMgr->numPencilsActive;
02180 for (int ap=0; ap<numPencilsActive; ++ap) {
02181 int ib = activePencils[ap].i;
02182 int jb = activePencils[ap].j;
02183 int ibegin = ib*block1;
02184 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02185 int jbegin = jb*block2;
02186 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02187 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02188 for ( int i=ibegin; i<iend; ++i ) {
02189 for ( int j=jbegin; j<jend; ++j ) {
02190 f[i*dim2+j] = 0;
02191 }
02192 }
02193 }
02194 } else {
02195 int numRecipPes = myMgr->numGridPes;
02196 int block1 = ( myGrid.K1 + numRecipPes - 1 ) / numRecipPes;
02197 bsize = block1 * myGrid.dim2 * myGrid.dim3;
02198 for (int pe=0; pe<numRecipPes; pe++) {
02199 if ( ! myMgr->recipPeDest[pe] ) continue;
02200 int start = pe * bsize;
02201 int len = bsize;
02202 if ( start >= qsize ) { start = 0; len = 0; }
02203 if ( start + len > qsize ) { len = qsize - start; }
02204 int zdim = myGrid.dim3;
02205 int fstart = start / zdim;
02206 int flen = len / zdim;
02207 memset(f + fstart, 0, flen*sizeof(char));
02208 }
02209 }
02210 }
02211
02212 #if CMK_PERSISTENT_COMM
02213 recvGrid_handle = NULL;
02214 #endif
02215 }
02216
02217 ComputePme::~ComputePme()
02218 {
02219 for (int i=0; i<fsize*numGrids; ++i) {
02220 if ( q_arr[i] ) {
02221 delete [] q_arr[i];
02222 }
02223 }
02224 delete [] q_arr;
02225 delete [] q_list;
02226 delete [] f_arr;
02227 delete [] fz_arr;
02228 }
02229
02230 #if CMK_PERSISTENT_COMM
02231 void ComputePme::setup_recvgrid_persistent()
02232 {
02233 int xBlocks = myMgr->xBlocks;
02234 int yBlocks = myMgr->yBlocks;
02235 int zBlocks = myMgr->zBlocks;
02236 myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02237 myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02238 int K1 = myGrid.K1;
02239 int K2 = myGrid.K2;
02240 int dim2 = myGrid.dim2;
02241 int dim3 = myGrid.dim3;
02242 int block1 = myGrid.block1;
02243 int block2 = myGrid.block2;
02244
02245 const char *pencilActive = myMgr->pencilActive;
02246 const ijpair *activePencils = myMgr->activePencils;
02247 const int numPencilsActive = myMgr->numPencilsActive;
02248 int hd = 1 ;
02249
02250 CkArray *zPencil_local = myMgr->zPencil.ckLocalBranch();
02251 recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
02252 for (int ap=0; ap<numPencilsActive; ++ap) {
02253 int ib = activePencils[ap].i;
02254 int jb = activePencils[ap].j;
02255 int ibegin = ib*block1;
02256 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02257 int jbegin = jb*block2;
02258 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02259 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02260
02261 int fcount = 0;
02262 for ( int g=0; g<numGrids; ++g ) {
02263 char *f = f_arr + g*fsize;
02264 for ( int i=ibegin; i<iend; ++i ) {
02265 for ( int j=jbegin; j<jend; ++j ) {
02266
02267 fcount += (f[i*dim2+j]>3?f[i*dim2+j]:3);
02268 }
02269 }
02270 }
02271 int zlistlen = 0;
02272 for ( int i=0; i<myGrid.K3; ++i ) {
02273 if ( 1 ) ++zlistlen;
02274
02275 }
02276 int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
02277 recvGrid_handle[ap] = CmiCreatePersistent(peer, sizeof(PmeGridMsg)
02278 + sizeof(float)*hd*fcount*zlistlen + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids
02279 + sizeof(envelope)+PRIORITY_SIZE/8 );
02280 }
02281 }
02282 #endif
02283
02284
02285 void ComputePme::doWork()
02286 {
02287 DebugM(4,"Entering ComputePme::doWork().\n");
02288
02289 #ifdef TRACE_COMPUTE_OBJECTS
02290 double traceObjStartTime = CmiWallTimer();
02291 #endif
02292
02293 ResizeArrayIter<PatchElem> ap(patchList);
02294
02295
02296 if ( ! patchList[0].p->flags.doFullElectrostatics )
02297 {
02298 for (ap = ap.begin(); ap != ap.end(); ap++) {
02299 CompAtom *x = (*ap).positionBox->open();
02300 Results *r = (*ap).forceBox->open();
02301 (*ap).positionBox->close(&x);
02302 (*ap).forceBox->close(&r);
02303 }
02304 reduction->submit();
02305 if (amd_reduction) amd_reduction->submit();
02306 return;
02307 }
02308
02309
02310 numLocalAtoms = 0;
02311 for (ap = ap.begin(); ap != ap.end(); ap++) {
02312 numLocalAtoms += (*ap).p->getNumAtoms();
02313 }
02314
02315 Lattice lattice = patchList[0].p->flags.lattice;
02316
02317 localData = new PmeParticle[numLocalAtoms*(numGrids+
02318 ((numGrids>1 || selfOn)?1:0))];
02319 localPartition = new unsigned char[numLocalAtoms];
02320
02321 int g;
02322 for ( g=0; g<numGrids; ++g ) {
02323 localGridData[g] = localData + numLocalAtoms*(g+1);
02324 }
02325
02326
02327 PmeParticle * data_ptr = localData;
02328 unsigned char * part_ptr = localPartition;
02329 const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
02330 * ComputeNonbondedUtil::dielectric_1 );
02331
02332 for (ap = ap.begin(); ap != ap.end(); ap++) {
02333 #ifdef NETWORK_PROGRESS
02334 CmiNetworkProgress();
02335 #endif
02336
02337 CompAtom *x = (*ap).positionBox->open();
02338
02339 if ( patchList[0].p->flags.doMolly ) {
02340 (*ap).positionBox->close(&x);
02341 x = (*ap).avgPositionBox->open();
02342 }
02343 int numAtoms = (*ap).p->getNumAtoms();
02344
02345 for(int i=0; i<numAtoms; ++i)
02346 {
02347 data_ptr->x = x[i].position.x;
02348 data_ptr->y = x[i].position.y;
02349 data_ptr->z = x[i].position.z;
02350 data_ptr->cg = coulomb_sqrt * x[i].charge;
02351 ++data_ptr;
02352 *part_ptr = x[i].partition;
02353 ++part_ptr;
02354 }
02355
02356 if ( patchList[0].p->flags.doMolly ) { (*ap).avgPositionBox->close(&x); }
02357 else { (*ap).positionBox->close(&x); }
02358 }
02359
02360
02361 if ( ((alchFepOn || alchThermIntOn) && (!alchDecouple)) || lesOn ) {
02362 for ( g=0; g<numGrids; ++g ) {
02363 #ifdef NETWORK_PROGRESS
02364 CmiNetworkProgress();
02365 #endif
02366
02367 PmeParticle *lgd = localGridData[g];
02368 int nga = 0;
02369 for(int i=0; i<numLocalAtoms; ++i) {
02370 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02371
02372
02373
02374 lgd[nga++] = localData[i];
02375 }
02376 }
02377 numGridAtoms[g] = nga;
02378 }
02379 } else if ( (alchFepOn || alchThermIntOn) && alchDecouple) {
02380
02381
02382
02383
02384
02385
02386 for ( g=0; g<2; ++g ) {
02387 #ifdef NETWORK_PROGRESS
02388 CmiNetworkProgress();
02389 #endif
02390
02391 PmeParticle *lgd = localGridData[g];
02392 int nga = 0;
02393 for(int i=0; i<numLocalAtoms; ++i) {
02394 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02395 lgd[nga++] = localData[i];
02396 }
02397 }
02398 numGridAtoms[g] = nga;
02399 }
02400 for (g=2 ; g<4 ; ++g ) {
02401 #ifdef NETWORK_PROGRESS
02402 CmiNetworkProgress();
02403 #endif
02404
02405 PmeParticle *lgd = localGridData[g];
02406 int nga = 0;
02407 for(int i=0; i<numLocalAtoms; ++i) {
02408 if ( localPartition[i] == (g-1) ) {
02409 lgd[nga++] = localData[i];
02410 }
02411 }
02412 numGridAtoms[g] = nga;
02413 }
02414 for (g=4 ; g<numGrids ; ++g ) {
02415
02416 #ifdef NETWORK_PROGRESS
02417 CmiNetworkProgress();
02418 #endif
02419
02420 PmeParticle *lgd = localGridData[g];
02421 int nga = 0;
02422 for(int i=0; i<numLocalAtoms; ++i) {
02423 if ( localPartition[i] == 0 ) {
02424 lgd[nga++] = localData[i];
02425 }
02426 }
02427 numGridAtoms[g] = nga;
02428 }
02429 } else if ( selfOn ) {
02430 if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
02431 g = 0;
02432 PmeParticle *lgd = localGridData[g];
02433 int nga = 0;
02434 for(int i=0; i<numLocalAtoms; ++i) {
02435 if ( localPartition[i] == 1 ) {
02436 lgd[nga++] = localData[i];
02437 }
02438 }
02439 numGridAtoms[g] = nga;
02440 } else if ( pairOn ) {
02441 if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
02442 g = 0;
02443 PmeParticle *lgd = localGridData[g];
02444 int nga = 0;
02445 for(int i=0; i<numLocalAtoms; ++i) {
02446 if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02447 lgd[nga++] = localData[i];
02448 }
02449 }
02450 numGridAtoms[g] = nga;
02451 for ( g=1; g<3; ++g ) {
02452 PmeParticle *lgd = localGridData[g];
02453 int nga = 0;
02454 for(int i=0; i<numLocalAtoms; ++i) {
02455 if ( localPartition[i] == g ) {
02456 lgd[nga++] = localData[i];
02457 }
02458 }
02459 numGridAtoms[g] = nga;
02460 }
02461 } else {
02462 if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
02463 localGridData[0] = localData;
02464 numGridAtoms[0] = numLocalAtoms;
02465 }
02466
02467 memset( (void*) fz_arr, 0, myGrid.K3 * sizeof(char) );
02468
02469 for (int i=0; i<q_count; ++i) {
02470 memset( (void*) (q_list[i]), 0, myGrid.dim3 * sizeof(double) );
02471 }
02472
02473 strayChargeErrors = 0;
02474
02475
02476 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02477 for ( g=0; g<numGrids; ++g ) {
02478 #ifdef NETWORK_PROGRESS
02479 CmiNetworkProgress();
02480 #endif
02481
02482 evir[g] = 0;
02483 BigReal selfEnergy = 0;
02484 data_ptr = localGridData[g];
02485 int i;
02486 for(i=0; i<numGridAtoms[g]; ++i)
02487 {
02488 selfEnergy += data_ptr->cg * data_ptr->cg;
02489 ++data_ptr;
02490 }
02491 selfEnergy *= -1. * ewaldcof / SQRT_PI;
02492 evir[g][0] += selfEnergy;
02493
02494 double **q = q_arr + g*fsize;
02495 char *f = f_arr + g*fsize;
02496
02497 myRealSpace[g] = new PmeRealSpace(myGrid,numGridAtoms[g]);
02498 scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
02499 myRealSpace[g]->fill_charges(q, q_list, q_count, strayChargeErrors, f, fz_arr, localGridData[g]);
02500 }
02501
02502 #ifdef TRACE_COMPUTE_OBJECTS
02503 traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
02504 #endif
02505
02506 if ( myMgr->usePencils ) {
02507 sendPencils();
02508 } else {
02509 #if 0
02510 CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02511 pmeProxy[CkMyPe()].sendGrid();
02512 #else
02513 sendData(myMgr->numGridPes,myMgr->gridPeOrder,
02514 myMgr->recipPeDest,myMgr->gridPeMap);
02515 #endif
02516 }
02517
02518 }
02519
02520
02521 void ComputePme::sendPencils() {
02522
02523
02524
02525 int xBlocks = myMgr->xBlocks;
02526 int yBlocks = myMgr->yBlocks;
02527 int zBlocks = myMgr->zBlocks;
02528 myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02529 myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02530 int K1 = myGrid.K1;
02531 int K2 = myGrid.K2;
02532 int dim2 = myGrid.dim2;
02533 int dim3 = myGrid.dim3;
02534 int block1 = myGrid.block1;
02535 int block2 = myGrid.block2;
02536
02537 Lattice lattice = patchList[0].p->flags.lattice;
02538
02539 resultsRemaining = myMgr->numPencilsActive;
02540 const char *pencilActive = myMgr->pencilActive;
02541 const ijpair *activePencils = myMgr->activePencils;
02542 const int numPencilsActive = myMgr->numPencilsActive;
02543
02544
02545 NodePmeMgr *npMgr = myMgr->pmeNodeProxy[CkMyNode()].ckLocalBranch();
02546
02547 for (int ap=0; ap<numPencilsActive; ++ap) {
02548 int ib = activePencils[ap].i;
02549 int jb = activePencils[ap].j;
02550 int ibegin = ib*block1;
02551 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02552 int jbegin = jb*block2;
02553 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02554 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02555
02556 int fcount = 0;
02557 for ( int g=0; g<numGrids; ++g ) {
02558 char *f = f_arr + g*fsize;
02559 for ( int i=ibegin; i<iend; ++i ) {
02560 for ( int j=jbegin; j<jend; ++j ) {
02561 fcount += f[i*dim2+j];
02562 }
02563 }
02564 }
02565
02566 #ifdef NETWORK_PROGRESS
02567 CmiNetworkProgress();
02568 #endif
02569
02570 if ( ! pencilActive[ib*yBlocks+jb] )
02571 NAMD_bug("PME activePencils list inconsistent");
02572
02573 int zlistlen = 0;
02574 for ( int i=0; i<myGrid.K3; ++i ) {
02575 if ( fz_arr[i] ) ++zlistlen;
02576 }
02577
02578 int hd = ( fcount? 1 : 0 );
02579
02580
02581 PmeGridMsg *msg = new (hd*fcount*zlistlen, hd*zlistlen, hd*flen,
02582 hd*numGrids, PRIORITY_SIZE) PmeGridMsg;
02583 msg->sourceNode = CkMyPe();
02584 msg->hasData = hd;
02585 msg->lattice = lattice;
02586 if ( hd ) {
02587 #if 0
02588 msg->start = fstart;
02589 msg->len = flen;
02590 #else
02591 msg->start = -1;
02592 msg->len = -1;
02593 #endif
02594 msg->zlistlen = zlistlen;
02595 int *zlist = msg->zlist;
02596 zlistlen = 0;
02597 for ( int i=0; i<myGrid.K3; ++i ) {
02598 if ( fz_arr[i] ) zlist[zlistlen++] = i;
02599 }
02600 char *fmsg = msg->fgrid;
02601 float *qmsg = msg->qgrid;
02602 for ( int g=0; g<numGrids; ++g ) {
02603 char *f = f_arr + g*fsize;
02604 double **q = q_arr + g*fsize;
02605 for ( int i=ibegin; i<iend; ++i ) {
02606 for ( int j=jbegin; j<jend; ++j ) {
02607 *(fmsg++) = f[i*dim2+j];
02608 if( f[i*dim2+j] ) {
02609 for ( int k=0; k<zlistlen; ++k ) {
02610 *(qmsg++) = q[i*dim2+j][zlist[k]];
02611 }
02612 }
02613 }
02614 }
02615 }
02616 }
02617
02618 msg->sequence = sequence();
02619 SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
02620 CmiEnableUrgentSend(1);
02621 #if USE_NODE_PAR_RECEIVE
02622 msg->destElem=CkArrayIndex3D(ib,jb,0);
02623 CProxy_PmePencilMap lzm = npMgr->zm;
02624 int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
02625 int destnode = CmiNodeOf(destproc);
02626 myMgr->pmeNodeProxy[destnode].recvZGrid(msg);
02627 #else
02628 myMgr->zPencil(ib,jb,0).recvGrid(msg);
02629 #endif
02630 CmiEnableUrgentSend(0);
02631 }
02632
02633
02634 if ( strayChargeErrors ) {
02635 iout << iERROR << "Stray PME grid charges detected: "
02636 << CkMyPe() << " sending to (x,y)";
02637 for (int ib=0; ib<xBlocks; ++ib) {
02638 for (int jb=0; jb<yBlocks; ++jb) {
02639 int ibegin = ib*block1;
02640 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02641 int jbegin = jb*block2;
02642 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02643 int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02644
02645 for ( int g=0; g<numGrids; ++g ) {
02646 char *f = f_arr + g*fsize;
02647 if ( ! pencilActive[ib*yBlocks+jb] ) {
02648 for ( int i=ibegin; i<iend; ++i ) {
02649 for ( int j=jbegin; j<jend; ++j ) {
02650 if ( f[i*dim2+j] == 3 ) {
02651 f[i*dim2+j] = 2;
02652 iout << " (" << i << "," << j << ")";
02653 }
02654 }
02655 }
02656 }
02657 }
02658 }
02659 }
02660 iout << "\n" << endi;
02661 }
02662
02663
02664
02665
02666
02667 }
02668
02669
02670 void ComputePme::copyPencils(PmeGridMsg *msg) {
02671
02672 int xBlocks = myMgr->xBlocks;
02673 int yBlocks = myMgr->yBlocks;
02674 int zBlocks = myMgr->zBlocks;
02675 myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
02676 myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
02677 int K1 = myGrid.K1;
02678 int K2 = myGrid.K2;
02679 int dim2 = myGrid.dim2;
02680 int dim3 = myGrid.dim3;
02681 int block1 = myGrid.block1;
02682 int block2 = myGrid.block2;
02683
02684
02685 int ib = msg->sourceNode / yBlocks;
02686 int jb = msg->sourceNode % yBlocks;
02687
02688 int ibegin = ib*block1;
02689 int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
02690 int jbegin = jb*block2;
02691 int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
02692
02693 int zlistlen = msg->zlistlen;
02694 int *zlist = msg->zlist;
02695 float *qmsg = msg->qgrid;
02696 int g;
02697 for ( g=0; g<numGrids; ++g ) {
02698 evir[g] += msg->evir[g];
02699 char *f = f_arr + g*fsize;
02700 double **q = q_arr + g*fsize;
02701 for ( int i=ibegin; i<iend; ++i ) {
02702 for ( int j=jbegin; j<jend; ++j ) {
02703 if( f[i*dim2+j] ) {
02704 f[i*dim2+j] = 0;
02705 for ( int k=0; k<zlistlen; ++k ) {
02706 q[i*dim2+j][zlist[k]] = *(qmsg++);
02707 }
02708 }
02709 }
02710 }
02711 }
02712 }
02713
02714
02715 void ComputePme::sendData(int numRecipPes, int *recipPeOrder,
02716 int *recipPeDest, int *gridPeMap) {
02717
02718
02719
02720 myGrid.block1 = ( myGrid.K1 + numRecipPes - 1 ) / numRecipPes;
02721 myGrid.block2 = ( myGrid.K2 + numRecipPes - 1 ) / numRecipPes;
02722 bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
02723
02724 Lattice lattice = patchList[0].p->flags.lattice;
02725
02726 resultsRemaining = numRecipPes;
02727
02728 CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
02729 for (int j=0; j<numRecipPes; j++) {
02730 int pe = recipPeOrder[j];
02731 if ( ! recipPeDest[pe] && ! strayChargeErrors ) continue;
02732 int start = pe * bsize;
02733 int len = bsize;
02734 if ( start >= qsize ) { start = 0; len = 0; }
02735 if ( start + len > qsize ) { len = qsize - start; }
02736 int zdim = myGrid.dim3;
02737 int fstart = start / zdim;
02738 int flen = len / zdim;
02739 int fcount = 0;
02740 int i;
02741
02742 int g;
02743 for ( g=0; g<numGrids; ++g ) {
02744 char *f = f_arr + fstart + g*fsize;
02745 for ( i=0; i<flen; ++i ) {
02746 fcount += f[i];
02747 }
02748 if ( ! recipPeDest[pe] ) {
02749 int errfound = 0;
02750 for ( i=0; i<flen; ++i ) {
02751 if ( f[i] == 3 ) {
02752 errfound = 1;
02753 break;
02754 }
02755 }
02756 if ( errfound ) {
02757 iout << iERROR << "Stray PME grid charges detected: "
02758 << CkMyPe() << " sending to " << gridPeMap[pe] << " for planes";
02759 int iz = -1;
02760 for ( i=0; i<flen; ++i ) {
02761 if ( f[i] == 3 ) {
02762 f[i] = 2;
02763 int jz = (i+fstart)/myGrid.K2;
02764 if ( iz != jz ) { iout << " " << jz; iz = jz; }
02765 }
02766 }
02767 iout << "\n" << endi;
02768 }
02769 }
02770 }
02771
02772 #ifdef NETWORK_PROGRESS
02773 CmiNetworkProgress();
02774 #endif
02775
02776 if ( ! recipPeDest[pe] ) continue;
02777
02778 int zlistlen = 0;
02779 for ( i=0; i<myGrid.K3; ++i ) {
02780 if ( fz_arr[i] ) ++zlistlen;
02781 }
02782
02783 PmeGridMsg *msg = new (fcount*zlistlen, zlistlen, flen*numGrids,
02784 numGrids, PRIORITY_SIZE) PmeGridMsg;
02785 msg->sourceNode = CkMyPe();
02786 msg->lattice = lattice;
02787 msg->start = fstart;
02788 msg->len = flen;
02789 msg->zlistlen = zlistlen;
02790 int *zlist = msg->zlist;
02791 zlistlen = 0;
02792 for ( i=0; i<myGrid.K3; ++i ) {
02793 if ( fz_arr[i] ) zlist[zlistlen++] = i;
02794 }
02795 float *qmsg = msg->qgrid;
02796 for ( g=0; g<numGrids; ++g ) {
02797 char *f = f_arr + fstart + g*fsize;
02798 CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
02799 double **q = q_arr + fstart + g*fsize;
02800 for ( i=0; i<flen; ++i ) {
02801 if ( f[i] ) {
02802 for ( int k=0; k<zlistlen; ++k ) {
02803 *(qmsg++) = q[i][zlist[k]];
02804 }
02805 }
02806 }
02807 }
02808
02809 msg->sequence = sequence();
02810 SET_PRIORITY(msg,sequence(),PME_GRID_PRIORITY)
02811 pmeProxy[gridPeMap[pe]].recvGrid(msg);
02812 }
02813
02814 }
02815
02816 void ComputePme::copyResults(PmeGridMsg *msg) {
02817
02818 int zdim = myGrid.dim3;
02819 int flen = msg->len;
02820 int fstart = msg->start;
02821 int zlistlen = msg->zlistlen;
02822 int *zlist = msg->zlist;
02823 float *qmsg = msg->qgrid;
02824 int g;
02825 for ( g=0; g<numGrids; ++g ) {
02826 evir[g] += msg->evir[g];
02827 char *f = msg->fgrid + g*flen;
02828 double **q = q_arr + fstart + g*fsize;
02829 for ( int i=0; i<flen; ++i ) {
02830 if ( f[i] ) {
02831 f[i] = 0;
02832 for ( int k=0; k<zlistlen; ++k ) {
02833 q[i][zlist[k]] = *(qmsg++);
02834 }
02835 }
02836 }
02837 }
02838 }
02839
02840 void ComputePme::ungridForces() {
02841
02842 SimParameters *simParams = Node::Object()->simParameters;
02843
02844 Vector *localResults = new Vector[numLocalAtoms*
02845 ((numGrids>1 || selfOn)?2:1)];
02846 Vector *gridResults;
02847 if ( alchFepOn || alchThermIntOn || lesOn || selfOn || pairOn ) {
02848 for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
02849 gridResults = localResults + numLocalAtoms;
02850 } else {
02851 gridResults = localResults;
02852 }
02853
02854 Vector pairForce = 0.;
02855 Lattice lattice = patchList[0].p->flags.lattice;
02856 int g = 0;
02857 if(!simParams->commOnly) {
02858 for ( g=0; g<numGrids; ++g ) {
02859 #ifdef NETWORK_PROGRESS
02860 CmiNetworkProgress();
02861 #endif
02862
02863 myRealSpace[g]->compute_forces(q_arr+g*fsize, localGridData[g], gridResults);
02864 delete myRealSpace[g];
02865 scale_forces(gridResults, numGridAtoms[g], lattice);
02866
02867 if ( alchFepOn || alchThermIntOn ) {
02868 double scale = 1.;
02869 elecLambdaUp = (simParams->alchLambda <= alchElecLambdaStart)? 0. : \
02870 (simParams->alchLambda - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02871 elecLambdaDown = ((1-simParams->alchLambda) <= alchElecLambdaStart)? 0. : ((1-simParams->alchLambda) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
02872 if ( g == 0 ) scale = elecLambdaUp;
02873 else if ( g == 1 ) scale = elecLambdaDown;
02874 else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02875 if (alchDecouple) {
02876 if ( g == 2 ) scale = 1 - elecLambdaUp;
02877 else if ( g == 3 ) scale = 1 - elecLambdaDown;
02878 else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
02879 }
02880 int nga = 0;
02881 if (!alchDecouple) {
02882 for(int i=0; i<numLocalAtoms; ++i) {
02883 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02884
02885 localResults[i] += gridResults[nga++] * scale;
02886 }
02887 }
02888 }
02889 else {
02890 if ( g < 2 ) {
02891 for(int i=0; i<numLocalAtoms; ++i) {
02892 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02893
02894
02895 localResults[i] += gridResults[nga++] * scale;
02896 }
02897 }
02898 }
02899 else {
02900 for(int i=0; i<numLocalAtoms; ++i) {
02901 if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
02902
02903
02904
02905 localResults[i] += gridResults[nga++] * scale;
02906 }
02907 }
02908 }
02909 }
02910 } else if ( lesOn ) {
02911 double scale = 1.;
02912 if ( alchFepOn ) {
02913 if ( g == 0 ) scale = simParams->alchLambda;
02914 else if ( g == 1 ) scale = 1. - simParams->alchLambda;
02915 } else if ( lesOn ) {
02916 scale = 1.0 / (double)lesFactor;
02917 }
02918 int nga = 0;
02919 for(int i=0; i<numLocalAtoms; ++i) {
02920 if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
02921 localResults[i] += gridResults[nga++] * scale;
02922 }
02923 }
02924 } else if ( selfOn ) {
02925 PmeParticle *lgd = localGridData[g];
02926 int nga = 0;
02927 for(int i=0; i<numLocalAtoms; ++i) {
02928 if ( localPartition[i] == 1 ) {
02929 pairForce += gridResults[nga];
02930 localResults[i] += gridResults[nga++];
02931 }
02932 }
02933 } else if ( pairOn ) {
02934 if ( g == 0 ) {
02935 int nga = 0;
02936 for(int i=0; i<numLocalAtoms; ++i) {
02937 if ( localPartition[i] == 1 ) {
02938 pairForce += gridResults[nga];
02939 }
02940 if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
02941 localResults[i] += gridResults[nga++];
02942 }
02943 }
02944 } else if ( g == 1 ) {
02945 int nga = 0;
02946 for(int i=0; i<numLocalAtoms; ++i) {
02947 if ( localPartition[i] == g ) {
02948 pairForce -= gridResults[nga];
02949 localResults[i] -= gridResults[nga++];
02950 }
02951 }
02952 } else {
02953 int nga = 0;
02954 for(int i=0; i<numLocalAtoms; ++i) {
02955 if ( localPartition[i] == g ) {
02956 localResults[i] -= gridResults[nga++];
02957 }
02958 }
02959 }
02960 }
02961 }
02962 }
02963 else
02964 {
02965 for ( g=0; g<numGrids; ++g ) {delete myRealSpace[g];}
02966 }
02967 delete [] localData;
02968 delete [] localPartition;
02969
02970 Vector *results_ptr = localResults;
02971 ResizeArrayIter<PatchElem> ap(patchList);
02972
02973
02974 for (ap = ap.begin(); ap != ap.end(); ap++) {
02975 Results *r = (*ap).forceBox->open();
02976 Force *f = r->f[Results::slow];
02977 int numAtoms = (*ap).p->getNumAtoms();
02978
02979 if ( ! strayChargeErrors && ! simParams->commOnly ) {
02980 for(int i=0; i<numAtoms; ++i) {
02981 f[i].x += results_ptr->x;
02982 f[i].y += results_ptr->y;
02983 f[i].z += results_ptr->z;
02984 ++results_ptr;
02985 }
02986 }
02987 (*ap).forceBox->close(&r);
02988 }
02989
02990 delete [] localResults;
02991
02992 if ( pairOn || selfOn ) {
02993 ADD_VECTOR_OBJECT(reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
02994 }
02995
02996 for ( g=0; g<numGrids; ++g ) {
02997 double scale = 1.;
02998 if ( alchFepOn || alchThermIntOn ) {
02999 if ( g == 0 ) scale = elecLambdaUp;
03000 else if ( g == 1 ) scale = elecLambdaDown;
03001 else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03002 if (alchDecouple) {
03003 if ( g == 2 ) scale = 1-elecLambdaUp;
03004 else if ( g == 3 ) scale = 1-elecLambdaDown;
03005 else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
03006 }
03007 } else if ( lesOn ) {
03008 scale = 1.0 / (double)lesFactor;
03009 } else if ( pairOn ) {
03010 scale = ( g == 0 ? 1. : -1. );
03011 }
03012 reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03013 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03014 reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03015 reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03016 reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03017 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03018 reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03019 reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03020 reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03021 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03022
03023 if (amd_reduction) {
03024 amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
03025 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
03026 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
03027 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
03028 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
03029 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
03030 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
03031 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
03032 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
03033 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
03034 }
03035
03036 double scale2 = 0.;
03037
03038
03039
03040 if (alchFepOn) {
03041 BigReal elecLambda2Up = (simParams->alchLambda2 <= alchElecLambdaStart)? 0. : \
03042 (simParams->alchLambda2 - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03043 BigReal elecLambda2Down = ((1-simParams->alchLambda2) <= alchElecLambdaStart)? 0. : \
03044 ((1-simParams->alchLambda2) - alchElecLambdaStart) / (1. - alchElecLambdaStart);
03045
03046
03047 if ( g == 0 ) scale2 = elecLambda2Up;
03048 else if ( g == 1 ) scale2 = elecLambda2Down;
03049 else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03050 if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
03051 else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
03052 else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
03053 }
03054 reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03055 if (amd_reduction) amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
03056
03057 if (alchThermIntOn) {
03058
03059
03060
03061
03062
03063
03064
03065
03066
03067
03068
03069
03070
03071
03072
03073
03074
03075
03076
03077
03078
03079
03080
03081
03082
03083
03084
03085
03086 if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
03087 if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
03088 if (!alchDecouple) {
03089 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03090 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03091 }
03092 else {
03093 if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03094 if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03095 if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
03096 if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
03097 }
03098 }
03099 }
03100 reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
03101 reduction->submit();
03102 if (amd_reduction) amd_reduction->submit();
03103 }
03104
03105 #if USE_TOPOMAP
03106
03107 #define NPRIMES 8
03108 const static unsigned int NAMDPrimes[] = {
03109 3,
03110 5,
03111 7,
03112 11,
03113 13,
03114 17,
03115 19,
03116 23,
03117 29,
03118 31,
03119 37,
03120 59,
03121 73,
03122 93,
03123 113,
03124 157,
03125 307,
03126 617,
03127 1217
03128 };
03129
03130 #include "RecBisection.h"
03131
03132
03133
03134
03135
03136
03137
03138 bool generateBGLORBPmePeList(int *pemap, int numPes,
03139 int *block_pes, int nbpes) {
03140
03141 PatchMap *pmap = PatchMap::Object();
03142 int *pmemap = new int [CkNumPes()];
03143
03144 if (pemap == NULL)
03145 return false;
03146
03147 TopoManager tmgr;
03148
03149 memset(pmemap, 0, sizeof(int) * CkNumPes());
03150
03151 for(int count = 0; count < CkNumPes(); count++) {
03152 if(count < nbpes)
03153 pmemap[block_pes[count]] = 1;
03154
03155 if(pmap->numPatchesOnNode(count)) {
03156 pmemap[count] = 1;
03157
03158
03159 if(tmgr.hasMultipleProcsPerNode()) {
03160 pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
03161 }
03162 }
03163 }
03164
03165 if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
03166
03167 return false;
03168
03169 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03170 Node *node = nd.ckLocalBranch();
03171 SimParameters *simParams = node->simParameters;
03172
03173
03174
03175 int xsize = 0, ysize = 0, zsize = 0;
03176
03177 xsize = tmgr.getDimNX();
03178 ysize = tmgr.getDimNY();
03179 zsize = tmgr.getDimNZ();
03180
03181 int nx = xsize, ny = ysize, nz = zsize;
03182 DimensionMap dm;
03183
03184 dm.x = 0;
03185 dm.y = 1;
03186 dm.z = 2;
03187
03188 findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
03189
03190
03191 int group_size = numPes/nx;
03192 if(numPes % nx)
03193 group_size ++;
03194
03195 int my_prime = NAMDPrimes[0];
03196 int density = (ny * nz)/group_size + 1;
03197 int count = 0;
03198
03199
03200 for(count = 0; count < NPRIMES; count ++) {
03201
03202 if(density < NAMDPrimes[count]) {
03203 my_prime = NAMDPrimes[count];
03204 break;
03205 }
03206 }
03207
03208 if(count == NPRIMES)
03209 my_prime = NAMDPrimes[NPRIMES-1];
03210
03211
03212 int gcount = 0;
03213 int npme_pes = 0;
03214
03215 int coord[3];
03216
03217 for(int x = 0; x < nx; x++) {
03218 coord[0] = (x + nx/2)%nx;
03219
03220 for(count=0; count < group_size && npme_pes < numPes; count++) {
03221 int dest = (count + 1) * my_prime;
03222 dest = dest % (ny * nz);
03223
03224 coord[2] = dest / ny;
03225 coord[1] = dest - coord[2] * ny;
03226
03227
03228 int destPe = coord[dm.x] + coord[dm.y] * xsize +
03229 coord[dm.z] * xsize* ysize;
03230
03231 if(pmemap[destPe] == 0) {
03232 pemap[gcount++] = destPe;
03233 pmemap[destPe] = 1;
03234
03235 if(tmgr.hasMultipleProcsPerNode())
03236 pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
03237
03238 npme_pes ++;
03239 }
03240 else {
03241 for(int pos = 1; pos < ny * nz; pos++) {
03242
03243 coord[2] += pos / ny;
03244 coord[1] += pos % ny;
03245
03246 coord[2] = coord[2] % nz;
03247 coord[1] = coord[1] % ny;
03248
03249 int newdest = coord[dm.x] + coord[dm.y] * xsize +
03250 coord[dm.z] * xsize * ysize;
03251
03252 if(pmemap[newdest] == 0) {
03253 pemap[gcount++] = newdest;
03254 pmemap[newdest] = 1;
03255
03256 if(tmgr.hasMultipleProcsPerNode())
03257 pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
03258
03259 npme_pes ++;
03260 break;
03261 }
03262 }
03263 }
03264 }
03265
03266 if(gcount == numPes)
03267 gcount = 0;
03268
03269 if(npme_pes >= numPes)
03270 break;
03271 }
03272
03273 delete [] pmemap;
03274
03275 if(npme_pes != numPes)
03276
03277 return false;
03278
03279 return true;
03280 }
03281
03282 #endif
03283
03284 template <class T> class PmePencil : public T {
03285 public:
03286 PmePencil() {
03287 data = 0;
03288 work = 0;
03289 send_order = 0;
03290 needs_reply = 0;
03291 #if USE_PERSISTENT
03292 trans_handle = untrans_handle = ungrid_handle = NULL;
03293 #endif
03294 }
03295 ~PmePencil() {
03296 fftwf_free(data);
03297 delete [] work;
03298 delete [] send_order;
03299 delete [] needs_reply;
03300 }
03301 void base_init(PmePencilInitMsg *msg) {
03302 imsg=0;
03303 imsgb=0;
03304 hasData=0;
03305 initdata = msg->data;
03306 }
03307 void order_init(int nBlocks) {
03308 send_order = new int[nBlocks];
03309 for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
03310 Random rand(CkMyPe());
03311 rand.reorder(send_order,nBlocks);
03312 needs_reply = new int[nBlocks];
03313 }
03314 PmePencilInitMsgData initdata;
03315 Lattice lattice;
03316 PmeReduction evir;
03317 int sequence;
03318 int imsg;
03319 int imsgb;
03320 int hasData;
03321 float *data;
03322 float *work;
03323 int *send_order;
03324 int *needs_reply;
03325 #if USE_PERSISTENT
03326 PersistentHandle *trans_handle;
03327 PersistentHandle *untrans_handle;
03328 PersistentHandle *ungrid_handle;
03329 #endif
03330 };
03331
03332 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
03333 public:
03334 PmeZPencil_SDAG_CODE
03335 PmeZPencil() { __sdag_init(); setMigratable(false); }
03336 PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
03337 ~PmeZPencil() {
03338 #ifdef NAMD_FFTW
03339 #ifdef NAMD_FFTW_3
03340 delete [] forward_plans;
03341 delete [] backward_plans;
03342 #endif
03343 #endif
03344 }
03345 void fft_init();
03346 void recv_grid(const PmeGridMsg *);
03347 void forward_fft();
03348 void send_trans();
03349 void send_subset_trans(int fromIdx, int toIdx);
03350 void recv_untrans(const PmeUntransMsg *);
03351 void node_process_untrans(PmeUntransMsg *);
03352 void node_process_grid(PmeGridMsg *);
03353 void backward_fft();
03354 void send_ungrid(PmeGridMsg *);
03355 void send_all_ungrid();
03356 void send_subset_ungrid(int fromIdx, int toIdx, int specialIdx);
03357 private:
03358 ResizeArray<PmeGridMsg *> grid_msgs;
03359 #ifdef NAMD_FFTW
03360 #ifdef NAMD_FFTW_3
03361 fftwf_plan forward_plan, backward_plan;
03362
03363
03364 int numPlans;
03365 fftwf_plan *forward_plans, *backward_plans;
03366 #else
03367 rfftwnd_plan forward_plan, backward_plan;
03368 #endif
03369 #endif
03370
03371 int nx, ny;
03372 #if USE_PERSISTENT
03373 void setup_persistent() {
03374 int hd = 1;
03375 int zBlocks = initdata.zBlocks;
03376 int block3 = initdata.grid.block3;
03377 int dim3 = initdata.grid.dim3;
03378 CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03379 CmiAssert(yPencil_local);
03380 trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
03381 for ( int isend=0; isend<zBlocks; ++isend ) {
03382 int kb = send_order[isend];
03383 int nz = (initdata.grid.block3 > dim3/2 - kb*block3) ? (initdata.grid.block3) : (dim3/2 - kb*block3);
03384 int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
03385 int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz*2 +sizeof( envelope)+PRIORITY_SIZE/8;
03386 trans_handle[isend] = CmiCreatePersistent(peer, size);
03387 }
03388 }
03389
03390 void setup_ungrid_persistent()
03391 {
03392 ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
03393 for ( imsg=0; imsg < grid_msgs.size(); ++imsg ) {
03394 int peer = grid_msgs[imsg]->sourceNode;
03395 ungrid_handle[imsg] = CmiCreatePersistent(peer, 0);
03396 }
03397 }
03398 #endif
03399 };
03400
03401 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
03402 public:
03403 PmeYPencil_SDAG_CODE
03404 PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
03405 PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
03406 void fft_init();
03407 void recv_trans(const PmeTransMsg *);
03408 void forward_fft();
03409 void forward_subset_fft(int fromIdx, int toIdx);
03410 void send_trans();
03411 void send_subset_trans(int fromIdx, int toIdx);
03412 void recv_untrans(const PmeUntransMsg *);
03413 void node_process_trans(PmeTransMsg *);
03414 void node_process_untrans(PmeUntransMsg *);
03415 void backward_fft();
03416 void backward_subset_fft(int fromIdx, int toIdx);
03417 void send_untrans();
03418 void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03419 private:
03420 #ifdef NAMD_FFTW
03421 #ifdef NAMD_FFTW_3
03422 fftwf_plan forward_plan, backward_plan;
03423 #else
03424 fftw_plan forward_plan, backward_plan;
03425 #endif
03426 #endif
03427
03428 int nx, nz;
03429 #if USE_PERSISTENT
03430 void setup_persistent() {
03431 int yBlocks = initdata.yBlocks;
03432 int block2 = initdata.grid.block2;
03433 int K2 = initdata.grid.K2;
03434 int hd = 1;
03435 CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
03436 trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03437 for ( int isend=0; isend<yBlocks; ++isend ) {
03438 int jb = send_order[isend];
03439 int ny = block2 > (K2 - jb*block2) ? block2 : (K2 - jb*block2);
03440 int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
03441 int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8;
03442 trans_handle[isend] = CmiCreatePersistent(peer, size);
03443 }
03444
03445 CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
03446 untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
03447 int send_evir = 1;
03448 for ( int isend=0; isend<yBlocks; ++isend ) {
03449 int jb = send_order[isend];
03450 int ny = block2 > (K2 - jb*block2) ? block2 : (K2 - jb*block2);
03451
03452 int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
03453
03454 untrans_handle[isend] = CmiCreatePersistent(peer, sizeof(PmeUntransMsg) + sizeof(float)*nx*ny*nz*2 +sizeof(PmeReduction)*send_evir +sizeof( envelope) + PRIORITY_SIZE/8);
03455 }
03456 }
03457 #endif
03458 };
03459
03460 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
03461 public:
03462 PmeXPencil_SDAG_CODE
03463 PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; }
03464 PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
03465 ~PmeXPencil() {
03466 #ifdef NAMD_FFTW
03467 #ifdef NAMD_FFTW_3
03468 delete [] forward_plans;
03469 delete [] backward_plans;
03470 #endif
03471 #endif
03472 }
03473 void fft_init();
03474 void recv_trans(const PmeTransMsg *);
03475 void forward_fft();
03476 void pme_kspace();
03477 void backward_fft();
03478 void send_untrans();
03479 void send_subset_untrans(int fromIdx, int toIdx, int evirIdx);
03480 void node_process_trans(PmeTransMsg *);
03481 #ifdef NAMD_FFTW
03482 #ifdef NAMD_FFTW_3
03483 fftwf_plan forward_plan, backward_plan;
03484
03485 int numPlans;
03486 fftwf_plan *forward_plans, *backward_plans;
03487 #else
03488 fftw_plan forward_plan, backward_plan;
03489 #endif
03490 #endif
03491 int ny, nz;
03492 PmeKSpace *myKSpace;
03493 #if USE_PERSISTENT
03494 void setup_persistent() {
03495 int xBlocks = initdata.xBlocks;
03496 int block1 = initdata.grid.block1;
03497 int K1 = initdata.grid.K1;
03498 CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
03499 untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
03500 int send_evir = 1;
03501 for ( int isend=0; isend<xBlocks; ++isend ) {
03502 int ib = send_order[isend];
03503 int nx = block1 > (K1 - ib*block1) ? block1 : (K1 - ib*block1);
03504 int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
03505 untrans_handle[isend] = CmiCreatePersistent(peer, sizeof(PmeUntransMsg) + sizeof(PmeReduction)*send_evir + sizeof(float)*nx*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8);
03506 }
03507 }
03508 #endif
03509
03510 };
03511
03512 void PmeZPencil::fft_init() {
03513 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03514 Node *node = nd.ckLocalBranch();
03515 SimParameters *simParams = node->simParameters;
03516
03517 #if USE_NODE_PAR_RECEIVE
03518 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
03519 #endif
03520
03521 int K1 = initdata.grid.K1;
03522 int K2 = initdata.grid.K2;
03523 int K3 = initdata.grid.K3;
03524 int dim3 = initdata.grid.dim3;
03525 int block1 = initdata.grid.block1;
03526 int block2 = initdata.grid.block2;
03527
03528 nx = block1;
03529 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03530 ny = block2;
03531 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03532
03533 data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
03534 work = new float[dim3];
03535
03536 order_init(initdata.zBlocks);
03537
03538 #ifdef NAMD_FFTW
03539 CmiLock(ComputePmeMgr::fftw_plan_lock);
03540 #ifdef NAMD_FFTW_3
03541
03542
03543 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03544 int sizeLines=nx*ny;
03545 int planLineSizes[1];
03546 planLineSizes[0]=K3;
03547 int ndim=initdata.grid.dim3;
03548 int ndimHalf=ndim/2;
03549 forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
03550 (float *) data, NULL, 1,
03551 ndim,
03552 (fftwf_complex *) data, NULL, 1,
03553 ndimHalf,
03554 fftwFlags);
03555
03556 backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
03557 (fftwf_complex *) data, NULL, 1,
03558 ndimHalf,
03559 (float *) data, NULL, 1,
03560 ndim,
03561 fftwFlags);
03562 #if USE_NODEHELPER
03563 if(simParams->useNodeHelper) {
03564
03565
03566 numPlans = (nx<=ny?nx:ny);
03567 int howmany = sizeLines/numPlans;
03568 forward_plans = new fftwf_plan[numPlans];
03569 backward_plans = new fftwf_plan[numPlans];
03570 for(int i=0; i<numPlans; i++) {
03571 int dimStride = i*ndim*howmany;
03572 int dimHalfStride = i*ndimHalf*howmany;
03573 forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
03574 ((float *)data)+dimStride, NULL, 1,
03575 ndim,
03576 ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03577 ndimHalf,
03578 fftwFlags);
03579
03580 backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
03581 ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
03582 ndimHalf,
03583 ((float *)data)+dimStride, NULL, 1,
03584 ndim,
03585 fftwFlags);
03586 }
03587 }else
03588 #endif
03589 {
03590 forward_plans = NULL;
03591 backward_plans = NULL;
03592 }
03593 #else
03594 forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
03595 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03596 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03597 backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
03598 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03599 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
03600 #endif
03601 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03602 #else
03603 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03604 #endif
03605
03606 #if USE_NODE_PAR_RECEIVE
03607 evir = 0.;
03608 memset(data, 0, sizeof(float) * nx*ny*dim3);
03609 #endif
03610 }
03611
03612 void PmeYPencil::fft_init() {
03613 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03614 Node *node = nd.ckLocalBranch();
03615 SimParameters *simParams = node->simParameters;
03616
03617 #if USE_NODE_PAR_RECEIVE
03618 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
03619 #endif
03620
03621 int K1 = initdata.grid.K1;
03622 int K2 = initdata.grid.K2;
03623 int dim2 = initdata.grid.dim2;
03624 int dim3 = initdata.grid.dim3;
03625 int block1 = initdata.grid.block1;
03626 int block3 = initdata.grid.block3;
03627
03628 nx = block1;
03629 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
03630 nz = block3;
03631 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03632
03633 data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
03634 work = new float[2*K2];
03635
03636 order_init(initdata.yBlocks);
03637
03638 #ifdef NAMD_FFTW
03639 CmiLock(ComputePmeMgr::fftw_plan_lock);
03640 #ifdef NAMD_FFTW_3
03641
03642
03643
03644
03645 int sizeLines=nz;
03646 int planLineSizes[1];
03647 planLineSizes[0]=K2;
03648 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03649 forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03650 (fftwf_complex *) data, NULL, sizeLines, 1,
03651 (fftwf_complex *) data, NULL, sizeLines, 1,
03652 FFTW_FORWARD,
03653 fftwFlags);
03654 backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03655 (fftwf_complex *) data, NULL, sizeLines, 1,
03656 (fftwf_complex *) data, NULL, sizeLines, 1,
03657 FFTW_BACKWARD,
03658 fftwFlags);
03659 CkAssert(forward_plan != NULL);
03660 CkAssert(backward_plan != NULL);
03661 #else
03662 forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
03663 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03664 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03665 nz, (fftw_complex *) work, 1);
03666 backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
03667 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03668 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03669 nz, (fftw_complex *) work, 1);
03670 #endif
03671 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03672 #else
03673 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03674 #endif
03675
03676 #if USE_NODE_PAR_RECEIVE
03677 evir = 0;
03678 CmiMemoryWriteFence();
03679 #endif
03680 }
03681
03682 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
03683 {
03684 if ( msg->hasData ) hasData = 1;
03685 needs_reply[msg->sourceNode] = msg->hasData;
03686 recv_trans(msg);
03687 int limsg;
03688 CmiMemoryAtomicFetchAndInc(imsg,limsg);
03689 if(limsg+1 == initdata.yBlocks)
03690 {
03691 if ( hasData ) {
03692 forward_fft();
03693 }
03694 send_trans();
03695 if( ! hasData)
03696 {
03697 send_untrans();
03698 }
03699 imsg=0;
03700 CmiMemoryWriteFence();
03701 }
03702 }
03703
03704 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
03705 {
03706 recv_untrans(msg);
03707 int limsg;
03708 CmiMemoryAtomicFetchAndInc(imsgb,limsg);
03709 if(limsg+1 == initdata.yBlocks)
03710 {
03711 backward_fft();
03712 send_untrans();
03713 imsgb=0;
03714 CmiMemoryWriteFence();
03715 }
03716 }
03717
03718 #define DEBUG_NODE_PAR_RECV 0
03719
03720 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
03721
03722 PmeXPencil *target=xPencilObj.get(msg->destElem);
03723 #if DEBUG_NODE_PAR_RECV
03724 if(target == NULL)
03725 CkAbort("xpencil in recvXTrans not found, debug registeration");
03726 #endif
03727 target->node_process_trans(msg);
03728 delete msg;
03729 }
03730
03731
03732 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
03733
03734 PmeYPencil *target=yPencilObj.get(msg->destElem);
03735 #if DEBUG_NODE_PAR_RECV
03736 if(target == NULL)
03737 CkAbort("ypencil in recvYTrans not found, debug registeration");
03738 #endif
03739 target->node_process_trans(msg);
03740 delete msg;
03741 }
03742 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
03743
03744 PmeYPencil *target=yPencilObj.get(msg->destElem);
03745 #if DEBUG_NODE_PAR_RECV
03746 if(target == NULL)
03747 CkAbort("ypencil in recvYUntrans not found, debug registeration");
03748 #endif
03749 target->node_process_untrans(msg);
03750 delete msg;
03751 }
03752 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
03753
03754 PmeZPencil *target=zPencilObj.get(msg->destElem);
03755 #if DEBUG_NODE_PAR_RECV
03756 if(target == NULL)
03757 CkAbort("zpencil in recvZUntrans not found, debug registeration");
03758 #endif
03759 target->node_process_untrans(msg);
03760 delete msg;
03761 }
03762
03763 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
03764
03765 PmeZPencil *target=zPencilObj.get(msg->destElem);
03766 #if DEBUG_NODE_PAR_RECV
03767 if(target == NULL){
03768 CkAbort("zpencil in recvZGrid not found, debug registeration");
03769 }
03770 #endif
03771 target->node_process_grid(msg);
03772 }
03773
03774 void PmeXPencil::fft_init() {
03775 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
03776 Node *node = nd.ckLocalBranch();
03777 SimParameters *simParams = node->simParameters;
03778 #if USE_NODE_PAR_RECEIVE
03779 ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
03780 #endif
03781
03782 int K1 = initdata.grid.K1;
03783 int K2 = initdata.grid.K2;
03784 int dim3 = initdata.grid.dim3;
03785 int block2 = initdata.grid.block2;
03786 int block3 = initdata.grid.block3;
03787
03788 ny = block2;
03789 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
03790 nz = block3;
03791 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
03792
03793 data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
03794 work = new float[2*K1];
03795
03796 order_init(initdata.xBlocks);
03797
03798 #ifdef NAMD_FFTW
03799 CmiLock(ComputePmeMgr::fftw_plan_lock);
03800 #ifdef NAMD_FFTW_3
03801
03802 int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
03803 int sizeLines=ny*nz;
03804 int planLineSizes[1];
03805 planLineSizes[0]=K1;
03806 forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03807 (fftwf_complex *) data, NULL, sizeLines, 1,
03808 (fftwf_complex *) data, NULL, sizeLines, 1,
03809 FFTW_FORWARD,
03810 fftwFlags);
03811 backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
03812 (fftwf_complex *) data, NULL, sizeLines, 1,
03813 (fftwf_complex *) data, NULL, sizeLines, 1,
03814 FFTW_BACKWARD,
03815 fftwFlags);
03816
03817 #if USE_NODEHELPER
03818 if(simParams->useNodeHelper) {
03819
03820
03821 numPlans = (ny<=nz?ny:nz);
03822 int howmany = sizeLines/numPlans;
03823 forward_plans = new fftwf_plan[numPlans];
03824 backward_plans = new fftwf_plan[numPlans];
03825 for(int i=0; i<numPlans; i++) {
03826 int curStride = i*howmany;
03827 forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03828 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03829 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03830 FFTW_FORWARD,
03831 fftwFlags);
03832
03833 backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
03834 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03835 ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
03836 FFTW_BACKWARD,
03837 fftwFlags);
03838 }
03839 }else
03840 #endif
03841 {
03842 forward_plans = NULL;
03843 backward_plans = NULL;
03844 }
03845 #else
03846 forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
03847 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03848 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03849 ny*nz, (fftw_complex *) work, 1);
03850 backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
03851 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
03852 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
03853 ny*nz, (fftw_complex *) work, 1);
03854 #endif
03855 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
03856 #else
03857 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
03858 #endif
03859
03860 myKSpace = new PmeKSpace(initdata.grid,
03861 thisIndex.y*block2, thisIndex.y*block2 + ny,
03862 thisIndex.z*block3, thisIndex.z*block3 + nz);
03863
03864 }
03865
03866
03867
03868
03869 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
03870
03871 int dim3 = initdata.grid.dim3;
03872 if ( imsg == 0 ) {
03873 lattice = msg->lattice;
03874 sequence = msg->sequence;
03875 #if ! USE_NODE_PAR_RECEIVE
03876 memset(data, 0, sizeof(float)*nx*ny*dim3);
03877 #endif
03878 }
03879
03880 if ( ! msg->hasData ) return;
03881
03882 int zlistlen = msg->zlistlen;
03883 int *zlist = msg->zlist;
03884 char *fmsg = msg->fgrid;
03885 float *qmsg = msg->qgrid;
03886 float *d = data;
03887 int numGrids = 1;
03888 for ( int g=0; g<numGrids; ++g ) {
03889 for ( int i=0; i<nx; ++i ) {
03890 for ( int j=0; j<ny; ++j, d += dim3 ) {
03891 if( *(fmsg++) ) {
03892 for ( int k=0; k<zlistlen; ++k ) {
03893 d[zlist[k]] += *(qmsg++);
03894 }
03895 }
03896 }
03897 }
03898 }
03899 }
03900
03901 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
03902 #ifdef NAMD_FFTW
03903 #ifdef NAMD_FFTW_3
03904 fftwf_plan *plans = (fftwf_plan *)param;
03905 for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
03906 #endif
03907 #endif
03908 }
03909
03910 void PmeZPencil::forward_fft() {
03911 evir = 0.;
03912 #ifdef FFTCHECK
03913 int dim3 = initdata.grid.dim3;
03914 int K3 = initdata.grid.K3;
03915 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
03916 float *d = data;
03917 for ( int i=0; i<nx; ++i ) {
03918 for ( int j=0; j<ny; ++j, d += dim3 ) {
03919 for ( int k=0; k<dim3; ++k ) {
03920 d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
03921 }
03922 }
03923 }
03924 #endif
03925 #ifdef NAMD_FFTW
03926 #ifdef MANUAL_DEBUG_FFTW3
03927 dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
03928 #endif
03929 #ifdef NAMD_FFTW_3
03930 #if USE_NODEHELPER
03931 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
03932 if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
03933
03934
03935 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
03936 NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
03937 return;
03938 }
03939 #endif
03940 fftwf_execute(forward_plan);
03941 #else
03942 rfftwnd_real_to_complex(forward_plan, nx*ny,
03943 data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
03944 #endif
03945 #ifdef MANUAL_DEBUG_FFTW3
03946 dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
03947 #endif
03948
03949 #endif
03950 #ifdef ZEROCHECK
03951 int dim3 = initdata.grid.dim3;
03952 int K3 = initdata.grid.K3;
03953 float *d = data;
03954 for ( int i=0; i<nx; ++i ) {
03955 for ( int j=0; j<ny; ++j, d += dim3 ) {
03956 for ( int k=0; k<dim3; ++k ) {
03957 if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
03958 thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
03959 }
03960 }
03961 }
03962 #endif
03963 }
03964
03965
03966 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
03967 PmeZPencil *zpencil = (PmeZPencil *)param;
03968 zpencil->send_subset_trans(first, last);
03969 }
03970
03971 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
03972 int zBlocks = initdata.zBlocks;
03973 int block3 = initdata.grid.block3;
03974 int dim3 = initdata.grid.dim3;
03975 for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
03976 int kb = send_order[isend];
03977 int nz = block3;
03978 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
03979 int hd = ( hasData ? 1 : 0 );
03980 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
03981 msg->lattice = lattice;
03982 msg->sourceNode = thisIndex.y;
03983 msg->hasData = hasData;
03984 msg->nx = ny;
03985 if ( hasData ) {
03986 float *md = msg->qgrid;
03987 const float *d = data;
03988 for ( int i=0; i<nx; ++i ) {
03989 for ( int j=0; j<ny; ++j, d += dim3 ) {
03990 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
03991 *(md++) = d[2*k];
03992 *(md++) = d[2*k+1];
03993 }
03994 }
03995 }
03996 }
03997 msg->sequence = sequence;
03998 SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
03999
04000 CmiEnableUrgentSend(1);
04001 #if USE_NODE_PAR_RECEIVE
04002 msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04003 #if USE_PERSISTENT
04004 CmiUsePersistentHandle(&trans_handle[isend], 1);
04005 #endif
04006 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04007 #if USE_PERSISTENT
04008 CmiUsePersistentHandle(NULL, 0);
04009 #endif
04010 #else
04011 #if USE_PERSISTENT
04012 CmiUsePersistentHandle(&trans_handle[isend], 1);
04013 #endif
04014 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04015 #if USE_PERSISTENT
04016 CmiUsePersistentHandle(NULL, 0);
04017 #endif
04018 #endif
04019 CmiEnableUrgentSend(0);
04020 }
04021 }
04022
04023 void PmeZPencil::send_trans() {
04024 #if USE_PERSISTENT
04025 if (trans_handle == NULL) setup_persistent();
04026 #endif
04027 #if USE_NODEHELPER
04028 Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04029 if(useNodeHelper>=NDH_CTRL_PME_SENDTRANS) {
04036
04037 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04038 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1);
04039 return;
04040 }
04041 #endif
04042 int zBlocks = initdata.zBlocks;
04043 int block3 = initdata.grid.block3;
04044 int dim3 = initdata.grid.dim3;
04045 for ( int isend=0; isend<zBlocks; ++isend ) {
04046 int kb = send_order[isend];
04047 int nz = block3;
04048 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
04049 int hd = ( hasData ? 1 : 0 );
04050 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04051 msg->lattice = lattice;
04052 msg->sourceNode = thisIndex.y;
04053 msg->hasData = hasData;
04054 msg->nx = ny;
04055 if ( hasData ) {
04056 float *md = msg->qgrid;
04057 const float *d = data;
04058 for ( int i=0; i<nx; ++i ) {
04059 for ( int j=0; j<ny; ++j, d += dim3 ) {
04060 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04061 *(md++) = d[2*k];
04062 *(md++) = d[2*k+1];
04063 }
04064 }
04065 }
04066 }
04067 msg->sequence = sequence;
04068 SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
04069
04070 CmiEnableUrgentSend(1);
04071 #if USE_NODE_PAR_RECEIVE
04072 msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
04073 #if USE_PERSISTENT
04074 CmiUsePersistentHandle(&trans_handle[isend], 1);
04075 #endif
04076 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
04077 #if USE_PERSISTENT
04078 CmiUsePersistentHandle(NULL, 0);
04079 #endif
04080 #else
04081 #if USE_PERSISTENT
04082 CmiUsePersistentHandle(&trans_handle[isend], 1);
04083 #endif
04084 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
04085 #if USE_PERSISTENT
04086 CmiUsePersistentHandle(NULL, 0);
04087 #endif
04088 #endif
04089 CmiEnableUrgentSend(0);
04090 }
04091 }
04092
04093 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
04094 if ( imsg == 0 ) {
04095 lattice = msg->lattice;
04096 sequence = msg->sequence;
04097 }
04098 int block2 = initdata.grid.block2;
04099 int K2 = initdata.grid.K2;
04100 int jb = msg->sourceNode;
04101 int ny = msg->nx;
04102 if ( msg->hasData ) {
04103 const float *md = msg->qgrid;
04104 float *d = data;
04105 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04106 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04107 for ( int k=0; k<nz; ++k ) {
04108 #ifdef ZEROCHECK
04109 if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
04110 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04111 #endif
04112 d[2*(j*nz+k)] = *(md++);
04113 d[2*(j*nz+k)+1] = *(md++);
04114 }
04115 }
04116 }
04117 } else {
04118 float *d = data;
04119 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04120 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04121 for ( int k=0; k<nz; ++k ) {
04122 d[2*(j*nz+k)] = 0;
04123 d[2*(j*nz+k)+1] = 0;
04124 }
04125 }
04126 }
04127 }
04128 }
04129
04130 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
04131 PmeYPencil *ypencil = (PmeYPencil *)param;
04132 ypencil->forward_subset_fft(first, last);
04133 }
04134 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
04135 #ifdef NAMD_FFTW
04136 #ifdef NAMD_FFTW_3
04137 for(int i=fromIdx; i<=toIdx; i++){
04138 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
04139 * nz * initdata.grid.K2,
04140 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04141 }
04142 #endif
04143 #endif
04144 }
04145
04146 void PmeYPencil::forward_fft() {
04147 evir = 0.;
04148 #ifdef NAMD_FFTW
04149 #ifdef MANUAL_DEBUG_FFTW3
04150 dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04151 #endif
04152
04153 #ifdef NAMD_FFTW_3
04154 #if USE_NODEHELPER
04155 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04156 if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
04157 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04158 NodeHelper_Parallelize(nodeHelper, PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
04159 return;
04160 }
04161 #endif
04162
04163 for ( int i=0; i<nx; ++i ) {
04164 fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
04165 * nz * initdata.grid.K2,
04166 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04167 }
04168 #else
04169 for ( int i=0; i<nx; ++i ) {
04170 fftw(forward_plan, nz,
04171 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04172 nz, 1, (fftw_complex *) work, 1, 0);
04173 }
04174 #endif
04175 #ifdef MANUAL_DEBUG_FFTW3
04176 dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04177 #endif
04178
04179 #endif
04180 }
04181
04182 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
04183 PmeYPencil *ypencil = (PmeYPencil *)param;
04184 ypencil->send_subset_trans(first, last);
04185 }
04186
04187 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
04188 int yBlocks = initdata.yBlocks;
04189 int block2 = initdata.grid.block2;
04190 int K2 = initdata.grid.K2;
04191 for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
04192 int jb = send_order[isend];
04193 int ny = block2;
04194 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04195 int hd = ( hasData ? 1 : 0 );
04196 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04197 msg->lattice = lattice;
04198 msg->sourceNode = thisIndex.x;
04199 msg->hasData = hasData;
04200 msg->nx = nx;
04201 if ( hasData ) {
04202 float *md = msg->qgrid;
04203 const float *d = data;
04204 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04205 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04206 for ( int k=0; k<nz; ++k ) {
04207 *(md++) = d[2*(j*nz+k)];
04208 *(md++) = d[2*(j*nz+k)+1];
04209 #ifdef ZEROCHECK
04210 if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04211 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04212 #endif
04213 }
04214 }
04215 }
04216 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04217 thisIndex.x, jb, thisIndex.z);
04218 }
04219 msg->sequence = sequence;
04220 SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04221 CmiEnableUrgentSend(1);
04222 #if USE_NODE_PAR_RECEIVE
04223 msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04224 #if USE_PERSISTENT
04225 CmiUsePersistentHandle(&trans_handle[isend], 1);
04226 #endif
04227 initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
04228 #if USE_PERSISTENT
04229 CmiUsePersistentHandle(NULL, 0);
04230 #endif
04231 #else
04232 #if USE_PERSISTENT
04233 CmiUsePersistentHandle(&trans_handle[isend], 1);
04234 #endif
04235 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04236 #if USE_PERSISTENT
04237 CmiUsePersistentHandle(NULL, 0);
04238 #endif
04239 #endif
04240 CmiEnableUrgentSend(0);
04241 }
04242 }
04243
04244 void PmeYPencil::send_trans() {
04245 #if USE_PERSISTENT
04246 if (trans_handle == NULL) setup_persistent();
04247 #endif
04248 #if USE_NODEHELPER
04249 Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04250 if(useNodeHelper>=NDH_CTRL_PME_SENDTRANS) {
04257
04258 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04259 NodeHelper_Parallelize(nodeHelper, PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1);
04260 return;
04261 }
04262 #endif
04263 int yBlocks = initdata.yBlocks;
04264 int block2 = initdata.grid.block2;
04265 int K2 = initdata.grid.K2;
04266 for ( int isend=0; isend<yBlocks; ++isend ) {
04267 int jb = send_order[isend];
04268 int ny = block2;
04269 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04270 int hd = ( hasData ? 1 : 0 );
04271 PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
04272 msg->lattice = lattice;
04273 msg->sourceNode = thisIndex.x;
04274 msg->hasData = hasData;
04275 msg->nx = nx;
04276 if ( hasData ) {
04277 float *md = msg->qgrid;
04278 const float *d = data;
04279 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04280 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04281 for ( int k=0; k<nz; ++k ) {
04282 *(md++) = d[2*(j*nz+k)];
04283 *(md++) = d[2*(j*nz+k)+1];
04284 #ifdef ZEROCHECK
04285 if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
04286 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04287 #endif
04288 }
04289 }
04290 }
04291 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
04292 thisIndex.x, jb, thisIndex.z);
04293 }
04294 msg->sequence = sequence;
04295 SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
04296 CmiEnableUrgentSend(1);
04297 #if USE_NODE_PAR_RECEIVE
04298 msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
04299 #if USE_PERSISTENT
04300 CmiUsePersistentHandle(&trans_handle[isend], 1);
04301 #endif
04302 initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
04303 #if USE_PERSISTENT
04304 CmiUsePersistentHandle(NULL, 0);
04305 #endif
04306 #else
04307 #if USE_PERSISTENT
04308 CmiUsePersistentHandle(&trans_handle[isend], 1);
04309 #endif
04310 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
04311 #if USE_PERSISTENT
04312 CmiUsePersistentHandle(NULL, 0);
04313 #endif
04314
04315 #endif
04316 CmiEnableUrgentSend(0);
04317 }
04318 }
04319
04320 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
04321 {
04322 if(msg->hasData) hasData=1;
04323 needs_reply[msg->sourceNode] = msg->hasData;
04324 recv_trans(msg);
04325 int limsg;
04326 CmiMemoryAtomicFetchAndInc(imsg,limsg);
04327 if(limsg+1 == initdata.xBlocks)
04328 {
04329 if(hasData){
04330 forward_fft();
04331 pme_kspace();
04332 backward_fft();
04333 }
04334 send_untrans();
04335 imsg=0;
04336 CmiMemoryWriteFence();
04337 }
04338 }
04339
04340 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
04341 if ( imsg == 0 ) {
04342 lattice = msg->lattice;
04343 sequence = msg->sequence;
04344 }
04345 int block1 = initdata.grid.block1;
04346 int K1 = initdata.grid.K1;
04347 int ib = msg->sourceNode;
04348 int nx = msg->nx;
04349 if ( msg->hasData ) {
04350 const float *md = msg->qgrid;
04351 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04352 float *d = data + i*ny*nz*2;
04353 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04354 for ( int k=0; k<nz; ++k ) {
04355 #ifdef ZEROCHECK
04356 if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
04357 ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
04358 #endif
04359 d[2*k] = *(md++);
04360 d[2*k+1] = *(md++);
04361 }
04362 }
04363 }
04364 } else {
04365 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04366 float *d = data + i*ny*nz*2;
04367 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04368 for ( int k=0; k<nz; ++k ) {
04369 d[2*k] = 0;
04370 d[2*k+1] = 0;
04371 }
04372 }
04373 }
04374 }
04375 }
04376
04377 void PmeXPencil::forward_fft() {
04378 #ifdef NAMD_FFTW
04379
04380 #ifdef MANUAL_DEBUG_FFTW3
04381 dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04382 #endif
04383
04384 #ifdef NAMD_FFTW_3
04385 #if USE_NODEHELPER
04386 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04387 if(useNodeHelper>=NDH_CTRL_PME_FORWARDFFT) {
04388
04389
04390 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04391 NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1);
04392 return;
04393 }
04394 #endif
04395 fftwf_execute(forward_plan);
04396 #else
04397 fftw(forward_plan, ny*nz,
04398 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04399 #endif
04400 #ifdef MANUAL_DEBUG_FFTW3
04401 dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04402 #endif
04403
04404 #endif
04405 }
04406
04407 void PmeXPencil::pme_kspace() {
04408
04409 evir = 0.;
04410
04411 #ifdef FFTCHECK
04412 return;
04413 #endif
04414
04415 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
04416
04417 int numGrids = 1;
04418 for ( int g=0; g<numGrids; ++g ) {
04419 evir[0] = myKSpace->compute_energy(data+0*g,
04420 lattice, ewaldcof, &(evir[1]));
04421 }
04422
04423 #if USE_NODE_PAR_RECEIVE
04424 CmiMemoryWriteFence();
04425 #endif
04426 }
04427
04428 void PmeXPencil::backward_fft() {
04429 #ifdef NAMD_FFTW
04430 #ifdef MANUAL_DEBUG_FFTW3
04431 dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04432 #endif
04433
04434 #ifdef NAMD_FFTW_3
04435 #if USE_NODEHELPER
04436 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04437 if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
04438
04439
04440 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04441 NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
04442 return;
04443 }
04444 #endif
04445 fftwf_execute(backward_plan);
04446 #else
04447 fftw(backward_plan, ny*nz,
04448 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
04449 #endif
04450 #ifdef MANUAL_DEBUG_FFTW3
04451 dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04452 #endif
04453 #endif
04454 }
04455
04456 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04457 int evirIdx = paraNum;
04458 PmeXPencil *xpencil = (PmeXPencil *)param;
04459 xpencil->send_subset_untrans(first, last, evirIdx);
04460 }
04461
04462 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04463 int xBlocks = initdata.xBlocks;
04464 int block1 = initdata.grid.block1;
04465 int K1 = initdata.grid.K1;
04466
04467 int ackL=0, ackH=-1;
04468 int unL=0, unH=-1;
04469 int send_evir=0;
04470 if(fromIdx >= evirIdx+1) {
04471
04472 unL = fromIdx;
04473 unH = toIdx;
04474 } else if(toIdx <= evirIdx-1) {
04475
04476 ackL=fromIdx;
04477 ackH=toIdx;
04478 } else {
04479
04480 ackL=fromIdx;
04481 ackH=evirIdx-1;
04482 send_evir=1;
04483 unL=evirIdx+1;
04484 unH=toIdx;
04485 }
04486
04487 for(int isend=ackL; isend<=ackH; isend++) {
04488
04489 CmiEnableUrgentSend(1);
04490 int ib = send_order[isend];
04491 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04492 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04493 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04494 CmiEnableUrgentSend(0);
04495 }
04496
04497 CmiEnableUrgentSend(1);
04498
04499 if(send_evir) {
04500 int ib = send_order[evirIdx];
04501 int nx = block1;
04502 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04503 PmeUntransMsg *msg = new (nx*ny*nz*2,1,PRIORITY_SIZE) PmeUntransMsg;
04504 msg->evir[0] = evir;
04505 msg->has_evir = 1;
04506 msg->sourceNode = thisIndex.y;
04507 msg->ny = ny;
04508 float *md = msg->qgrid;
04509 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04510 float *d = data + i*ny*nz*2;
04511 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04512 for ( int k=0; k<nz; ++k ) {
04513 *(md++) = d[2*k];
04514 *(md++) = d[2*k+1];
04515 }
04516 }
04517 }
04518 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04519 #if USE_NODE_PAR_RECEIVE
04520 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04521 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04522 #else
04523 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04524 #endif
04525 }
04526 CmiEnableUrgentSend(0);
04527
04528
04529 for(int isend=unL; isend<=unH; isend++) {
04530 int ib = send_order[isend];
04531 int nx = block1;
04532 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04533 PmeUntransMsg *msg = new (nx*ny*nz*2,0,PRIORITY_SIZE) PmeUntransMsg;
04534 msg->has_evir = 0;
04535 msg->sourceNode = thisIndex.y;
04536 msg->ny = ny;
04537 float *md = msg->qgrid;
04538 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04539 float *d = data + i*ny*nz*2;
04540 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04541 for ( int k=0; k<nz; ++k ) {
04542 *(md++) = d[2*k];
04543 *(md++) = d[2*k+1];
04544 }
04545 }
04546 }
04547 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04548 CmiEnableUrgentSend(1);
04549 #if USE_NODE_PAR_RECEIVE
04550 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04551 #if USE_PERSISTENT
04552 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04553 #endif
04554 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04555 #if USE_PERSISTENT
04556 CmiUsePersistentHandle(NULL, 0);
04557 #endif
04558 #else
04559 #if USE_PERSISTENT
04560 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04561 #endif
04562 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04563 #if USE_PERSISTENT
04564 CmiUsePersistentHandle(NULL, 0);
04565 #endif
04566 #endif
04567 CmiEnableUrgentSend(0);
04568 }
04569 }
04570
04571 void PmeXPencil::send_untrans() {
04572 #if USE_PERSISTENT
04573 if (untrans_handle == NULL) setup_persistent();
04574 #endif
04575 #if USE_NODEHELPER
04576 Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04577 if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
04578 int xBlocks = initdata.xBlocks;
04579 int evirIdx = 0;
04580 for ( int isend=0; isend<xBlocks; ++isend ) {
04581 int ib = send_order[isend];
04582 if (needs_reply[ib]) {
04583 evirIdx = isend;
04584 break;
04585 }
04586 }
04587
04588
04589
04590
04591
04592
04593 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04594 #if USE_NODE_PAR_RECEIVE
04595
04596 NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 1);
04597 #else
04598
04599 NodeHelper_Parallelize(nodeHelper, PmeXPencilSendUntrans, evirIdx, (void *)this, xBlocks, 0, xBlocks-1, 0);
04600 #endif
04601 return;
04602 }
04603 #endif
04604 int xBlocks = initdata.xBlocks;
04605 int block1 = initdata.grid.block1;
04606 int K1 = initdata.grid.K1;
04607 int send_evir = 1;
04608 for ( int isend=0; isend<xBlocks; ++isend ) {
04609 int ib = send_order[isend];
04610 if ( ! needs_reply[ib] ) {
04611 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04612 CmiEnableUrgentSend(1);
04613 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04614 initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
04615 CmiEnableUrgentSend(0);
04616 continue;
04617 }
04618 int nx = block1;
04619 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
04620 PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
04621 if ( send_evir ) {
04622 msg->evir[0] = evir;
04623 msg->has_evir = 1;
04624 send_evir = 0;
04625 } else {
04626 msg->has_evir = 0;
04627 }
04628 msg->sourceNode = thisIndex.y;
04629 msg->ny = ny;
04630 float *md = msg->qgrid;
04631 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
04632 float *d = data + i*ny*nz*2;
04633 for ( int j=0; j<ny; ++j, d += nz*2 ) {
04634 for ( int k=0; k<nz; ++k ) {
04635 *(md++) = d[2*k];
04636 *(md++) = d[2*k+1];
04637 }
04638 }
04639 }
04640 SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
04641
04642 CmiEnableUrgentSend(1);
04643 #if USE_NODE_PAR_RECEIVE
04644 msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
04645 #if USE_PERSISTENT
04646 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04647 #endif
04648 initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
04649 #if USE_PERSISTENT
04650 CmiUsePersistentHandle(NULL, 0);
04651 #endif
04652 #else
04653 #if USE_PERSISTENT
04654 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04655 #endif
04656 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
04657 #if USE_PERSISTENT
04658 CmiUsePersistentHandle(NULL, 0);
04659 #endif
04660 #endif
04661 CmiEnableUrgentSend(0);
04662 }
04663 }
04664
04665 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
04666 if ( msg->has_evir ) evir += msg->evir[0];
04667 int block2 = initdata.grid.block2;
04668 int K2 = initdata.grid.K2;
04669 int jb = msg->sourceNode;
04670 int ny = msg->ny;
04671 const float *md = msg->qgrid;
04672 float *d = data;
04673 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04674 #if CMK_BLUEGENEL
04675 CmiNetworkProgress();
04676 #endif
04677 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04678 for ( int k=0; k<nz; ++k ) {
04679 #ifdef ZEROCHECK
04680 if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
04681 thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
04682 #endif
04683 d[2*(j*nz+k)] = *(md++);
04684 d[2*(j*nz+k)+1] = *(md++);
04685 }
04686 }
04687 }
04688 }
04689
04690 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
04691 PmeYPencil *ypencil = (PmeYPencil *)param;
04692 ypencil->backward_subset_fft(first, last);
04693 }
04694
04695 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
04696 #ifdef NAMD_FFTW
04697 #ifdef NAMD_FFTW_3
04698 for(int i=fromIdx; i<=toIdx; i++){
04699 fftwf_execute_dft(backward_plan,
04700 ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04701 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04702 }
04703 #endif
04704 #endif
04705 }
04706
04707 void PmeYPencil::backward_fft() {
04708 #ifdef NAMD_FFTW
04709 #ifdef MANUAL_DEBUG_FFTW3
04710 dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04711 #endif
04712
04713 #ifdef NAMD_FFTW_3
04714 #if USE_NODEHELPER
04715 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04716 if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
04717 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04718 NodeHelper_Parallelize(nodeHelper, PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1);
04719 return;
04720 }
04721 #endif
04722
04723 for ( int i=0; i<nx; ++i ) {
04724 #if CMK_BLUEGENEL
04725 CmiNetworkProgress();
04726 #endif
04727 fftwf_execute_dft(backward_plan,
04728 ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
04729 ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
04730 }
04731 #else
04732 for ( int i=0; i<nx; ++i ) {
04733 #if CMK_BLUEGENEL
04734 CmiNetworkProgress();
04735 #endif
04736 fftw(backward_plan, nz,
04737 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
04738 nz, 1, (fftw_complex *) work, 1, 0);
04739 }
04740 #endif
04741
04742 #ifdef MANUAL_DEBUG_FFTW3
04743 dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
04744 #endif
04745
04746 #endif
04747 }
04748
04749 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
04750 int evirIdx = paraNum;
04751 PmeYPencil *ypencil = (PmeYPencil *)param;
04752 ypencil->send_subset_untrans(first, last, evirIdx);
04753 }
04754
04755 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx, int evirIdx){
04756 int yBlocks = initdata.yBlocks;
04757 int block2 = initdata.grid.block2;
04758 int K2 = initdata.grid.K2;
04759
04760 int ackL=0, ackH=-1;
04761 int unL=0, unH=-1;
04762 int send_evir=0;
04763 if(fromIdx >= evirIdx+1) {
04764
04765 unL = fromIdx;
04766 unH = toIdx;
04767 } else if(toIdx <= evirIdx-1) {
04768
04769 ackL=fromIdx;
04770 ackH=toIdx;
04771 } else {
04772
04773 ackL=fromIdx;
04774 ackH=evirIdx-1;
04775 send_evir=1;
04776 unL=evirIdx+1;
04777 unH=toIdx;
04778 }
04779
04780 for(int isend=ackL; isend<=ackH; isend++) {
04781
04782 CmiEnableUrgentSend(1);
04783 int jb = send_order[isend];
04784 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04785 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04786 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04787 CmiEnableUrgentSend(0);
04788 }
04789
04790 CmiEnableUrgentSend(1);
04791
04792 if(send_evir) {
04793 int jb = send_order[evirIdx];
04794 int ny = block2;
04795 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04796 PmeUntransMsg *msg = new (nx*ny*nz*2,1,PRIORITY_SIZE) PmeUntransMsg;
04797 msg->evir[0] = evir;
04798 msg->has_evir = 1;
04799 msg->sourceNode = thisIndex.z;
04800 msg->ny = nz;
04801 float *md = msg->qgrid;
04802 const float *d = data;
04803 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04804 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04805 for ( int k=0; k<nz; ++k ) {
04806 *(md++) = d[2*(j*nz+k)];
04807 *(md++) = d[2*(j*nz+k)+1];
04808 }
04809 }
04810 }
04811 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04812 #if USE_NODE_PAR_RECEIVE
04813 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04814
04815 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04816 #else
04817 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04818 #endif
04819 }
04820
04821 CmiEnableUrgentSend(0);
04822
04823 for(int isend=unL; isend<=unH; isend++) {
04824 int jb = send_order[isend];
04825 int ny = block2;
04826 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04827 PmeUntransMsg *msg = new (nx*ny*nz*2,0,PRIORITY_SIZE) PmeUntransMsg;
04828 msg->has_evir = 0;
04829 msg->sourceNode = thisIndex.z;
04830 msg->ny = nz;
04831 float *md = msg->qgrid;
04832 const float *d = data;
04833 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04834 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04835 for ( int k=0; k<nz; ++k ) {
04836 *(md++) = d[2*(j*nz+k)];
04837 *(md++) = d[2*(j*nz+k)+1];
04838 }
04839 }
04840 }
04841 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04842 CmiEnableUrgentSend(1);
04843 #if USE_NODE_PAR_RECEIVE
04844 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04845
04846 #if USE_PERSISTENT
04847 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04848 #endif
04849 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04850 #if USE_PERSISTENT
04851 CmiUsePersistentHandle(NULL, 0);
04852 #endif
04853 #else
04854 #if USE_PERSISTENT
04855 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04856 #endif
04857 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04858 #if USE_PERSISTENT
04859 CmiUsePersistentHandle(NULL, 0);
04860 #endif
04861 #endif
04862 CmiEnableUrgentSend(0);
04863 }
04864 }
04865
04866 void PmeYPencil::send_untrans() {
04867 #if USE_PERSISTENT
04868 if (untrans_handle == NULL) setup_persistent();
04869 #endif
04870 #if USE_NODEHELPER
04871 Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
04872 if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
04873 int yBlocks = initdata.yBlocks;
04874 int evirIdx = 0;
04875 for ( int isend=0; isend<yBlocks; ++isend ) {
04876 int jb = send_order[isend];
04877 if (needs_reply[jb]) {
04878 evirIdx = isend;
04879 break;
04880 }
04881 }
04882
04883
04884
04885
04886
04887
04888 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
04889 #if USE_NODE_PAR_RECEIVE
04890
04891 NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 1);
04892 evir = 0.;
04893 CmiMemoryWriteFence();
04894 #else
04895
04896 NodeHelper_Parallelize(nodeHelper, PmeYPencilSendUntrans, evirIdx, (void *)this, yBlocks, 0, yBlocks-1, 0);
04897 #endif
04898 return;
04899 }
04900 #endif
04901 int yBlocks = initdata.yBlocks;
04902 int block2 = initdata.grid.block2;
04903 int K2 = initdata.grid.K2;
04904 int send_evir = 1;
04905 for ( int isend=0; isend<yBlocks; ++isend ) {
04906 int jb = send_order[isend];
04907 if ( ! needs_reply[jb] ) {
04908 PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
04909 CmiEnableUrgentSend(1);
04910 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04911 initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
04912 CmiEnableUrgentSend(0);
04913 continue;
04914 }
04915 int ny = block2;
04916 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
04917 PmeUntransMsg *msg = new (nx*ny*nz*2,send_evir,PRIORITY_SIZE) PmeUntransMsg;
04918 if ( send_evir ) {
04919 msg->evir[0] = evir;
04920 msg->has_evir = 1;
04921 send_evir = 0;
04922 } else {
04923 msg->has_evir = 0;
04924 }
04925 msg->sourceNode = thisIndex.z;
04926 msg->ny = nz;
04927 float *md = msg->qgrid;
04928 const float *d = data;
04929 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
04930 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
04931 for ( int k=0; k<nz; ++k ) {
04932 *(md++) = d[2*(j*nz+k)];
04933 *(md++) = d[2*(j*nz+k)+1];
04934 }
04935 }
04936 }
04937 SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
04938
04939 CmiEnableUrgentSend(1);
04940 #if USE_NODE_PAR_RECEIVE
04941 msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
04942
04943 #if USE_PERSISTENT
04944 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04945 #endif
04946 initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
04947 #if USE_PERSISTENT
04948 CmiUsePersistentHandle(NULL, 0);
04949 #endif
04950 #else
04951 #if USE_PERSISTENT
04952 CmiUsePersistentHandle(&untrans_handle[isend], 1);
04953 #endif
04954 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
04955 #if USE_PERSISTENT
04956 CmiUsePersistentHandle(NULL, 0);
04957 #endif
04958 #endif
04959 CmiEnableUrgentSend(0);
04960 }
04961
04962 #if USE_NODE_PAR_RECEIVE
04963 evir = 0.;
04964 CmiMemoryWriteFence();
04965 #endif
04966 }
04967
04968 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
04969 #if ! USE_NODE_PAR_RECEIVE
04970 if(imsg==0) evir=0.;
04971 #endif
04972
04973 if ( msg->has_evir ) evir += msg->evir[0];
04974 int block3 = initdata.grid.block3;
04975 int dim3 = initdata.grid.dim3;
04976 int kb = msg->sourceNode;
04977 int nz = msg->ny;
04978 const float *md = msg->qgrid;
04979 float *d = data;
04980 for ( int i=0; i<nx; ++i ) {
04981 #if CMK_BLUEGENEL
04982 CmiNetworkProgress();
04983 #endif
04984 for ( int j=0; j<ny; ++j, d += dim3 ) {
04985 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
04986 #ifdef ZEROCHECK
04987 if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
04988 thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
04989 #endif
04990 d[2*k] = *(md++);
04991 d[2*k+1] = *(md++);
04992 }
04993 }
04994 }
04995 }
04996
04997 void PmeZPencil::backward_fft() {
04998 #ifdef NAMD_FFTW
04999 #ifdef MANUAL_DEBUG_FFTW3
05000 dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05001 #endif
05002 #ifdef NAMD_FFTW_3
05003 #if USE_NODEHELPER
05004 int useNodeHelper = Node::Object()->simParameters->useNodeHelper;
05005 if(useNodeHelper>=NDH_CTRL_PME_BACKWARDFFT) {
05006
05007
05008 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
05009 NodeHelper_Parallelize(nodeHelper, PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1);
05010 return;
05011 }
05012 #endif
05013 fftwf_execute(backward_plan);
05014 #else
05015 rfftwnd_complex_to_real(backward_plan, nx*ny,
05016 (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
05017 #endif
05018 #ifdef MANUAL_DEBUG_FFTW3
05019 dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
05020 #endif
05021
05022 #endif
05023
05024 #if CMK_BLUEGENEL
05025 CmiNetworkProgress();
05026 #endif
05027
05028 #ifdef FFTCHECK
05029 int dim3 = initdata.grid.dim3;
05030 int K1 = initdata.grid.K1;
05031 int K2 = initdata.grid.K2;
05032 int K3 = initdata.grid.K3;
05033 float scale = 1. / (1. * K1 * K2 * K3);
05034 float maxerr = 0.;
05035 float maxstd = 0.;
05036 int mi, mj, mk; mi = mj = mk = -1;
05037 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
05038 const float *d = data;
05039 for ( int i=0; i<nx; ++i ) {
05040 for ( int j=0; j<ny; ++j, d += dim3 ) {
05041 for ( int k=0; k<K3; ++k ) {
05042 float std = 10. * (10. * (10. * std_base + i) + j) + k;
05043 float err = scale * d[k] - std;
05044 if ( fabsf(err) > fabsf(maxerr) ) {
05045 maxerr = err;
05046 maxstd = std;
05047 mi = i; mj = j; mk = k;
05048 }
05049 }
05050 }
05051 }
05052 CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
05053 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
05054 #endif
05055
05056 }
05057
05058 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
05059
05060
05061 int specialIdx = paraNum;
05062 PmeZPencil *zpencil = (PmeZPencil *)param;
05063 zpencil->send_subset_ungrid(first, last, specialIdx);
05064 }
05065
05066 void PmeZPencil::send_all_ungrid() {
05067
05068
05069
05070
05071
05072
05073
05074
05075
05076
05077
05078
05079
05080
05081
05082
05083
05084 int evirIdx = 0;
05085 for(int imsg=0; imsg<grid_msgs.size(); imsg++) {
05086 if(grid_msgs[imsg]->hasData) {
05087 evirIdx = imsg;
05088 break;
05089 }
05090 }
05091
05092 #if USE_NODEHELPER
05093 Bool useNodeHelper = Node::Object()->simParameters->useNodeHelper;
05094 if(useNodeHelper>=NDH_CTRL_PME_SENDUNTRANS) {
05095 CProxy_FuncNodeHelper nodeHelper = CkpvAccess(BOCclass_group).nodeHelper;
05096
05097 #if USE_NODE_PAR_RECEIVE
05098
05099 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1);
05100 #else
05101
05102 NodeHelper_Parallelize(nodeHelper, PmeZPencilSendUngrid, evirIdx, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 0);
05103 #endif
05104 return;
05105 }
05106 #endif
05107 send_subset_ungrid(0, grid_msgs.size()-1, evirIdx);
05108 }
05109
05110 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx, int specialIdx){
05111 for (int imsg=fromIdx; imsg <=toIdx; ++imsg ) {
05112 PmeGridMsg *msg = grid_msgs[imsg];
05113 if ( msg->hasData) {
05114 if (imsg == specialIdx) {
05115 msg->evir[0] = evir;
05116 } else {
05117 msg->evir[0] = 0.;
05118 }
05119 }
05120 send_ungrid(msg);
05121 }
05122 }
05123
05124 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
05125 int pe = msg->sourceNode;
05126 if ( ! msg->hasData ) {
05127 delete msg;
05128 PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
05129 SET_PRIORITY(ackmsg,sequence,PME_UNGRID_PRIORITY)
05130 CmiEnableUrgentSend(1);
05131 initdata.pmeProxy[pe].recvAck(ackmsg);
05132 CmiEnableUrgentSend(0);
05133 return;
05134 }
05135 msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
05136 int dim3 = initdata.grid.dim3;
05137 int zlistlen = msg->zlistlen;
05138 int *zlist = msg->zlist;
05139 char *fmsg = msg->fgrid;
05140 float *qmsg = msg->qgrid;
05141 float *d = data;
05142 int numGrids = 1;
05143 for ( int g=0; g<numGrids; ++g ) {
05144 #if CMK_BLUEGENEL
05145 CmiNetworkProgress();
05146 #endif
05147 for ( int i=0; i<nx; ++i ) {
05148 for ( int j=0; j<ny; ++j, d += dim3 ) {
05149 if( *(fmsg++) ) {
05150 for ( int k=0; k<zlistlen; ++k ) {
05151 *(qmsg++) = d[zlist[k]];
05152 }
05153 }
05154 }
05155 }
05156 }
05157 SET_PRIORITY(msg,sequence,PME_UNGRID_PRIORITY)
05158 CmiEnableUrgentSend(1);
05159 initdata.pmeProxy[pe].recvUngrid(msg);
05160 CmiEnableUrgentSend(0);
05161 }
05162
05163 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
05164 {
05165 #if USE_NODE_PAR_RECEIVE
05166 CmiLock(ComputePmeMgr::fftw_plan_lock);
05167 CmiMemoryReadFence();
05168 #endif
05169 recv_grid(msg);
05170 if(msg->hasData) hasData=msg->hasData;
05171 int limsg;
05172 CmiMemoryAtomicFetchAndInc(imsg,limsg);
05173 grid_msgs[limsg] = msg;
05174
05175 if(limsg+1 == grid_msgs.size())
05176 {
05177
05178 if (hasData)
05179 {
05180 forward_fft();
05181 }
05182 send_trans();
05183 imsg=0;
05184 CmiMemoryWriteFence();
05185
05186 }
05187 #if USE_NODE_PAR_RECEIVE
05188 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05189 CmiMemoryWriteFence();
05190 #endif
05191 }
05192
05193 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
05194 {
05195 #if USE_NODE_PAR_RECEIVE
05196 CmiLock(ComputePmeMgr::fftw_plan_lock);
05197 CmiMemoryReadFence();
05198 #endif
05199 recv_untrans(msg);
05200 int limsg;
05201 CmiMemoryAtomicFetchAndInc(imsgb,limsg);
05202 if(limsg+1 == initdata.zBlocks)
05203 {
05204 if(hasData)
05205 {
05206 backward_fft();
05207 }
05208
05209 send_all_ungrid();
05210
05211
05212
05213
05214
05215
05216
05217
05218
05219
05220
05221
05222
05223
05224 imsgb=0;
05225 evir = 0;
05226 memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3);
05227 CmiMemoryWriteFence();
05228
05229 }
05230 #if USE_NODE_PAR_RECEIVE
05231 CmiUnlock(ComputePmeMgr::fftw_plan_lock);
05232 CmiMemoryWriteFence();
05233 #endif
05234 }
05235
05236
05237 #include "ComputePmeMgr.def.h"
05238