NAMD
ComputePme.C

#ifdef NAMD_FFTW
//#define MANUAL_DEBUG_FFTW3 1
#ifdef NAMD_FFTW_3
#include <fftw3.h>
#else
// fftw2 doesn't have these defined
#define fftwf_malloc fftw_malloc
#define fftwf_free fftw_free
#ifdef NAMD_FFTW_NO_TYPE_PREFIX
#include <fftw.h>
#include <rfftw.h>
#else
#include <sfftw.h>
#include <srfftw.h>
#endif
#endif
#endif

#include <vector>
#include <algorithm>
#include <deque>
using namespace std;

#include "InfoStream.h"
#include "Node.h"
#include "PatchMap.h"
#include "PatchMap.inl"
#include "AtomMap.h"
#include "ComputePme.h"
#include "ComputePmeMgr.decl.h"
#include "PmeBase.inl"
#include "PmeRealSpace.h"
#include "PmeKSpace.h"
#include "ComputeNonbondedUtil.h"
#include "PatchMgr.h"
#include "Molecule.h"
#include "ReductionMgr.h"
#include "ComputeMgr.h"
#include "ComputeMgr.decl.h"
// #define DEBUGM
#define MIN_DEBUG_LEVEL 3
#include "Debug.h"
#include "SimParameters.h"
#include "WorkDistrib.h"
#include "varsizemsg.h"
#include "Random.h"
#include "ckhashtable.h"
#include "Priorities.h"

#include "ComputeMoa.h"
#include "ComputeMoaMgr.decl.h"

//#define USE_RANDOM_TOPO 1

//#define USE_TOPO_SFC 1
//#define USE_CKLOOP 1
//#include "TopoManager.h"

#include "DeviceCUDA.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include "HipDefines.h"
#include <hip/hip_runtime.h>
#endif
void cuda_errcheck(const char *msg);
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;
#endif

#include "ComputePmeCUDAKernel.h"

#ifndef SQRT_PI
#define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits */
#endif

#if CMK_PERSISTENT_COMM
#define USE_PERSISTENT 1
#endif

#if USE_PERSISTENT
#define Z_PERSIST 1
#define Y_PERSIST 1
#define X_PERSIST 1
#endif

#if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
#define USE_NODE_PAR_RECEIVE 1
#endif

// set nonzero for pes hosting PME grid or transpose work; used by isPmeProcessor()
char *pencilPMEProcessors;

class PmeAckMsg : public CMessage_PmeAckMsg {
};

class PmeGridMsg : public CMessage_PmeGridMsg {
public:

  int sourceNode;
  int sequence;
  int hasData;
  Lattice lattice;
  int start;
  int len;
  int zlistlen;
  int *zlist;
  char *fgrid;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeTransMsg : public CMessage_PmeTransMsg {
public:

  int sourceNode;
  int sequence;
  int hasData;
  Lattice lattice;
  int x_start;
  int nx;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
public:
  PmeTransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

class PmeUntransMsg : public CMessage_PmeUntransMsg {
public:

  int sourceNode;
  int y_start;
  int ny;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
public:
  PmeUntransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

class PmeEvirMsg : public CMessage_PmeEvirMsg {
public:
  PmeReduction *evir;
};

class PmePencilMap : public CBase_PmePencilMap {
public:
  PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
    : ia(i_a), ib(i_b), nb(n_b),
      size(n), data(newcopyint(n,d)) {
  }
  virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
    // Return an "arrayHdl", given some information about the array
    return 0;
  }
  virtual int procNum(int, const CkArrayIndex &i) {
    // Return the home processor number for this element of this array
    return data[ i.data()[ia] * nb + i.data()[ib] ];
  }
  virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
    int mype = CkMyPe();
    for ( int i=0; i < size; ++i ) {
      if ( data[i] == mype ) {
        CkArrayIndex3D ai(0,0,0);
        ai.data()[ia] = i / nb;
        ai.data()[ib] = i % nb;
        if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
        if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
        mgr->insertInitial(ai,msg);
        msg = 0;
      }
    }
    mgr->doneInserting();
    if ( msg ) CkFreeMsg(msg);
  }
private:
  const int ia, ib, nb, size;
  const int* const data;
  static int* newcopyint(int n, int *d) {
    int *newd = new int[n];
    memcpy(newd, d, n*sizeof(int));
    return newd;
  }
};
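
// A PmePencilMap flattens two of the three pencil indices into the lookup
// table above: pencil (i,j) lives on pe data[i*nb + j].  A minimal sketch of
// the mapping arithmetic (values are made up for illustration; real maps are
// created through CProxy_PmePencilMap::ckNew as in initialize() below):
#if 0
  int homes[4] = { 0, 1, 2, 3 };         // home pe for each of 2x2 z pencils
  PmePencilMap zmap(0, 1, 2, 4, homes);  // ia=0 (x index), ib=1 (y index), nb=2
  CkArrayIndex3D ai(1, 0, 0);            // pencil at x=1, y=0
  int pe = zmap.procNum(0, ai);          // homes[1*2 + 0] == 2
#endif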

// use this idiom since messages don't have copy constructors
struct PmePencilInitMsgData {
  PmeGrid grid;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_NodePmeMgr pmeNodeProxy;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
};

class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
public:
  PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
  PmePencilInitMsgData data;
};


struct LocalPmeInfo {
  int nx, x_start;
  int ny_after_transpose, y_start_after_transpose;
};

struct NodePmeInfo {
  int npe, pe_start, real_node;
};


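// Picks the pe that will accumulate reciprocal-space energy/virial
// contributions: the nearest patch-owning pe, searched in widening rings
// (this pe, then this SMP node, then this physical node, then all pes).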
static int findRecipEvirPe() {
  PatchMap *patchMap = PatchMap::Object();
  {
    int mype = CkMyPe();
    if ( patchMap->numPatchesOnNode(mype) ) {
      return mype;
    }
  }
  {
    int node = CmiMyNode();
    int firstpe = CmiNodeFirst(node);
    int nodeSize = CmiNodeSize(node);
    int myrank = CkMyRank();
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = firstpe + (myrank+i)%nodeSize;
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  {
    int *pelist;
    int nodeSize;
    CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
    int myrank = 0;
    for ( int i=0; i<nodeSize; ++i ) {
      if ( pelist[i] == CkMyPe() ) myrank = i;
    }
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = pelist[(myrank+i)%nodeSize];
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  {
    int mype = CkMyPe();
    int npes = CkNumPes();
    for ( int i=0; i<npes; ++i ) {
      int pe = (mype+i)%npes;
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  NAMD_bug("findRecipEvirPe() failed!");
  return -999;  // should never happen
}


// Assigns gridPeMap and transPeMap to different sets of processors.
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
  int ncpus = CkNumPes();

  for ( int i=0; i<numGridPes; ++i ) {
    gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
  }
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    firstTransPe = 0;
    // 0 should be first in list, skip if possible
    if ( ncpus > numTransPes ) firstTransPe = 1;
  }
  for ( int i=0; i<numTransPes; ++i ) {
    transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
  }
  std::sort(transPeMap,transPeMap+numTransPes);
}
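
// Worked example (numbers hypothetical): with 8 pes, numGridPes=2 and
// numTransPes=3, grid pes take the last two slots of peDiffuseOrdering and
// trans pes the three slots just before them, so the two groups are
// disjoint; when the machine is too small, firstTransPe clamps to 0 (or 1
// when possible, keeping pe 0 free for its other duties).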

#if USE_TOPOMAP
// Topology-aware PME allocation
bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
                             int nbpes=0);
#endif


int compare_bit_reversed(int a, int b) {
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return (a & c) - (b & c);
}

inline bool less_than_bit_reversed(int a, int b) {
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return d && (b & c);
}

struct sortop_bit_reversed {
  inline bool operator() (int a, int b) const {
    return less_than_bit_reversed(a,b);
  }
};
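
// Both helpers order integers by their bit-reversed values: the lowest
// differing bit of a and b is the highest-order bit after reversal.  For
// 3-bit values the order is 0,4,2,6,1,5,3,7, which spreads consecutive pe
// ranks far apart.  A minimal sketch (illustration only):
#if 0
  int pes[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  std::sort(pes, pes + 8, sortop_bit_reversed());
  // pes is now { 0, 4, 2, 6, 1, 5, 3, 7 }
#endif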

struct ijpair {
  int i,j;
  ijpair() {;}
  ijpair(int I, int J) : i(I), j(J) {;}
};

struct ijpair_sortop_bit_reversed {
  inline bool operator() (const ijpair &a, const ijpair &b) const {
    return ( less_than_bit_reversed(a.i,b.i)
             || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
  }
};

class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
public:
  friend class ComputePme;
  friend class NodePmeMgr;
  ComputePmeMgr();
  ~ComputePmeMgr();

  void initialize(CkQdMsg*);
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();

  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  void recvGrid(PmeGridMsg *);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void fwdSharedTrans(PmeTransMsg *);
  void recvSharedTrans(PmeSharedTransMsg *);
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void procTrans(PmeTransMsg *);
  void gridCalc2(void);
  #ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
  #endif // OPENATOM_VERSION
  void gridCalc2R(void);
  void fwdSharedUntrans(PmeUntransMsg *);
  void recvSharedUntrans(PmeSharedUntransMsg *);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void recvUntrans(PmeUntransMsg *);
  void procUntrans(PmeUntransMsg *);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void recvUngrid(PmeGridMsg *);
  void recvAck(PmeAckMsg *);
  void copyResults(PmeGridMsg *);
  void copyPencils(PmeGridMsg *);
  void ungridCalc(void);
  void recvRecipEvir(PmeEvirMsg *);
  void addRecipEvirClient(void);
  void submitReductions();

#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#endif

  static CmiNodeLock fftw_plan_lock;
  CmiNodeLock pmemgr_lock;  // for accessing this object from other threads

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  float *a_data_host;
  float *a_data_dev;
  float *f_data_host;
  float *f_data_dev;
  int cuda_atoms_count;
  int cuda_atoms_alloc;
  static CmiNodeLock cuda_lock;
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  cudaEvent_t end_charges;
  cudaEvent_t *end_forces;
  double charges_time;
  double forces_time;
  int check_charges_count;
  int check_forces_count;
  int this_pe;

  void cuda_submit_charges(Lattice &lattice, int sequence);
  struct cuda_submit_charges_args {
    ComputePmeMgr *mgr; Lattice *lattice; int sequence;
  };
  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
  static bool cuda_busy;

  void sendChargeGridReady();
#endif
  Lattice *saved_lattice;  // saved by chargeGridSubmitted
  int saved_sequence;      // saved by chargeGridSubmitted
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);

  ResizeArray<ComputePme*> pmeComputes;

private:

#if 0 && USE_PERSISTENT
  PersistentHandle *recvGrid_handle;
#endif

  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
  NodePmeMgr *nodePmeMgr;
  ComputePmeMgr *masterPmeMgr;

  void addCompute(ComputePme *c) {
    if ( ! pmeComputes.size() ) initialize_computes();
    pmeComputes.add(c);
    c->setMgr(this);
  }

  ResizeArray<ComputePme*> heldComputes;
  PmeGrid myGrid;
  Lattice lattice;
  PmeKSpace *myKSpace;
  float *qgrid;
  float *kgrid;

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftwf_complex *work;
#else
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  fftw_complex *work;
#endif
#else
  float *work;
#endif

  int qsize, fsize, bsize;
  int offload;
  BigReal alchLambda;   // set on each step in ComputePme::ungridForces()
  BigReal alchLambda2;  // set on each step in ComputePme::ungridForces()

  float **q_arr;
  // q_list and q_count not used for offload
  float **q_list;
  int q_count;
  char *f_arr;
  char *fz_arr;
  SubmitReduction *reduction;

  int noWorkCount;
  int doWorkCount;
  int ungridForcesCount;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define NUM_STREAMS 1
  cudaStream_t streams[NUM_STREAMS];
  int stream;

  float **q_arr_dev;
  float **v_arr_dev;
  float *q_data_host;
  float *q_data_dev;
  float *v_data_dev;
  int *ffz_host;
  int *ffz_dev;
  int q_data_size;
  int ffz_size;

  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float **afn_host;
  float **afn_dev;

  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
#endif
  int recipEvirCount;    // used in compute only
  int recipEvirClients;  // used in compute only
  int recipEvirPe;       // used in trans only

  LocalPmeInfo *localInfo;
  NodePmeInfo *gridNodeInfo;
  NodePmeInfo *transNodeInfo;
  int qgrid_size;
  int qgrid_start;
  int qgrid_len;
  int fgrid_start;
  int fgrid_len;

  int numSources;
  int numGridPes;
  int numTransPes;
  int numGridNodes;
  int numTransNodes;
  int numDestRecipPes;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int *gridPeMap;
  int *transPeMap;
  int *recipPeDest;
  int *gridPeOrder;
  int *gridNodeOrder;
  int *transNodeOrder;
  int grid_count;
  int trans_count;
  int untrans_count;
  int ungrid_count;
  PmeGridMsg **gridmsg_reuse;
  PmeReduction recip_evir2[PME_MAX_EVALS];

  int compute_sequence;  // set from patch computes, used for priorities
  int grid_sequence;     // set from grid messages, used for priorities
  int useBarrier;
  int sendTransBarrier_received;

  int usePencils;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  char *pencilActive;
  ijpair *activePencils;
  int numPencilsActive;
  int strayChargeErrors;
};

ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
    return mgr->pmeComputes ;
}

  CmiNodeLock ComputePmeMgr::fftw_plan_lock;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  CmiNodeLock ComputePmeMgr::cuda_lock;
  std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
  bool ComputePmeMgr::cuda_busy;
#endif

int isPmeProcessor(int p){
  SimParameters *simParams = Node::Object()->simParameters;
  if (simParams->usePMECUDA) {
    return 0;
  } else {
    return pencilPMEProcessors[p];
  }
}

class NodePmeMgr : public CBase_NodePmeMgr {
public:
  friend class ComputePmeMgr;
  friend class ComputePme;
  NodePmeMgr();
  ~NodePmeMgr();
  void initialize();
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void recvUntrans(PmeUntransMsg *);
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
  void recvXTrans(PmeTransMsg *);
  void recvYTrans(PmeTransMsg *);
  void recvYUntrans(PmeUntransMsg *);
  void recvZGrid(PmeGridMsg *);
  void recvZUntrans(PmeUntransMsg *);

  void recvUngrid(PmeGridMsg *);

  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
    xm=_xm; ym=_ym; zm=_zm;
  }
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;

private:
  CProxy_ComputePmeMgr mgrProxy;
  ComputePmeMgr *mgrObject;
  ComputePmeMgr **mgrObjects;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ComputePmeMgr *masterPmeMgr;
  int master_pe;
#endif
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
#endif
};

NodePmeMgr::NodePmeMgr() {
  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
}

NodePmeMgr::~NodePmeMgr() {
  delete [] mgrObjects;
}

void NodePmeMgr::initialize() {
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrProxy = proxy;
    mgrObject = proxy.ckLocalBranch();
  }
}

void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
  mgrObject->fwdSharedTrans(msg);
}

void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  mgrObject->fwdSharedUntrans(msg);
}

void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  masterPmeMgr->recvUngrid(msg);
#else
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
#endif
}

void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  xPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  yPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  zPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}

ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
                                 pmeProxyDir(thisgroup) {

  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  pmeNodeProxy.ckLocalBranch()->initialize();

  if ( CmiMyRank() == 0 ) {
    fftw_plan_lock = CmiCreateLock();
  }
  pmemgr_lock = CmiCreateLock();

  myKSpace = 0;
  kgrid = 0;
  work = 0;
  grid_count = 0;
  trans_count = 0;
  untrans_count = 0;
  ungrid_count = 0;
  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
  useBarrier = 0;
  sendTransBarrier_received = 0;
  usePencils = 0;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // offload has not been set so this happens on every run
  if ( CmiMyRank() == 0 ) {
    cuda_lock = CmiCreateLock();
  }

#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
  //if ( CkMyNode() == 0 ) {
  //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
  //}
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#else
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
#endif

  stream = 0;
  for ( int i=0; i<NUM_STREAMS; ++i ) {
#if 1
    CUDA_STREAM_CREATE(&streams[i]);
    cuda_errcheck("cudaStreamCreate");
#else
    streams[i] = 0;  // XXXX Testing!!!
#endif
  }

  this_pe = CkMyPe();

  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  end_forces = 0;
  check_charges_count = 0;
  check_forces_count = 0;

  cuda_atoms_count = 0;
  cuda_atoms_alloc = 0;

  f_data_mgr_alloc = 0;
  f_data_mgr_host = 0;
  f_data_mgr_dev = 0;
  afn_host = 0;
  afn_dev = 0;

#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
    traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
    traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
    traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
    traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
    traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
  }
#endif
  recipEvirCount = 0;
  recipEvirClients = 0;
  recipEvirPe = -999;
}


void ComputePmeMgr::recvArrays(
    CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x; yPencil = y; zPencil = z;

  if(CmiMyRank()==0)
  {
    pmeNodeProxy.ckLocalBranch()->xPencil=x;
    pmeNodeProxy.ckLocalBranch()->yPencil=y;
    pmeNodeProxy.ckLocalBranch()->zPencil=z;
  }
}

#if USE_TOPO_SFC
  struct Coord
  {
    int x, y, z;
    Coord(): x(0), y(0), z(0) {}
    Coord(int a, int b, int c): x(a), y(b), z(c) {}
  };
  extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);

  void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
  {
    SortableResizeArray<int> newprocs(procs.size());
    int num = 0;
    for (int i=0; i<result.size(); i++) {
      Coord &c = result[i];
      for (int j=0; j<procs.size(); j++) {
        int pe = procs[j];
        int x,y,z,t;
        tmgr.rankToCoordinates(pe, x, y, z, t);
        if (x==c.x && y==c.y && z==c.z)
          newprocs[num++] = pe;
      }
    }
    CmiAssert(newprocs.size() == procs.size());
    procs = newprocs;
  }

  int find_level_grid(int x)
  {
    int a = (int) sqrt((double)x);
    int b;
    for (; a>0; a--) {
      if (x%a == 0) break;
    }
    if (a==1) a = x;
    b = x/a;
    //return a>b?a:b;
    return b;
  }
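
  // Worked example: find_level_grid(12) starts a at sqrt(12) = 3, which
  // divides 12, so it returns b = 12/3 = 4; a torus dimension of 12 is
  // split into a 3 x 4 two-level grid for the space-filling curve.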
  CmiNodeLock tmgr_lock;
#endif

void Pme_init()
{
#if USE_TOPO_SFC
  if (CkMyRank() == 0)
    tmgr_lock = CmiCreateLock();
#endif
}

void ComputePmeMgr::initialize(CkQdMsg *msg) {
  delete msg;

  localInfo = new LocalPmeInfo[CkNumPes()];
  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
  transNodeInfo = new NodePmeInfo[CkNumNodes()];
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];

  if (CkMyRank() == 0) {
    pencilPMEProcessors = new char [CkNumPes()];
    memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
  }

  SimParameters *simParams = Node::Object()->simParameters;
  PatchMap *patchMap = PatchMap::Object();

  offload = simParams->PMEOffload;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload && ! deviceCUDA->one_device_per_node() ) {
    NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
  }
  if ( offload ) {
    int dev;
    cudaGetDevice(&dev);
    cuda_errcheck("in cudaGetDevice");
    if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cuda_errcheck("in cudaGetDeviceProperties");
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
  }
#endif

  alchLambda = -1.;  // illegal value to catch if not updated
  alchLambda2 = -1.;
  useBarrier = simParams->PMEBarrier;

  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  else {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int dimx = simParams->PMEGridSizeX;
    int dimy = simParams->PMEGridSizeY;
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
                     * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    else usePencils = 0;
  }
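
  // Example of the heuristic (numbers hypothetical): a 108^3 grid with
  // PMEMinSlices 2 allows maxslabs = 54 one-dimensional slabs, and with
  // PMEMinPoints 250000 allows maxpencils = 5; pencils win only when
  // maxpencils > 3 * maxslabs, i.e. when the pencil decomposition could
  // keep several times more pes busy than slabs could.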

  if ( usePencils ) {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( simParams->PMEPencils > 1 &&
         simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    } else {
      int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
                * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
      yBlocks = nb2 / nb;
    }

    if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
    if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
    if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;

    int dimx = simParams->PMEGridSizeX;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;

    int dimy = simParams->PMEGridSizeY;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;

    int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;
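
    // The paired lines above apply ceiling division twice: first the block
    // size needed to cover the dimension with the requested number of
    // blocks, then the number of blocks actually used at that size.  E.g.
    // dimx = 108 with xBlocks = 5 gives bx = 22 and xBlocks stays 5, while
    // xBlocks = 50 gives bx = 3 and xBlocks drops to 36, since 36 blocks
    // of 3 lines already cover all 108.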

    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << xBlocks << " x " <<
        yBlocks << " x " << zBlocks <<
        " pencil grid for FFT and reciprocal sum.\n" << endi;
    }
  } else { // usePencils

  { // decide how many pes to use for reciprocal sum

    // rules based on work available
    int minslices = simParams->PMEMinSlices;
    int dimx = simParams->PMEGridSizeX;
    int nrpx = ( dimx + minslices - 1 ) / minslices;
    int dimy = simParams->PMEGridSizeY;
    int nrpy = ( dimy + minslices - 1 ) / minslices;

    // rules based on processors available
    int nrpp = CkNumPes();
    // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
    if ( nrpp < nrpx ) nrpx = nrpp;
    if ( nrpp < nrpy ) nrpy = nrpp;

    // user override
    int nrps = simParams->PMEProcessors;
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( nrps > 0 ) nrpx = nrps;
    if ( nrps > 0 ) nrpy = nrps;

    // make sure there aren't any totally empty processors
    int bx = ( dimx + nrpx - 1 ) / nrpx;
    nrpx = ( dimx + bx - 1 ) / bx;
    int by = ( dimy + nrpy - 1 ) / nrpy;
    nrpy = ( dimy + by - 1 ) / by;
    if ( bx != ( dimx + nrpx - 1 ) / nrpx )
      NAMD_bug("Error in selecting number of PME processors.");
    if ( by != ( dimy + nrpy - 1 ) / nrpy )
      NAMD_bug("Error in selecting number of PME processors.");

    numGridPes = nrpx;
    numTransPes = nrpy;
  }
  if ( ! CkMyPe() ) {
    iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
      " processors for FFT and reciprocal sum.\n" << endi;
  }

  int sum_npes = numTransPes + numGridPes;
  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
  /* This code is being disabled permanently for slab PME on Blue Gene machines */
  PatchMap * pmap = PatchMap::Object();

  int patch_pes = pmap->numNodesWithPatches();
  TopoManager tmgr;
  if(tmgr.hasMultipleProcsPerNode())
    patch_pes *= 2;

  bool done = false;
  if(CkNumPes() > 2*sum_npes + patch_pes) {
    done = generateBGLORBPmePeList(transPeMap, numTransPes);
    done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
  }
  else
    if(CkNumPes() > 2 *max_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, max_npes);
      gridPeMap = transPeMap;
    }

  if (!done)
#endif
    {
      //generatePmePeList(transPeMap, max_npes);
      //gridPeMap = transPeMap;
      generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
    }

  if ( ! CkMyPe() ) {
    iout << iINFO << "PME GRID LOCATIONS:";
    int i;
    for ( i=0; i<numGridPes && i<10; ++i ) {
      iout << " " << gridPeMap[i];
    }
    if ( i < numGridPes ) iout << " ...";
    iout << "\n" << endi;
    iout << iINFO << "PME TRANS LOCATIONS:";
    for ( i=0; i<numTransPes && i<10; ++i ) {
      iout << " " << transPeMap[i];
    }
    if ( i < numTransPes ) iout << " ...";
    iout << "\n" << endi;
  }

  // sort based on nodes and physical nodes
  std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());

  myGridPe = -1;
  myGridNode = -1;
  int i = 0;
  int node = -1;
  int real_node = -1;
  for ( i=0; i<numGridPes; ++i ) {
    if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
    if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
    int real_node_i = CkNodeOf(gridPeMap[i]);
    if ( real_node_i == real_node ) {
      gridNodeInfo[node].npe += 1;
    } else {
      real_node = real_node_i;
      ++node;
      gridNodeInfo[node].real_node = real_node;
      gridNodeInfo[node].pe_start = i;
      gridNodeInfo[node].npe = 1;
    }
    if ( CkMyNode() == real_node_i ) myGridNode = node;
  }
  numGridNodes = node + 1;
  myTransPe = -1;
  myTransNode = -1;
  node = -1;
  real_node = -1;
  for ( i=0; i<numTransPes; ++i ) {
    if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
    if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
    int real_node_i = CkNodeOf(transPeMap[i]);
    if ( real_node_i == real_node ) {
      transNodeInfo[node].npe += 1;
    } else {
      real_node = real_node_i;
      ++node;
      transNodeInfo[node].real_node = real_node;
      transNodeInfo[node].pe_start = i;
      transNodeInfo[node].npe = 1;
    }
    if ( CkMyNode() == real_node_i ) myTransNode = node;
  }
  numTransNodes = node + 1;

  if ( ! CkMyPe() ) {
    iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
         << numTransNodes << " TRANS NODES\n" << endi;
  }

  { // generate random orderings for grid and trans messages
    int i;
    for ( i = 0; i < numGridPes; ++i ) {
      gridPeOrder[i] = i;
    }
    Random rand(CkMyPe());
    if ( myGridPe < 0 ) {
      rand.reorder(gridPeOrder,numGridPes);
    } else { // self last
      gridPeOrder[myGridPe] = numGridPes-1;
      gridPeOrder[numGridPes-1] = myGridPe;
      rand.reorder(gridPeOrder,numGridPes-1);
    }
    for ( i = 0; i < numGridNodes; ++i ) {
      gridNodeOrder[i] = i;
    }
    if ( myGridNode < 0 ) {
      rand.reorder(gridNodeOrder,numGridNodes);
    } else { // self last
      gridNodeOrder[myGridNode] = numGridNodes-1;
      gridNodeOrder[numGridNodes-1] = myGridNode;
      rand.reorder(gridNodeOrder,numGridNodes-1);
    }
    for ( i = 0; i < numTransNodes; ++i ) {
      transNodeOrder[i] = i;
    }
    if ( myTransNode < 0 ) {
      rand.reorder(transNodeOrder,numTransNodes);
    } else { // self last
      transNodeOrder[myTransNode] = numTransNodes-1;
      transNodeOrder[numTransNodes-1] = myTransNode;
      rand.reorder(transNodeOrder,numTransNodes-1);
    }
  }

  } // ! usePencils

  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
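
  // dim3 pads the z dimension to hold K3/2+1 complex values in place, the
  // standard in-place real-to-complex FFT layout: e.g. K3 = 108 is stored
  // as 110 floats = 55 complex numbers per grid line.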

  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block3 = myGrid.dim3 / 2;  // complex
  }

  if ( usePencils ) {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex


    int pe = 0;
    int x,y,z;

    SortableResizeArray<int> zprocs(xBlocks*yBlocks);
    SortableResizeArray<int> yprocs(xBlocks*zBlocks);
    SortableResizeArray<int> xprocs(yBlocks*zBlocks);

    // decide which pes to use by bit reversal and patch use
    int i;
    int ncpus = CkNumPes();
    SortableResizeArray<int> patches, nopatches, pmeprocs;
    PatchMap *pmap = PatchMap::Object();
    for ( int icpu=0; icpu<ncpus; ++icpu ) {
      int ri = WorkDistrib::peDiffuseOrdering[icpu];
      if ( ri ) { // keep 0 for special case
        // pretend pe 1 has patches to avoid placing extra PME load on node
        if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
        else nopatches.add(ri);
      }
    }

#if USE_RANDOM_TOPO
    Random rand(CkMyPe());
    int *tmp = new int[patches.size()];
    int nn = patches.size();
    for (i=0;i<nn;i++) tmp[i] = patches[i];
    rand.reorder(tmp, nn);
    patches.resize(0);
    for (i=0;i<nn;i++) patches.add(tmp[i]);
    delete [] tmp;
    tmp = new int[nopatches.size()];
    nn = nopatches.size();
    for (i=0;i<nn;i++) tmp[i] = nopatches[i];
    rand.reorder(tmp, nn);
    nopatches.resize(0);
    for (i=0;i<nn;i++) nopatches.add(tmp[i]);
    delete [] tmp;
#endif

    // only use zero if it eliminates overloading or has patches
    int useZero = 0;
    int npens = xBlocks*yBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += xBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += yBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;

    // add nopatches then patches in reversed order
    for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
    if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
    for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
    if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);

    int npes = pmeprocs.size();
    for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
    zprocs.sort();
#endif
    for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
    yprocs.sort();
#endif
    for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    xprocs.sort();
#endif

#if USE_TOPO_SFC
    CmiLock(tmgr_lock);
    //{
    TopoManager tmgr;
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
    if(CkMyPe() == 0)
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);

    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    //}
    CmiUnlock(tmgr_lock);
#endif


    if(CkMyPe() == 0){
      iout << iINFO << "PME Z PENCIL LOCATIONS:";
      for ( i=0; i<zprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
        iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << zprocs[i];
#endif
      }
      if ( i < zprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          pencilPMEProcessors[zprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME Y PENCIL LOCATIONS:";
      for ( i=0; i<yprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
        iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << yprocs[i];
#endif
      }
      if ( i < yprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, z = 0; z < zBlocks; ++z )
        for (x = 0; x < xBlocks; ++x, ++pe ) {
          pencilPMEProcessors[yprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME X PENCIL LOCATIONS:";
      for ( i=0; i<xprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
        iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << xprocs[i];
#endif
      }
      if ( i < xprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          pencilPMEProcessors[xprocs[pe]] = 1;
        }
    }


    // creating the pencil arrays
    if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
      // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
      WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
      std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
      std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
#endif
#if 1
      CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
      CProxy_PmePencilMap ym;
      if ( simParams->PMEPencilsYLayout )
        ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
      else
        ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
      CProxy_PmePencilMap xm;
      if ( simParams->PMEPencilsXLayout )
        xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
      else
        xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
      pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
      CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
      CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
      CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
      zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
      yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
      xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
      zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
#else
      zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          zPencil(x,y,0).insert(zprocs[pe]);
        }
      zPencil.doneInserting();

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          yPencil(x,0,z).insert(yprocs[pe]);
        }
      yPencil.doneInserting();


      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          xPencil(0,y,z).insert(xprocs[pe]);
        }
      xPencil.doneInserting();
#endif

      pmeProxy.recvArrays(xPencil,yPencil,zPencil);
      PmePencilInitMsgData msgdata;
      msgdata.grid = myGrid;
      msgdata.xBlocks = xBlocks;
      msgdata.yBlocks = yBlocks;
      msgdata.zBlocks = zBlocks;
      msgdata.xPencil = xPencil;
      msgdata.yPencil = yPencil;
      msgdata.zPencil = zPencil;
      msgdata.pmeProxy = pmeProxyDir;
      msgdata.pmeNodeProxy = pmeNodeProxy;
      msgdata.xm = xm;
      msgdata.ym = ym;
      msgdata.zm = zm;
      xPencil.init(new PmePencilInitMsg(msgdata));
      yPencil.init(new PmePencilInitMsg(msgdata));
      zPencil.init(new PmePencilInitMsg(msgdata));
    }

    return;  // continue in initialize_pencils() at next startup stage
  }


  int pe;
  int nx = 0;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    localInfo[pe].x_start = nx;
    nx += myGrid.block1;
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  int ny = 0;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    localInfo[pe].y_start_after_transpose = ny;
    ny += myGrid.block2;
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    localInfo[pe].ny_after_transpose =
      ny - localInfo[pe].y_start_after_transpose;
  }
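
  // Slab ownership is block1 consecutive x planes per grid pe, with the
  // last pe taking the remainder; e.g. (numbers hypothetical) K1 = 108 on
  // 5 grid pes gives block1 = 22 and x ranges 0-21, 22-43, 44-65, 66-87,
  // 88-107, the last pe holding only 20 planes.  The same clamping applies
  // to the y partition used after the transpose.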

  { // decide how many pes this node exchanges charges with

    PatchMap *patchMap = PatchMap::Object();
    Lattice lattice = simParams->lattice;
    BigReal sysdima = lattice.a_r().unit() * lattice.a();
    BigReal cutoff = simParams->cutoff;
    BigReal patchdim = simParams->patchDimension;
    int numPatches = patchMap->numPatches();
    int numNodes = CkNumPes();
    int *source_flags = new int[numNodes];
    int node;
    for ( node=0; node<numNodes; ++node ) {
      source_flags[node] = 0;
      recipPeDest[node] = 0;
    }

    // // make sure that we don't get ahead of ourselves on this node
    // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
    //   source_flags[CkMyPe()] = 1;
    //   recipPeDest[myRecipPe] = 1;
    // }

    for ( int pid=0; pid < numPatches; ++pid ) {
      int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
      int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
      BigReal minx = patchMap->min_a(pid);
      BigReal maxx = patchMap->max_a(pid);
      BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
      // min1 (max1) is smallest (largest) grid line for this patch
      int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
      int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
      for ( int i=min1; i<=max1; ++i ) {
        int ix = i;
        while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
        while ( ix < 0 ) ix += myGrid.K1;
        // set source_flags[pnode] if this patch sends to our node
        if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
             ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
          source_flags[pnode] = 1;
        }
        // set dest_flags[] for node that our patch sends to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( offload ) {
          if ( pnode == CkNodeFirst(CkMyNode()) ) {
            recipPeDest[ix / myGrid.block1] = 1;
          }
        } else
#endif
        if ( pnode == CkMyPe() ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      }
    }
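
    // min1/max1 bracket every grid line a patch's charges can touch once
    // spread by a B-spline of width `order`, with margina allowing for atom
    // drift between migrations; wrapping ix into [0,K1) then maps those
    // lines onto the periodic grid, marking which slab pes this rank must
    // send charges to and receive them from.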

    int numSourcesSamePhysicalNode = 0;
    numSources = 0;
    numDestRecipPes = 0;
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) ++numSources;
      if ( recipPeDest[node] ) ++numDestRecipPes;
      if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    }

#if 0
    if ( numSources ) {
      CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
               CkMyPe(), numSourcesSamePhysicalNode, numSources);
      iout << iINFO << "PME " << CkMyPe() << " sources:";
      for ( node=0; node<numNodes; ++node ) {
        if ( source_flags[node] ) iout << " " << node;
      }
      iout << "\n" << endi;
    }
#endif

    delete [] source_flags;

    // CkPrintf("PME on node %d has %d sources and %d destinations\n",
    //          CkMyPe(), numSources, numDestRecipPes);

  } // decide how many pes this node exchanges charges with (end)

  ungrid_count = numDestRecipPes;

  sendTransBarrier_received = 0;

  if ( myGridPe < 0 && myTransPe < 0 ) return;
  // the following only for nodes doing reciprocal sum

  if ( myTransPe >= 0 ) {
    recipEvirPe = findRecipEvirPe();
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }

  if ( myTransPe >= 0 ) {
    int k2_start = localInfo[myTransPe].y_start_after_transpose;
    int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
    #ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    } else {
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    }
    #else // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    #endif // OPENATOM_VERSION
  }

  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;
  }
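
  // When there is a single grid pe and a single trans pe, the transposed
  // data never has to coexist with partial grids collected from other pes,
  // so kgrid can alias qgrid and the second full-size allocation is skipped.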
  qgrid_size = local_size;

  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }

  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
#ifdef NAMD_FFTW
  CmiLock(fftw_plan_lock);
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  }
  /* need one plan per grid */
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
                                                   localInfo[myGridPe].nx,
                                                   qgrid + qgrid_size * g,
                                                   NULL,
                                                   1,
                                                   myGrid.dim2 * myGrid.dim3,
                                                   (fftwf_complex *)
                                                   (qgrid + qgrid_size * g),
                                                   NULL,
                                                   1,
                                                   myGrid.dim2 * (myGrid.dim3/2),
                                                   fftwFlags);
    }
  }
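
  // Each yz plan batches localInfo[myGridPe].nx in-place 2-D r2c FFTs of
  // size K2 x K3, one per x plane owned by this pe: input planes sit
  // dim2*dim3 floats apart and output planes dim2*(dim3/2) complex values
  // apart, the same bytes thanks to the dim3 padding.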
  int zdim = myGrid.dim3;
  int xStride = ( myTransPe >= 0 ?
                  localInfo[myTransPe].ny_after_transpose * ( myGrid.dim3 / 2 ) : 0 );
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {

      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
                                              (fftwf_complex *)
                                              (kgrid+qgrid_size*g),
                                              NULL,
                                              xStride,
                                              1,
                                              (fftwf_complex *)
                                              (kgrid+qgrid_size*g),
                                              NULL,
                                              xStride,
                                              1,
                                              FFTW_FORWARD,fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
                                               (fftwf_complex *)
                                               (kgrid+qgrid_size*g),
                                               NULL,
                                               xStride,
                                               1,
                                               (fftwf_complex *)
                                               (kgrid+qgrid_size*g),
                                               NULL,
                                               xStride,
                                               1,
                                               FFTW_BACKWARD, fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
                                                    localInfo[myGridPe].nx,
                                                    (fftwf_complex *)
                                                    (qgrid + qgrid_size * g),
                                                    NULL,
                                                    1,
                                                    myGrid.dim2*(myGrid.dim3/2),
                                                    qgrid + qgrid_size * g,
                                                    NULL,
                                                    1,
                                                    myGrid.dim2 * myGrid.dim3,
                                                    fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;

#else
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif
  CmiUnlock(fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}


void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  delete msg;
  if ( ! usePencils ) return;

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  ungrid_count = numPencilsActive;
}


void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
}

1801 
1802  if ( CmiMyRank() == 0 ) {
1803  CmiDestroyLock(fftw_plan_lock);
1804  }
1805  CmiDestroyLock(pmemgr_lock);
1806 
1807  delete myKSpace;
1808  delete [] localInfo;
1809  delete [] gridNodeInfo;
1810  delete [] transNodeInfo;
1811  delete [] gridPeMap;
1812  delete [] transPeMap;
1813  delete [] recipPeDest;
1814  delete [] gridPeOrder;
1815  delete [] gridNodeOrder;
1816  delete [] transNodeOrder;
1817  delete [] qgrid;
1818  if ( kgrid != qgrid ) delete [] kgrid;
1819  delete [] work;
1820  delete [] gridmsg_reuse;
1821 
1822  if ( ! offload ) {
1823  for (int i=0; i<q_count; ++i) {
1824  delete [] q_list[i];
1825  }
1826  delete [] q_list;
1827  delete [] fz_arr;
1828  }
1829  delete [] f_arr;
1830  delete [] q_arr;
1831 }
1832 
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  if ( grid_count == numSources ) {
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
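
// PmeGridMsg carries a two-level sparse encoding, unpacked above: fgrid
// flags which (x,y) columns contain data, zlist holds the nonzero z offsets
// common to all flagged columns, and qgrid supplies zlistlen floats per
// flagged column in column order.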
#ifdef MANUAL_DEBUG_FFTW3

/* utility functions for manual debugging */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int pe)
{

  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d.out",999);
  sprintf(filename,fmt, pe);
  FILE *loutfile = fopen(filename, "w");
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim,zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim+k]);
  fclose(loutfile);

}

void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int x, int y, int z)
{
  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d_%d_%d.out",999);
  sprintf(filename,fmt, x,y,z);
  FILE *loutfile = fopen(filename, "w");
  CkAssert(loutfile!=NULL);
  CkPrintf("opened %s for dump\n",filename);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim,zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim+k]);
  fclose(loutfile);
}

#endif

1911 
1912 void ComputePmeMgr::gridCalc1(void) {
1913  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1914 
1915 #ifdef NAMD_FFTW
1916  for ( int g=0; g<numGrids; ++g ) {
1917 #ifdef NAMD_FFTW_3
1918  fftwf_execute(forward_plan_yz[g]);
1919 #else
1920  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1921  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1922 #endif
1923 
1924  }
1925 #endif
1926 
1927  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1928 }
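// [Editor's sketch, not part of the original source] Each grid PE owns an
// x-slab and runs a 2D real-to-complex FFT over its (y,z) planes. A
// stand-alone FFTW3 equivalent, assuming (as the dim3 = 2*(K3/2+1) padding
// suggests) an in-place transform; the actual plans are built elsewhere in
// this file, so this is only a sketch of their shape:
#if 0
#include <fftw3.h>

void forward_yz_slab(float *qslab, int nx, int K2, int K3) {
  const int dim3 = 2 * (K3/2 + 1);          // padded real z extent
  const int n[2]       = { K2, K3 };        // logical transform size
  const int inembed[2] = { K2, dim3 };      // physical (padded) input layout
  const int onembed[2] = { K2, dim3/2 };    // same storage viewed as complex
  fftwf_plan p = fftwf_plan_many_dft_r2c(
      2, n, nx,                             // one 2D transform per x-plane
      qslab, inembed, 1, K2 * dim3,
      (fftwf_complex *) qslab, onembed, 1, K2 * dim3 / 2,
      FFTW_ESTIMATE);
  fftwf_execute(p);
  fftwf_destroy_plan(p);
}
#endif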
1929 
1930 void ComputePmeMgr::sendTransBarrier(void) {
1931  sendTransBarrier_received += 1;
1932  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1933  if ( sendTransBarrier_received < numGridPes ) return;
1934  sendTransBarrier_received = 0;
1935  for ( int i=0; i<numGridPes; ++i ) {
1936  pmeProxyDir[gridPeMap[i]].sendTrans();
1937  }
1938 }
1939 
1940 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
1941  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
1942  mgr->sendTransSubset(first, last);
1943 }
1944 
1945 void ComputePmeMgr::sendTrans(void) {
1946 
1947  untrans_count = numTransPes;
1948 
1949 #if CMK_SMP && USE_CKLOOP
1950  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1951  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1952  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1953  } else
1954 #endif
1955  {
1956  sendTransSubset(0, numTransNodes-1);
1957  }
1958 
1959 }
1960 
1961 void ComputePmeMgr::sendTransSubset(int first, int last) {
1962  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1963 
1964  // send data for transpose
1965  int zdim = myGrid.dim3;
1966  int nx = localInfo[myGridPe].nx;
1967  int x_start = localInfo[myGridPe].x_start;
1968  int slicelen = myGrid.K2 * zdim;
1969 
1970  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1971 
1972 #if CMK_BLUEGENEL
1973  CmiNetworkProgressAfter (0);
1974 #endif
1975 
1976  for (int j=first; j<=last; j++) {
1977  int node = transNodeOrder[j]; // different order on each node
1978  int pe = transNodeInfo[node].pe_start;
1979  int npe = transNodeInfo[node].npe;
1980  int totlen = 0;
1981  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
1982  LocalPmeInfo &li = localInfo[pe];
1983  int cpylen = li.ny_after_transpose * zdim;
1984  totlen += cpylen;
1985  }
1986  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
1987  PRIORITY_SIZE) PmeTransMsg;
1988  newmsg->sourceNode = myGridPe;
1989  newmsg->lattice = lattice;
1990  newmsg->x_start = x_start;
1991  newmsg->nx = nx;
1992  for ( int g=0; g<numGrids; ++g ) {
1993  float *qmsg = newmsg->qgrid + nx * totlen * g;
1994  pe = transNodeInfo[node].pe_start;
1995  for (int i=0; i<npe; ++i, ++pe) {
1996  LocalPmeInfo &li = localInfo[pe];
1997  int cpylen = li.ny_after_transpose * zdim;
1998  if ( node == myTransNode ) {
1999  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2000  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2001  }
2002  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2003  for ( int x = 0; x < nx; ++x ) {
2004  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2005  q += slicelen;
2006  qmsg += cpylen;
2007  }
2008  }
2009  }
2010  newmsg->sequence = grid_sequence;
2011  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2012  if ( node == myTransNode ) newmsg->nx = 0;
2013  if ( npe > 1 ) {
2014  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2015  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2016  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2017  }
2018 }
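// [Editor's sketch, not part of the original source] The transpose send packs,
// for every destination PE, the sub-block (my nx x-rows) x (its ny y-range) x
// zdim into one contiguous buffer. The strided gather above reduces to this
// hypothetical stand-alone pack, where slicelen = K2*zdim is the stride
// between consecutive x-rows in the slab and cpylen = ny_dest*zdim:
#if 0
#include <cstring>

void packTranspose(float *dst, const float *slab, int nx,
                   int slicelen, int cpylen) {
  for ( int x = 0; x < nx; ++x ) {
    memcpy(dst, slab, cpylen * sizeof(float));
    slab += slicelen;   // jump to the same y-range in the next x-row
    dst  += cpylen;     // the message buffer is dense
  }
}
#endif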
2019 
2020 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
2021  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2022  int pe = transNodeInfo[myTransNode].pe_start;
2023  int npe = transNodeInfo[myTransNode].npe;
2024  CmiNodeLock lock = CmiCreateLock();
2025  int *count = new int; *count = npe;
2026  for (int i=0; i<npe; ++i, ++pe) {
2027  PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
2028  SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
2029  shmsg->msg = msg;
2030  shmsg->count = count;
2031  shmsg->lock = lock;
2032  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2033  }
2034 }
2035 
2036 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
2037  procTrans(msg->msg);
2038  CmiLock(msg->lock);
2039  int count = --(*msg->count);
2040  CmiUnlock(msg->lock);
2041  if ( count == 0 ) {
2042  CmiDestroyLock(msg->lock);
2043  delete msg->count;
2044  delete msg->msg;
2045  }
2046  delete msg;
2047 }
2048 
2049 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
2050  procTrans(msg);
2051  delete msg;
2052 }
2053 
2054 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
2055  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2056  if ( trans_count == numGridPes ) {
2057  lattice = msg->lattice;
2058  grid_sequence = msg->sequence;
2059  }
2060 
2061  if ( msg->nx ) {
2062  int zdim = myGrid.dim3;
2063  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2064  int first_pe = nodeInfo.pe_start;
2065  int last_pe = first_pe+nodeInfo.npe-1;
2066  int y_skip = localInfo[myTransPe].y_start_after_transpose
2067  - localInfo[first_pe].y_start_after_transpose;
2068  int ny_msg = localInfo[last_pe].y_start_after_transpose
2069  + localInfo[last_pe].ny_after_transpose
2070  - localInfo[first_pe].y_start_after_transpose;
2071  int ny = localInfo[myTransPe].ny_after_transpose;
2072  int x_start = msg->x_start;
2073  int nx = msg->nx;
2074  for ( int g=0; g<numGrids; ++g ) {
2075  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2076  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2077  nx*ny*zdim*sizeof(float));
2078  }
2079  }
2080 
2081  --trans_count;
2082 
2083  if ( trans_count == 0 ) {
2084  pmeProxyDir[CkMyPe()].gridCalc2();
2085  }
2086 }
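// [Editor's sketch, not part of the original source] A node-shared PmeTransMsg
// carries the y-ranges of every PE on the destination node back-to-back, so
// each receiver offsets into the payload by y_skip (its y-start relative to
// the node's first PE) before copying its own nx*ny*zdim block. For a single
// grid the copy above amounts to:
#if 0
#include <cstring>

void unpackTranspose(float *kgrid, const float *qmsg, int x_start,
                     int nx, int ny, int y_skip, int zdim) {
  // destination: my transposed block, x-major with my ny rows per x-plane
  // source: skip y_skip rows of the shared message for each of the nx planes
  memcpy(kgrid + (size_t)x_start * ny * zdim,
         qmsg + (size_t)nx * y_skip * zdim,
         (size_t)nx * ny * zdim * sizeof(float));
}
#endif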
2087 
2088 void ComputePmeMgr::gridCalc2(void) {
2089  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2090 
2091 #if CMK_BLUEGENEL
2092  CmiNetworkProgressAfter (0);
2093 #endif
2094 
2095  int zdim = myGrid.dim3;
2096  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2097  int ny = localInfo[myTransPe].ny_after_transpose;
2098 
2099  for ( int g=0; g<numGrids; ++g ) {
2100  // finish forward FFT (x dimension)
2101 #ifdef NAMD_FFTW
2102 #ifdef NAMD_FFTW_3
2103  fftwf_execute(forward_plan_x[g]);
2104 #else
2105  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2106  ny * zdim / 2, 1, work, 1, 0);
2107 #endif
2108 #endif
2109  }
2110 
2111 #ifdef OPENATOM_VERSION
2112  if ( ! simParams -> openatomOn ) {
2113 #endif // OPENATOM_VERSION
2114  gridCalc2R();
2115 #ifdef OPENATOM_VERSION
2116  } else {
2117  gridCalc2Moa();
2118  }
2119 #endif // OPENATOM_VERSION
2120 }
2121 
2122 #ifdef OPENATOM_VERSION
2123 void ComputePmeMgr::gridCalc2Moa(void) {
2124 
2125  int zdim = myGrid.dim3;
2126  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2127  int ny = localInfo[myTransPe].ny_after_transpose;
2128 
2130 
2131  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
2132 
2133  for ( int g=0; g<numGrids; ++g ) {
2134  #ifdef OPENATOM_VERSION_DEBUG
2135  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
2136  for ( int i=0; i<=(ny * zdim / 2); ++i)
2137  {
2138  CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
2139  }
2140  #endif // OPENATOM_VERSION_DEBUG
2141 // mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
2142  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
2143  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
2144  }
2145 }
2146 #endif // OPENATOM_VERSION
2147 
2148 void ComputePmeMgr::gridCalc2R(void) {
2149 
2150  int useCkLoop = 0;
2151 #if CMK_SMP && USE_CKLOOP
2152  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2153  && CkNumPes() >= 2 * numTransPes ) {
2154  useCkLoop = 1;
2155  }
2156 #endif
2157 
2158  int zdim = myGrid.dim3;
2159  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2160  int ny = localInfo[myTransPe].ny_after_transpose;
2161 
2162  for ( int g=0; g<numGrids; ++g ) {
2163  // reciprocal space portion of PME
2164  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
2165  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2166  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2167  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2168 
2169  // start backward FFT (x dimension)
2170 
2171 #ifdef NAMD_FFTW
2172 #ifdef NAMD_FFTW_3
2173  fftwf_execute(backward_plan_x[g]);
2174 #else
2175  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2176  ny * zdim / 2, 1, work, 1, 0);
2177 #endif
2178 #endif
2179  }
2180 
2181  pmeProxyDir[CkMyPe()].sendUntrans();
2182 }
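// [Editor's note, not part of the original source] myKSpace->compute_energy()
// evaluates the smooth-PME reciprocal-space sum on the transformed grid,
//
//   E_recip = (1 / (2*pi*V)) * sum_{m != 0} exp(-pi^2 |m|^2 / beta^2) / |m|^2
//                              * B(m) * |Q^(m)|^2
//
// with beta = ewaldcof, B(m) the B-spline (Euler exponential spline) factor,
// and Q^(m) the FFT of the charge grid (Essmann et al., J. Chem. Phys.
// 103:8577, 1995). The six virial components returned in recip_evir2[g][1..6]
// follow from differentiating E_recip with respect to the cell vectors.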
2183 
2184 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
2185  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2186  mgr->sendUntransSubset(first, last);
2187 }
2188 
2189 void ComputePmeMgr::sendUntrans(void) {
2190 
2191  trans_count = numGridPes;
2192 
2193  { // send energy and virial
2194  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2195  for ( int g=0; g<numGrids; ++g ) {
2196  newmsg->evir[g] = recip_evir2[g];
2197  }
2198  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2199  CmiEnableUrgentSend(1);
2200  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2201  CmiEnableUrgentSend(0);
2202  }
2203 
2204 #if CMK_SMP && USE_CKLOOP
2205  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2206  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2207  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2208  } else
2209 #endif
2210  {
2211  sendUntransSubset(0, numGridNodes-1);
2212  }
2213 
2214 }
2215 
2216 void ComputePmeMgr::sendUntransSubset(int first, int last) {
2217 
2218  int zdim = myGrid.dim3;
2219  int y_start = localInfo[myTransPe].y_start_after_transpose;
2220  int ny = localInfo[myTransPe].ny_after_transpose;
2221  int slicelen = myGrid.K2 * zdim;
2222 
2223  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2224 
2225 #if CMK_BLUEGENEL
2226  CmiNetworkProgressAfter (0);
2227 #endif
2228 
2229  // send data for reverse transpose
2230  for (int j=first; j<=last; j++) {
2231  int node = gridNodeOrder[j]; // different order on each node
2232  int pe = gridNodeInfo[node].pe_start;
2233  int npe = gridNodeInfo[node].npe;
2234  int totlen = 0;
2235  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2236  LocalPmeInfo &li = localInfo[pe];
2237  int cpylen = li.nx * zdim;
2238  totlen += cpylen;
2239  }
2240  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2241  newmsg->sourceNode = myTransPe;
2242  newmsg->y_start = y_start;
2243  newmsg->ny = ny;
2244  for ( int g=0; g<numGrids; ++g ) {
2245  float *qmsg = newmsg->qgrid + ny * totlen * g;
2246  pe = gridNodeInfo[node].pe_start;
2247  for (int i=0; i<npe; ++i, ++pe) {
2248  LocalPmeInfo &li = localInfo[pe];
2249  if ( node == myGridNode ) {
2250  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2251  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2252  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2253  int cpylen = ny * zdim;
2254  for ( int x = 0; x < li.nx; ++x ) {
2255  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2256  q += cpylen;
2257  qmsg += slicelen;
2258  }
2259  } else {
2260  CmiMemcpy((void*)qmsg,
2261  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2262  li.nx*ny*zdim*sizeof(float));
2263  qmsg += li.nx*ny*zdim;
2264  }
2265  }
2266  }
2267  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2268  if ( node == myGridNode ) newmsg->ny = 0;
2269  if ( npe > 1 ) {
2270  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2271  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2272  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2273  }
2274 }
2275 
2276 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
2277  int pe = gridNodeInfo[myGridNode].pe_start;
2278  int npe = gridNodeInfo[myGridNode].npe;
2279  CmiNodeLock lock = CmiCreateLock();
2280  int *count = new int; *count = npe;
2281  for (int i=0; i<npe; ++i, ++pe) {
2282  PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
2283  shmsg->msg = msg;
2284  shmsg->count = count;
2285  shmsg->lock = lock;
2286  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2287  }
2288 }
2289 
2290 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
2291  procUntrans(msg->msg);
2292  CmiLock(msg->lock);
2293  int count = --(*msg->count);
2294  CmiUnlock(msg->lock);
2295  if ( count == 0 ) {
2296  CmiDestroyLock(msg->lock);
2297  delete msg->count;
2298  delete msg->msg;
2299  }
2300  delete msg;
2301 }
2302 
2303 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
2304  procUntrans(msg);
2305  delete msg;
2306 }
2307 
2308 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
2309  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2310 
2311 #if CMK_BLUEGENEL
2312  CmiNetworkProgressAfter (0);
2313 #endif
2314 
2315  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2316  int first_pe = nodeInfo.pe_start;
2317  int g;
2318 
2319  if ( msg->ny ) {
2320  int zdim = myGrid.dim3;
2321  int last_pe = first_pe+nodeInfo.npe-1;
2322  int x_skip = localInfo[myGridPe].x_start
2323  - localInfo[first_pe].x_start;
2324  int nx_msg = localInfo[last_pe].x_start
2325  + localInfo[last_pe].nx
2326  - localInfo[first_pe].x_start;
2327  int nx = localInfo[myGridPe].nx;
2328  int y_start = msg->y_start;
2329  int ny = msg->ny;
2330  int slicelen = myGrid.K2 * zdim;
2331  int cpylen = ny * zdim;
2332  for ( g=0; g<numGrids; ++g ) {
2333  float *q = qgrid + qgrid_size * g + y_start * zdim;
2334  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2335  for ( int x = 0; x < nx; ++x ) {
2336  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2337  q += slicelen;
2338  qmsg += cpylen;
2339  }
2340  }
2341  }
2342 
2343  --untrans_count;
2344 
2345  if ( untrans_count == 0 ) {
2346  pmeProxyDir[CkMyPe()].gridCalc3();
2347  }
2348 }
2349 
2350 void ComputePmeMgr::gridCalc3(void) {
2351  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2352 
2353  // finish backward FFT
2354 #ifdef NAMD_FFTW
2355  for ( int g=0; g<numGrids; ++g ) {
2356 #ifdef NAMD_FFTW_3
2357  fftwf_execute(backward_plan_yz[g]);
2358 #else
2359  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2360  (fftw_complex *) (qgrid + qgrid_size * g),
2361  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2362 #endif
2363  }
2364 
2365 #endif
2366 
2367  pmeProxyDir[CkMyPe()].sendUngrid();
2368 }
2369 
2370 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
2371  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2372  mgr->sendUngridSubset(first, last);
2373 }
2374 
2375 void ComputePmeMgr::sendUngrid(void) {
2376 
2377 #if CMK_SMP && USE_CKLOOP
2378  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2379  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2380  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2381  } else
2382 #endif
2383  {
2384  sendUngridSubset(0, numSources-1);
2385  }
2386 
2387  grid_count = numSources;
2388  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2389 }
2390 
2391 void ComputePmeMgr::sendUngridSubset(int first, int last) {
2392 
2393 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2394  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2395 #else
2396  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2397 #endif
2398 
2399  for ( int j=first; j<=last; ++j ) {
2400  // int msglen = qgrid_len;
2401  PmeGridMsg *newmsg = gridmsg_reuse[j];
2402  int pe = newmsg->sourceNode;
2403  int zdim = myGrid.dim3;
2404  int flen = newmsg->len;
2405  int fstart = newmsg->start;
2406  int zlistlen = newmsg->zlistlen;
2407  int *zlist = newmsg->zlist;
2408  float *qmsg = newmsg->qgrid;
2409  for ( int g=0; g<numGrids; ++g ) {
2410  char *f = newmsg->fgrid + fgrid_len * g;
2411  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2412  for ( int i=0; i<flen; ++i ) {
2413  if ( f[i] ) {
2414  for ( int k=0; k<zlistlen; ++k ) {
2415  *(qmsg++) = q[zlist[k]];
2416  }
2417  }
2418  q += zdim;
2419  }
2420  }
2421  newmsg->sourceNode = myGridPe;
2422 
2423  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2424  CmiEnableUrgentSend(1);
2425 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2426  if ( offload ) {
2427  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2428  } else
2429 #endif
2430  pmeProxyDir[pe].recvUngrid(newmsg);
2431  CmiEnableUrgentSend(0);
2432  }
2433 }
2434 
2435 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
2436  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2437 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2438  if ( ! offload ) // would need lock
2439 #endif
2440  if ( ungrid_count == 0 ) {
2441  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2442  }
2443 
2444  if ( usePencils ) copyPencils(msg);
2445  else copyResults(msg);
2446  delete msg;
2447  recvAck(0);
2448 }
2449 
2450 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
2451  if ( msg ) delete msg;
2452 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2453  if ( offload ) {
2454  CmiLock(cuda_lock);
2455  if ( ungrid_count == 0 ) {
2456  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2457  }
2458  int uc = --ungrid_count;
2459  CmiUnlock(cuda_lock);
2460 
2461  if ( uc == 0 ) {
2462  pmeProxyDir[master_pe].ungridCalc();
2463  }
2464  return;
2465  }
2466 #endif
2467  --ungrid_count;
2468 
2469  if ( ungrid_count == 0 ) {
2470  pmeProxyDir[CkMyPe()].ungridCalc();
2471  }
2472 }
2473 
2474 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2475 #define count_limit 1000000
2476 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
2477 #define EVENT_STRIDE 10
2478 
2479 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
2480 
2481 void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
2482 
2483 void cuda_check_pme_forces(void *arg, double walltime) {
2484  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
2485 
2486  while ( 1 ) { // process multiple events per call
2487  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
2488  if ( err == cudaSuccess ) {
2489  argp->check_forces_count = 0;
2490  for ( int i=0; i<EVENT_STRIDE; ++i ) {
2491  WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
2492  if ( ++(argp->forces_done_count) == argp->forces_count ) break;
2493  }
2494  if ( argp->forces_done_count == argp->forces_count ) { // last event
2495  traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
2496  argp->forces_time = walltime - argp->forces_time;
2497  //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
2498  return;
2499  } else { // more events
2500  continue; // check next event
2501  }
2502  } else if ( err != cudaErrorNotReady ) {
2503  char errmsg[256];
2504  sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
2505  argp->forces_done_count/EVENT_STRIDE,
2506  argp->check_forces_count, walltime - argp->forces_time,
2507  argp->saved_sequence);
2508  cudaDie(errmsg,err);
2509  } else if ( ++(argp->check_forces_count) >= count_limit ) {
2510  char errmsg[256];
2511  sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
2512  argp->forces_done_count/EVENT_STRIDE,
2513  argp->check_forces_count, walltime - argp->forces_time,
2514  argp->saved_sequence);
2515  cudaDie(errmsg,err);
2516  } else {
2517  break; // call again
2518  }
2519  } // while ( 1 )
2520  CcdCallBacksReset(0,walltime); // fix Charm++
2521  CUDA_POLL(cuda_check_pme_forces, arg);
2522 }
2523 #endif // NAMD_CUDA
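// [Editor's sketch, not part of the original source] The poller above keeps a
// Charm++ worker responsive while the GPU finishes: one cudaEvent is recorded
// per EVENT_STRIDE force kernels, cudaEventQuery() never blocks, and an
// unfinished event re-arms the check through CcdCallFnAfter (CUDA_POLL). The
// bare pattern, with a hypothetical 'rearm' standing in for CUDA_POLL:
#if 0
#include <cuda_runtime.h>
#include <cstdlib>

// True once 'ev' has fired; re-arms the poll otherwise.
bool pollEvent(cudaEvent_t ev, void (*rearm)(void)) {
  cudaError_t err = cudaEventQuery(ev);     // returns immediately
  if ( err == cudaSuccess ) return true;    // work behind the event is done
  if ( err != cudaErrorNotReady ) abort();  // real failure (cf. cudaDie above)
  rearm();                                  // poll again after a short delay
  return false;
}
#endif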
2524 
2525 void ComputePmeMgr::ungridCalc(void) {
2526  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2527 
2528  ungridForcesCount = pmeComputes.size();
2529 
2530 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2531  if ( offload ) {
2532  //CmiLock(cuda_lock);
2533  cudaSetDevice(deviceCUDA->getDeviceID());
2534 
2535  if ( this == masterPmeMgr ) {
2536  double before = CmiWallTimer();
2537  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2538  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2539  // try to make the unspecified launch failures go away
2540  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2541  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2542  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2543 
2544  const int myrank = CkMyRank();
2545  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2546  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2547  nodePmeMgr->mgrObjects[i]->ungridCalc();
2548  }
2549  }
2550  if ( ! pmeComputes.size() ) return;
2551  }
2552 
2553  if ( ! end_forces ) {
2554  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2555  end_forces = new cudaEvent_t[n];
2556  for ( int i=0; i<n; ++i ) {
2557  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2558  }
2559  }
2560 
2561  const int pcsz = pmeComputes.size();
2562  if ( ! afn_host ) {
2563  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2564  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2565  cuda_errcheck("malloc params for pme");
2566  }
2567  int totn = 0;
2568  for ( int i=0; i<pcsz; ++i ) {
2569  int n = pmeComputes[i]->numGridAtoms[0];
2570  totn += n;
2571  }
2572  if ( totn > f_data_mgr_alloc ) {
2573  if ( f_data_mgr_alloc ) {
2574  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2575  cudaFree(f_data_mgr_dev);
2576  cudaFreeHost(f_data_mgr_host);
2577  }
2578  f_data_mgr_alloc = 1.2 * (totn + 100);
2579  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2580  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2581  cuda_errcheck("malloc forces for pme");
2582  }
2583  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2584  float *f_dev = f_data_mgr_dev;
2585  float *f_host = f_data_mgr_host;
2586  for ( int i=0; i<pcsz; ++i ) {
2587  int n = pmeComputes[i]->numGridAtoms[0];
2588  pmeComputes[i]->f_data_dev = f_dev;
2589  pmeComputes[i]->f_data_host = f_host;
2590  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2591  afn_host[3*i+1] = f_dev;
2592  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2593  f_dev += 3*n;
2594  f_host += 3*n;
2595  }
2596  //CmiLock(cuda_lock);
2597  double before = CmiWallTimer();
2598  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2599  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2600  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2601  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2602  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2603  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2604 
2605  for ( int i=0; i<pcsz; ++i ) {
2606  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2607  if ( i%EVENT_STRIDE == 0 ) {
2608  int dimy = pcsz - i;
2609  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2610  int maxn = 0;
2611  int subtotn = 0;
2612  for ( int j=0; j<dimy; ++j ) {
2613  int n = pmeComputes[i+j]->numGridAtoms[0];
2614  subtotn += n;
2615  if ( n > maxn ) maxn = n;
2616  }
2617  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2618  before = CmiWallTimer();
2619  cuda_pme_forces(
2620  bspline_coeffs_dev,
2621  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2622  pmeComputes[i]->a_data_dev,
2623  pmeComputes[i]->f_data_dev,
2624  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2625  streams[stream]);
2626  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2627  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2628  before = CmiWallTimer();
2629  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2630  cudaMemcpyDeviceToHost, streams[stream]);
2631  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2632  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2633  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2634  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2635  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2636  }
2637  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2638  }
2639  //CmiUnlock(cuda_lock);
2640  } else
2641 #endif // NAMD_CUDA
2642  {
2643  for ( int i=0; i<pmeComputes.size(); ++i ) {
2644  WorkDistrib::messageEnqueueWork(pmeComputes[i]);
2645  // pmeComputes[i]->ungridForces();
2646  }
2647  }
2648  // submitReductions(); // must follow all ungridForces()
2649 
2650 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2651  if ( offload ) {
2652  forces_time = CmiWallTimer();
2653  forces_count = ungridForcesCount;
2654  forces_done_count = 0;
2655  pmeProxy[this_pe].pollForcesReady();
2656  }
2657 #endif
2658 
2659  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2660 }
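// [Editor's sketch, not part of the original source] The offload path batches
// per-compute force extraction by filling a device-visible table of three
// pointers per compute and launching one kernel per EVENT_STRIDE computes.
// A hypothetical mirror of the table built above, for compute i:
#if 0
//   afn[3*i  ] -> that compute's atoms (7 floats each) on the device
//   afn[3*i+1] -> its force output buffer (3*n floats)
//   afn[3*i+2] -> f_dev + n, so the kernel recovers n as a pointer difference
//                 (afn[3*i+2] - afn[3*i+1]) without a separate int array
void fillAfn(float **afn, float *a_data_dev, float *f_dev,
             int cuda_atoms_offset, int n, int i) {
  afn[3*i  ] = a_data_dev + 7 * cuda_atoms_offset;
  afn[3*i+1] = f_dev;
  afn[3*i+2] = f_dev + n;
}
#endif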
2661 
2662 void ComputePmeMgr::pollForcesReady() {
2663 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2664  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2665  CUDA_POLL(cuda_check_pme_forces,this);
2666 #else
2667  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2668 #endif
2669 }
2670 
2671 void ComputePme::atomUpdate() { atomsChanged = 1; }
2672 
2673 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
2674 {
2675  DebugM(4,"ComputePme created.\n");
2676  basePriority = PME_PRIORITY;
2677  setNumPatches(1);
2678 
2679  CProxy_ComputePmeMgr::ckLocalBranch(
2680  CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
2681 
2682  SimParameters *simParams = Node::Object()->simParameters;
2683 
2684  qmForcesOn = simParams->qmForcesOn;
2685  offload = simParams->PMEOffload;
2686 
2687  numGridsMax = numGrids;
2688 
2689  myGrid.K1 = simParams->PMEGridSizeX;
2690  myGrid.K2 = simParams->PMEGridSizeY;
2691  myGrid.K3 = simParams->PMEGridSizeZ;
2692  myGrid.order = simParams->PMEInterpOrder;
2693  myGrid.dim2 = myGrid.K2;
2694  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
2695 
2696 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2697  cuda_atoms_offset = 0;
2698  f_data_host = 0;
2699  f_data_dev = 0;
2700  if ( ! offload )
2701 #endif
2702  {
2703  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
2704  }
2705 
2706  atomsChanged = 0;
2707 
2708  qmLoclIndx = 0;
2709  qmLocalCharges = 0;
2710 }
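// [Editor's sketch, not part of the original source] dim3 = 2*(K3/2+1) pads
// the fastest-varying z dimension so that the K3 reals and the K3/2+1 complex
// outputs of an in-place real-to-complex FFT occupy the same storage:
#if 0
#include <cstdio>

int main() {
  const int Ks[3] = { 48, 49, 64 };
  for ( int t = 0; t < 3; ++t ) {
    int K3 = Ks[t];
    int dim3 = 2 * (K3/2 + 1);   // padded real z extent
    int ncomplex = K3/2 + 1;     // complex outputs of a length-K3 r2c FFT
    printf("K3=%d dim3=%d ncomplex=%d (dim3 floats == 2*ncomplex)\n",
           K3, dim3, ncomplex);
  }
  return 0;
}
#endif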
2711 
2712 void ComputePme::initialize() {
2713  if (!(patch = PatchMap::Object()->patch(patchID))) {
2714  NAMD_bug("ComputePme used with unknown patch.");
2715  }
2716  positionBox = patch->registerPositionPickup(this);
2717  avgPositionBox = patch->registerAvgPositionPickup(this);
2718  forceBox = patch->registerForceDeposit(this);
2719 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2720  if ( offload ) {
2721  myMgr->cuda_atoms_count += patch->getNumAtoms();
2722  }
2723 #endif
2724 }
2725 
2726 void ComputePmeMgr::initialize_computes() {
2727 
2728  noWorkCount = 0;
2729  doWorkCount = 0;
2730  ungridForcesCount = 0;
2731 
2732  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
2733 
2734  SimParameters *simParams = Node::Object()->simParameters;
2735 
2736  strayChargeErrors = 0;
2737 
2738 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2739  PatchMap *patchMap = PatchMap::Object();
2740  int pe = master_pe = CkNodeFirst(CkMyNode());
2741  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2742  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2743  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2744  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2745  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2746  if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
2747  && pe != deviceCUDA->getMasterPe() ) {
2748  master_pe = pe;
2749  }
2750  }
2751  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2752  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2753  }
2754 
2755  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2756  bool cudaFirst = 1;
2757  if ( offload ) {
2758  CmiLock(cuda_lock);
2759  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2760  }
2761 
2762  if ( cudaFirst ) {
2763  nodePmeMgr->master_pe = master_pe;
2764  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2765  }
2766 #endif
2767 
2768  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2769  fsize = myGrid.K1 * myGrid.dim2;
2770  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2771 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2772  if ( ! offload )
2773 #endif
2774  {
2775  q_arr = new float*[fsize*numGrids];
2776  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2777  q_list = new float*[fsize*numGrids];
2778  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2779  q_count = 0;
2780  }
2781 
2782 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2783  if ( cudaFirst || ! offload ) {
2784 #endif
2785  f_arr = new char[fsize*numGrids];
2786  // memset to non-zero value has race condition on BlueGene/Q
2787  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2788  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2789 
2790  for ( int g=0; g<numGrids; ++g ) {
2791  char *f = f_arr + g*fsize;
2792  if ( usePencils ) {
2793  int K1 = myGrid.K1;
2794  int K2 = myGrid.K2;
2795  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2796  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2797  int dim2 = myGrid.dim2;
2798  for (int ap=0; ap<numPencilsActive; ++ap) {
2799  int ib = activePencils[ap].i;
2800  int jb = activePencils[ap].j;
2801  int ibegin = ib*block1;
2802  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2803  int jbegin = jb*block2;
2804  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2805  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2806  for ( int i=ibegin; i<iend; ++i ) {
2807  for ( int j=jbegin; j<jend; ++j ) {
2808  f[i*dim2+j] = 0;
2809  }
2810  }
2811  }
2812  } else {
2813  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2814  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2815  for (int pe=0; pe<numGridPes; pe++) {
2816  if ( ! recipPeDest[pe] ) continue;
2817  int start = pe * bsize;
2818  int len = bsize;
2819  if ( start >= qsize ) { start = 0; len = 0; }
2820  if ( start + len > qsize ) { len = qsize - start; }
2821  int zdim = myGrid.dim3;
2822  int fstart = start / zdim;
2823  int flen = len / zdim;
2824  memset(f + fstart, 0, flen*sizeof(char));
2825  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2826  }
2827  }
2828  }
2829 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2830  }
2831  if ( offload ) {
2832  cudaSetDevice(deviceCUDA->getDeviceID());
2833  if ( cudaFirst ) {
2834 
2835  int f_alloc_count = 0;
2836  for ( int n=fsize, i=0; i<n; ++i ) {
2837  if ( f_arr[i] == 0 ) {
2838  ++f_alloc_count;
2839  }
2840  }
2841  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2842 
2843  q_arr = new float*[fsize*numGrids];
2844  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2845 
2846  float **q_arr_dev_host = new float*[fsize];
2847  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2848 
2849  float **v_arr_dev_host = new float*[fsize];
2850  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2851 
2852  int q_stride = myGrid.K3+myGrid.order-1;
2853  q_data_size = f_alloc_count * q_stride * sizeof(float);
2854  ffz_size = (fsize + q_stride) * sizeof(int);
2855 
2856  // tack ffz onto end of q_data to allow merged transfer
2857  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2858  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2859  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2860  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2861  cudaMalloc((void**) &v_data_dev, q_data_size);
2862  cuda_errcheck("malloc grid data for pme");
2863  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2864  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2865  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2866  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2867  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2868 
2869  f_alloc_count = 0;
2870  for ( int n=fsize, i=0; i<n; ++i ) {
2871  if ( f_arr[i] == 0 ) {
2872  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2873  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2874  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2875  ++f_alloc_count;
2876  } else {
2877  q_arr[i] = 0;
2878  q_arr_dev_host[i] = 0;
2879  v_arr_dev_host[i] = 0;
2880  }
2881  }
2882 
2883  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2884  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2885  delete [] q_arr_dev_host;
2886  delete [] v_arr_dev_host;
2887  delete [] f_arr;
2888  f_arr = new char[fsize + q_stride];
2889  fz_arr = f_arr + fsize;
2890  memset(f_arr, 0, fsize + q_stride);
2891  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2892 
2893  cuda_errcheck("initialize grid data for pme");
2894 
2895  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2896  cuda_errcheck("initialize bspline coefficients for pme");
2897 
2898 #define XCOPY(X) masterPmeMgr->X = X;
2899  XCOPY(bspline_coeffs_dev)
2900  XCOPY(bspline_dcoeffs_dev)
2901  XCOPY(q_arr)
2902  XCOPY(q_arr_dev)
2903  XCOPY(v_arr_dev)
2904  XCOPY(q_data_size)
2905  XCOPY(q_data_host)
2906  XCOPY(q_data_dev)
2907  XCOPY(v_data_dev)
2908  XCOPY(ffz_size)
2909  XCOPY(ffz_host)
2910  XCOPY(ffz_dev)
2911  XCOPY(f_arr)
2912  XCOPY(fz_arr)
2913 #undef XCOPY
2914  //CkPrintf("pe %d init first\n", CkMyPe());
2915  } else { // cudaFirst
2916  //CkPrintf("pe %d init later\n", CkMyPe());
2917 #define XCOPY(X) X = masterPmeMgr->X;
2918  XCOPY(bspline_coeffs_dev)
2919  XCOPY(bspline_dcoeffs_dev)
2920  XCOPY(q_arr)
2921  XCOPY(q_arr_dev)
2922  XCOPY(v_arr_dev)
2923  XCOPY(q_data_size)
2924  XCOPY(q_data_host)
2925  XCOPY(q_data_dev)
2926  XCOPY(v_data_dev)
2927  XCOPY(ffz_size)
2928  XCOPY(ffz_host)
2929  XCOPY(ffz_dev)
2930  XCOPY(f_arr)
2931  XCOPY(fz_arr)
2932 #undef XCOPY
2933  } // cudaFirst
2934  CmiUnlock(cuda_lock);
2935  } else // offload
2936 #endif // NAMD_CUDA
2937  {
2938  fz_arr = new char[myGrid.K3+myGrid.order-1];
2939  }
2940 
2941 #if 0 && USE_PERSISTENT
2942  recvGrid_handle = NULL;
2943 #endif
2944 }
2945 
2946 ComputePme::~ComputePme()
2947 {
2948 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2949  if ( ! offload )
2950 #endif
2951  {
2952  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
2953  }
2954 }
2955 
2956 #if 0 && USE_PERSISTENT
2957 void ComputePmeMgr::setup_recvgrid_persistent()
2958 {
2959  int K1 = myGrid.K1;
2960  int K2 = myGrid.K2;
2961  int dim2 = myGrid.dim2;
2962  int dim3 = myGrid.dim3;
2963  int block1 = myGrid.block1;
2964  int block2 = myGrid.block2;
2965 
2966  CkArray *zPencil_local = zPencil.ckLocalBranch();
2967  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
2968  for (int ap=0; ap<numPencilsActive; ++ap) {
2969  int ib = activePencils[ap].i;
2970  int jb = activePencils[ap].j;
2971  int ibegin = ib*block1;
2972  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2973  int jbegin = jb*block2;
2974  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2975  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2976  // f is changing
2977  int fcount = 0;
2978  for ( int g=0; g<numGrids; ++g ) {
2979  char *f = f_arr + g*fsize;
2980  for ( int i=ibegin; i<iend; ++i ) {
2981  for ( int j=jbegin; j<jend; ++j ) {
2982  fcount += f[i*dim2+j];
2983  }
2984  }
2985  }
2986  int zlistlen = 0;
2987  for ( int i=0; i<myGrid.K3; ++i ) {
2988  if ( fz_arr[i] ) ++zlistlen;
2989  }
2990  int hd = ( fcount? 1 : 0 ); // has data?
2991  int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
2992  int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
2993  int compress_size = sizeof(float)*hd*fcount*zlistlen;
2994  int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
2995  recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
2996  }
2997 }
2998 #endif
2999 
3000 int ComputePme::noWork() {
3001 
3002  if ( patch->flags.doFullElectrostatics ) {
3003  // In QM/MM simulations, atom charges from QM regions need special treatment.
3004  if ( qmForcesOn ) {
3005  return 1;
3006  }
3007  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0; // work to do, enqueue as usual
3008  myMgr->heldComputes.add(this);
3009  return 1; // don't enqueue yet
3010  }
3011 
3012  positionBox->skip();
3013  forceBox->skip();
3014 
3015  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
3016  myMgr->noWorkCount = 0;
3017  myMgr->reduction->submit();
3018  }
3019 
3020  atomsChanged = 0;
3021 
3022  return 1; // no work for this step
3023 }
3024 
3025 void ComputePmeMgr::addRecipEvirClient(void) {
3026  ++recipEvirClients;
3027 }
3028 
3029 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
3030  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3031  for ( int g=0; g<numGrids; ++g ) {
3032  evir[g] += msg->evir[g];
3033  }
3034  delete msg;
3035  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3036  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3037 }
3038 
3039 void ComputePme::doQMWork() {
3040 
3041 // iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
3042 
3043 
3044  int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
3045  const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
3046  const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
3047  const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
3048 
3049  const CompAtomExt *xExt = patch->getCompAtomExtInfo();
3050 
3051  // Determine number of qm atoms in this patch for the current step.
3052  numLocalQMAtoms = 0;
3053  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3054  if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
3055  numLocalQMAtoms++;
3056  }
3057  }
3058 
3059  // We prepare a charge vector with QM charges for use in the PME calculation.
3060 
3061  // Clears data from last step, if there is any.
3062  if (qmLoclIndx != 0)
3063  delete [] qmLoclIndx;
3064  if (qmLocalCharges != 0)
3065  delete [] qmLocalCharges;
3066 
3067  qmLoclIndx = new int[numLocalQMAtoms] ;
3068  qmLocalCharges = new Real[numLocalQMAtoms] ;
3069 
3070  // I am assuming there will be (in general) more QM atoms among all QM groups
3071  // than MM atoms in a patch.
3072  int procAtms = 0;
3073 
3074  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3075 
3076  for (int i=0; i<numQMAtms; i++) {
3077 
3078  if (qmAtmIndx[i] == xExt[paIter].id) {
3079 
3080  qmLoclIndx[procAtms] = paIter ;
3081  qmLocalCharges[procAtms] = qmAtmChrg[i];
3082 
3083  procAtms++;
3084  break;
3085  }
3086 
3087  }
3088 
3089  if (procAtms == numLocalQMAtoms)
3090  break;
3091  }
3092 
3093  doWork();
3094  return ;
3095 }
3096 
3097 void ComputePme::doWork()
3098 {
3099  DebugM(4,"Entering ComputePme::doWork().\n");
3100 
3101  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
3102 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3103  basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
3104 #else
3105  basePriority = PME_PRIORITY;
3106 #endif
3107  ungridForces();
3108  // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3109  if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
3110  return;
3111  }
3112  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
3113  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3114 
3115 #ifdef TRACE_COMPUTE_OBJECTS
3116  double traceObjStartTime = CmiWallTimer();
3117 #endif
3118 
3119 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3120  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
3121 #endif
3122 
3123  // allocate storage
3124  numLocalAtoms = patch->getNumAtoms();
3125 
3126  Lattice &lattice = patch->flags.lattice;
3127 
3128  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
3129  localData = localData_alloc.begin();
3130  localPartition_alloc.resize(numLocalAtoms);
3131  localPartition = localPartition_alloc.begin();
3132 
3133  int g;
3134  for ( g=0; g<numGrids; ++g ) {
3135  localGridData[g] = localData + numLocalAtoms*(g+1);
3136  }
3137 
3138  // get positions and charges
3139  PmeParticle * data_ptr = localData;
3140  unsigned char * part_ptr = localPartition;
3141  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
3142  * ComputeNonbondedUtil::dielectric_1 );
3143 
3144  {
3145  CompAtom *x = positionBox->open();
3146  // CompAtomExt *xExt = patch->getCompAtomExtInfo();
3147  if ( patch->flags.doMolly ) {
3148  positionBox->close(&x);
3149  x = avgPositionBox->open();
3150  }
3151  int numAtoms = patch->getNumAtoms();
3152 
3153  for(int i=0; i<numAtoms; ++i)
3154  {
3155  data_ptr->x = x[i].position.x;
3156  data_ptr->y = x[i].position.y;
3157  data_ptr->z = x[i].position.z;
3158  data_ptr->cg = coulomb_sqrt * x[i].charge;
3159  ++data_ptr;
3160  *part_ptr = x[i].partition;
3161  ++part_ptr;
3162  }
3163 
3164  // QM loop to overwrite charges of QM atoms.
3165  // They are zero for NAMD, but are updated in ComputeQM.
3166  if ( qmForcesOn ) {
3167 
3168  for(int i=0; i<numLocalQMAtoms; ++i)
3169  {
3170  localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
3171  }
3172 
3173  }
3174 
3175  if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
3176  else { positionBox->close(&x); }
3177  }
3178 
3179  // copy to other grids if needed
3180  if ( (alchOn && (!alchDecouple)) || lesOn ) {
3181  for ( g=0; g<numGrids; ++g ) {
3182  PmeParticle *lgd = localGridData[g];
3183  if (g < 2) {
3184  int nga = 0;
3185  for(int i=0; i<numLocalAtoms; ++i) {
3186  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
3187  // for FEP/TI: grid 0 gets non-alch + partition 1 + partition 3;
3188  // grid 1 gets non-alch + partition 2 + partition 4;
3189  lgd[nga++] = localData[i];
3190  }
3191  }
3192  numGridAtoms[g] = nga;
3193  } else {
3194  int nga = 0;
3195  for(int i=0; i<numLocalAtoms; ++i) {
3196  if ( localPartition[i] == 0 ) {
3197  // grid 2 (only if called for with numGrids=3) gets only non-alch
3198  lgd[nga++] = localData[i];
3199  }
3200  }
3201  numGridAtoms[g] = nga;
3202  }
3203  }
3204  } else if ( alchOn && alchDecouple) {
3205  // alchemical decoupling: four grids
3206  // g=0: partition 0 and partition 1
3207  // g=1: partition 0 and partition 2
3208  // g=2: only partition 1 atoms
3209  // g=3: only partition 2 atoms
3210  // plus one grid g=4, only partition 0, if numGrids=5
3211  for ( g=0; g<2; ++g ) { // same as before for first 2
3212  PmeParticle *lgd = localGridData[g];
3213  int nga = 0;
3214  for(int i=0; i<numLocalAtoms; ++i) {
3215  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
3216  lgd[nga++] = localData[i];
3217  }
3218  }
3219  numGridAtoms[g] = nga;
3220  }
3221  for (g=2 ; g<4 ; ++g ) { // only alchemical atoms for these 2
3222  PmeParticle *lgd = localGridData[g];
3223  int nga = 0;
3224  for(int i=0; i<numLocalAtoms; ++i) {
3225  if ( localPartition[i] == (g-1) ) {
3226  lgd[nga++] = localData[i];
3227  }
3228  }
3229  numGridAtoms[g] = nga;
3230  }
3231  for (g=4 ; g<numGrids ; ++g ) { // only non-alchemical atoms
3232  // numGrids=5 only if alchElecLambdaStart > 0
3233  PmeParticle *lgd = localGridData[g];
3234  int nga = 0;
3235  for(int i=0; i<numLocalAtoms; ++i) {
3236  if ( localPartition[i] == 0 ) {
3237  lgd[nga++] = localData[i];
3238  }
3239  }
3240  numGridAtoms[g] = nga;
3241  }
3242  } else if ( selfOn ) {
3243  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
3244  g = 0;
3245  PmeParticle *lgd = localGridData[g];
3246  int nga = 0;
3247  for(int i=0; i<numLocalAtoms; ++i) {
3248  if ( localPartition[i] == 1 ) {
3249  lgd[nga++] = localData[i];
3250  }
3251  }
3252  numGridAtoms[g] = nga;
3253  } else if ( pairOn ) {
3254  if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
3255  g = 0;
3256  PmeParticle *lgd = localGridData[g];
3257  int nga = 0;
3258  for(int i=0; i<numLocalAtoms; ++i) {
3259  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
3260  lgd[nga++] = localData[i];
3261  }
3262  }
3263  numGridAtoms[g] = nga;
3264  for ( g=1; g<3; ++g ) {
3265  PmeParticle *lgd = localGridData[g];
3266  int nga = 0;
3267  for(int i=0; i<numLocalAtoms; ++i) {
3268  if ( localPartition[i] == g ) {
3269  lgd[nga++] = localData[i];
3270  }
3271  }
3272  numGridAtoms[g] = nga;
3273  }
3274  } else {
3275  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
3276  localGridData[0] = localData;
3277  numGridAtoms[0] = numLocalAtoms;
3278  }
3279 
3280  if ( ! myMgr->doWorkCount ) {
3281  myMgr->doWorkCount = myMgr->pmeComputes.size();
3282 
3283 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3284  if ( ! offload )
3285 #endif // NAMD_CUDA
3286  {
3287  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
3288 
3289  for (int i=0; i<myMgr->q_count; ++i) {
3290  memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
3291  }
3292  }
3293 
3294  for ( g=0; g<numGrids; ++g ) {
3295  myMgr->evir[g] = 0;
3296  }
3297 
3298  myMgr->strayChargeErrors = 0;
3299 
3300  myMgr->compute_sequence = sequence();
3301  }
3302 
3303  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
3304 
3305  int strayChargeErrors = 0;
3306 
3307  // calculate self energy
3308  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
3309  for ( g=0; g<numGrids; ++g ) {
3310  BigReal selfEnergy = 0;
3311  data_ptr = localGridData[g];
3312  int i;
3313  for(i=0; i<numGridAtoms[g]; ++i)
3314  {
3315  selfEnergy += data_ptr->cg * data_ptr->cg;
3316  ++data_ptr;
3317  }
3318  selfEnergy *= -1. * ewaldcof / SQRT_PI;
3319  myMgr->evir[g][0] += selfEnergy;
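 // [Editor's note, not part of the original source] This stanza applies the
 // standard Ewald self-interaction correction,
 //   E_self = -(beta / sqrt(pi)) * sum_i q_i^2,  beta = ewaldcof,
 // which removes each charge's interaction with its own Gaussian screening
 // cloud; data_ptr->cg already carries the coulomb_sqrt factor, so after the
 // multiply above the accumulated sum is directly in energy units.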
3320 
3321  float **q = myMgr->q_arr + g*myMgr->fsize;
3322  char *f = myMgr->f_arr + g*myMgr->fsize;
3323 
3324  scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
3325 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3326  if ( offload ) {
3327  if ( myMgr->cuda_atoms_alloc == 0 ) { // first call
3328  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3329  cuda_errcheck("before malloc atom data for pme");
3330  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3331  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3332  cuda_errcheck("malloc atom data for pme");
3333  myMgr->cuda_atoms_count = 0;
3334  }
3335  cuda_atoms_offset = myMgr->cuda_atoms_count;
3336  int n = numGridAtoms[g];
3337  myMgr->cuda_atoms_count += n;
3338  if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
3339  CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
3340  CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
3341  cuda_errcheck("before malloc expanded atom data for pme");
3342  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3343  const float *a_data_host_old = myMgr->a_data_host;
3344  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3345  cuda_errcheck("malloc expanded host atom data for pme");
3346  memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
3347  cudaFreeHost((void*) a_data_host_old);
3348  cuda_errcheck("free expanded host atom data for pme");
3349  cudaFree(myMgr->a_data_dev);
3350  cuda_errcheck("free expanded dev atom data for pme");
3351  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3352  cuda_errcheck("malloc expanded dev atom data for pme");
3353  }
3354  float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
3355  data_ptr = localGridData[g];
3356  double order_1 = myGrid.order - 1;
3357  double K1 = myGrid.K1;
3358  double K2 = myGrid.K2;
3359  double K3 = myGrid.K3;
3360  int found_negative = 0;
3361  for ( int i=0; i<n; ++i ) {
3362  if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
3363  found_negative = 1;
3364  // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
3365  }
3366  double x_int = (int) data_ptr[i].x;
3367  double y_int = (int) data_ptr[i].y;
3368  double z_int = (int) data_ptr[i].z;
3369  a_data_host[7*i ] = data_ptr[i].x - x_int; // subtract in double precision
3370  a_data_host[7*i+1] = data_ptr[i].y - y_int;
3371  a_data_host[7*i+2] = data_ptr[i].z - z_int;
3372  a_data_host[7*i+3] = data_ptr[i].cg;
3373  x_int -= order_1; if ( x_int < 0 ) x_int += K1;
3374  y_int -= order_1; if ( y_int < 0 ) y_int += K2;
3375  z_int -= order_1; if ( z_int < 0 ) z_int += K3;
3376  a_data_host[7*i+4] = x_int;
3377  a_data_host[7*i+5] = y_int;
3378  a_data_host[7*i+6] = z_int;
3379  }
3380  if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
3381  } else
3382 #endif // NAMD_CUDA
3383  {
3384  myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
3385  myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
3386  }
3387  }
3388  myMgr->strayChargeErrors += strayChargeErrors;
3389 
3390 #ifdef TRACE_COMPUTE_OBJECTS
3391  traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
3392 #endif
3393 
3394  if ( --(myMgr->doWorkCount) == 0 ) {
3395 // cudaDeviceSynchronize(); // XXXX
3396 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3397  if ( offload ) {
3398  cuda_submit_charges_args args;
3399  args.mgr = myMgr;
3400  args.lattice = &lattice;
3401  args.sequence = sequence();
3402  CmiLock(ComputePmeMgr::cuda_lock);
3403  if ( ComputePmeMgr::cuda_busy ) {
3404  ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
3405  } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
3406  // avoid adding work to nonbonded data preparation pe
3407  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3408  } else {
3409  ComputePmeMgr::cuda_busy = true;
3410  while ( 1 ) {
3411  CmiUnlock(ComputePmeMgr::cuda_lock);
3412  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3413  CmiLock(ComputePmeMgr::cuda_lock);
3414  if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
3415  args = ComputePmeMgr::cuda_submit_charges_deque.front();
3416  ComputePmeMgr::cuda_submit_charges_deque.pop_front();
3417  } else {
3418  ComputePmeMgr::cuda_busy = false;
3419  break;
3420  }
3421  }
3422  }
3423  CmiUnlock(ComputePmeMgr::cuda_lock);
3424  } else
3425 #endif // NAMD_CUDA
3426  {
3427  myMgr->chargeGridReady(lattice,sequence());
3428  }
3429  }
3430  atomsChanged = 0;
3431 }
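// [Editor's sketch, not part of the original source] The offload path above
// encodes each atom as 7 floats: the fractional offset within its grid cell
// (taken by subtracting the truncated integer part in double precision to
// limit rounding error), the sqrt-scaled charge, and the periodically wrapped
// starting cell index. A hypothetical stand-alone encoder:
#if 0
void encodeAtom(float *a, double x, double y, double z, double cg,
                int order, int K1, int K2, int K3) {
  double xi = (int) x, yi = (int) y, zi = (int) z;  // assumes coords >= 0
  a[0] = x - xi;  a[1] = y - yi;  a[2] = z - zi;    // fractional parts
  a[3] = cg;                                        // sqrt-scaled charge
  xi -= order - 1; if ( xi < 0 ) xi += K1;          // first cell touched,
  yi -= order - 1; if ( yi < 0 ) yi += K2;          // wrapped periodically
  zi -= order - 1; if ( zi < 0 ) zi += K3;
  a[4] = xi;  a[5] = yi;  a[6] = zi;
}
#endif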
3432 
3433 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3434 
3435 void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
3436 
3437  int n = cuda_atoms_count;
3438  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3439  cuda_atoms_count = 0;
3440 
3441  const double before = CmiWallTimer();
3442  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3443  cudaMemcpyHostToDevice, streams[stream]);
3444  const double after = CmiWallTimer();
3445 
3446  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3447 
3448  cuda_pme_charges(
3449  bspline_coeffs_dev,
3450  q_arr_dev, ffz_dev, ffz_dev + fsize,
3451  a_data_dev, n,
3452  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3453  streams[stream]);
3454  const double after2 = CmiWallTimer();
3455 
3456  chargeGridSubmitted(lattice,sequence); // must be inside lock
3457 
3458  masterPmeMgr->charges_time = before;
3459  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3460  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3461 }
3462 
3463 void cuda_check_pme_charges(void *arg, double walltime) {
3464  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
3465 
3466  cudaError_t err = cudaEventQuery(argp->end_charges);
3467  if ( err == cudaSuccess ) {
3468  traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
3469  argp->charges_time = walltime - argp->charges_time;
3470  argp->sendChargeGridReady();
3471  argp->check_charges_count = 0;
3472  } else if ( err != cudaErrorNotReady ) {
3473  char errmsg[256];
3474  sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
3475  argp->check_charges_count, walltime - argp->charges_time,
3476  argp->saved_sequence);
3477  cudaDie(errmsg,err);
3478  } else if ( ++(argp->check_charges_count) >= count_limit ) {
3479  char errmsg[256];
3480  sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
3481  argp->check_charges_count, walltime - argp->charges_time,
3482  argp->saved_sequence);
3483  cudaDie(errmsg,err);
3484  } else {
3485  CcdCallBacksReset(0,walltime); // fix Charm++
3487  }
3488 }
3489 
3490 void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
3491  saved_lattice = &lattice;
3492  saved_sequence = sequence;
3493 
3494  // cudaDeviceSynchronize(); // XXXX TESTING
3495  //int q_stride = myGrid.K3+myGrid.order-1;
3496  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3497  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3498  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3499  // }
3500  //}
3501  //CmiLock(cuda_lock);
3502 
3503  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3504  double before = CmiWallTimer();
3505  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3506  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3507  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3508  cudaMemcpyDeviceToHost, streams[stream]);
3509  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3510  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3511  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3512  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3513  //CmiUnlock(cuda_lock);
3514  // cudaDeviceSynchronize(); // XXXX TESTING
3515  // cuda_errcheck("after memcpy grid to host");
3516 
3517  SimParameters *simParams = Node::Object()->simParameters;
3518  if ( ! simParams->useCUDA2 ) {
3519  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
3520  cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
3521  }
3522 
3523  pmeProxy[master_pe].pollChargeGridReady();
3524  }
3525 }
3526 
3527 void ComputePmeMgr::sendChargeGridReady() {
3528  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3529  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3530  int cs = mgr->pmeComputes.size();
3531  if ( cs ) {
3532  mgr->ungridForcesCount = cs;
3533  mgr->recipEvirCount = mgr->recipEvirClients;
3534  masterPmeMgr->chargeGridSubmittedCount++;
3535  }
3536  }
3537  pmeProxy[master_pe].recvChargeGridReady();
3538 }
3539 #endif // NAMD_CUDA
3540 
3541 void ComputePmeMgr::pollChargeGridReady() {
3542 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3543  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3544  CUDA_POLL(cuda_check_pme_charges,this);
3545 #else
3546  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3547 #endif
3548 }
3549 
3550 void ComputePmeMgr::recvChargeGridReady() {
3551  chargeGridReady(*saved_lattice,saved_sequence);
3552 }
3553 
3554 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
3555 
3556 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3557  if ( offload ) {
3558  int errcount = 0;
3559  int q_stride = myGrid.K3+myGrid.order-1;
3560  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3561  f_arr[j] = ffz_host[j];
3562  if ( ffz_host[j] & ~1 ) ++errcount;
3563  }
3564  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3565  }
3566 #endif
3567  recipEvirCount = recipEvirClients;
3568  ungridForcesCount = pmeComputes.size();
3569 
3570  for (int j=0; j<myGrid.order-1; ++j) {
3571  fz_arr[j] |= fz_arr[myGrid.K3+j];
3572  }
3573 
3574  if ( usePencils ) {
3575  sendPencils(lattice,sequence);
3576  } else {
3577  sendData(lattice,sequence);
3578  }
3579 }
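// [Editor's sketch, not part of the original source] Charge spreading writes
// order-1 ghost planes past the top of the z dimension; before the grid is
// shipped, those planes are folded back onto z = 0..order-2 to enforce
// periodicity, exactly as the fz_arr OR-fold above does for the occupancy
// flags (the charge fold itself happens during message packing below):
#if 0
void foldZGhosts(float *col, char *fz, int K3, int order) {
  for ( int j = 0; j < order-1; ++j ) {
    col[j] += col[K3 + j];   // wrap the charge ghost plane onto the real grid
    fz[j] |= fz[K3 + j];     // keep the occupancy flags consistent
  }
}
#endif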
3580 
3581 
3582 void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {
3583 
3584  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3585 
3586 #if 0 && USE_PERSISTENT
3587  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3588 #endif
3589  int K1 = myGrid.K1;
3590  int K2 = myGrid.K2;
3591  int dim2 = myGrid.dim2;
3592  int dim3 = myGrid.dim3;
3593  int block1 = myGrid.block1;
3594  int block2 = myGrid.block2;
3595 
3596  // int savedMessages = 0;
3597  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3598 
3599  for (int ap=first; ap<=last; ++ap) {
3600  int ib = activePencils[ap].i;
3601  int jb = activePencils[ap].j;
3602  int ibegin = ib*block1;
3603  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3604  int jbegin = jb*block2;
3605  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3606  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3607 
3608  int fcount = 0;
3609  for ( int g=0; g<numGrids; ++g ) {
3610  char *f = f_arr + g*fsize;
3611 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3612  if ( offload ) {
3613  int errcount = 0;
3614  for ( int i=ibegin; i<iend; ++i ) {
3615  for ( int j=jbegin; j<jend; ++j ) {
3616  int k = i*dim2+j;
3617  f[k] = ffz_host[k];
3618  fcount += f[k];
3619  if ( ffz_host[k] & ~1 ) ++errcount;
3620  }
3621  }
3622  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3623  } else
3624 #endif
3625  for ( int i=ibegin; i<iend; ++i ) {
3626  for ( int j=jbegin; j<jend; ++j ) {
3627  fcount += f[i*dim2+j];
3628  }
3629  }
3630  }
3631 
3632 #ifdef NETWORK_PROGRESS
3633  CmiNetworkProgress();
3634 #endif
3635 
3636  if ( ! pencilActive[ib*yBlocks+jb] )
3637  NAMD_bug("PME activePencils list inconsistent");
3638 
3639  int zlistlen = 0;
3640  for ( int i=0; i<myGrid.K3; ++i ) {
3641  if ( fz_arr[i] ) ++zlistlen;
3642  }
3643 
3644  int hd = ( fcount? 1 : 0 ); // has data?
3645  // if ( ! hd ) ++savedMessages;
3646 
3647 
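// The placement new below uses Charm++ varsize-message allocation: the three counts
// size the msg->zlist, msg->fgrid, and msg->qgrid arrays. Scaling them by hd means a
// pencil with no local charges still receives a (payload-free) message, which keeps
// the per-step message count seen by each receiving pencil deterministic.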
3648  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3649  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3650  msg->sourceNode = sourcepe;
3651  msg->hasData = hd;
3652  msg->lattice = lattice;
3653  if ( hd ) {
3654 #if 0
3655  msg->start = fstart;
3656  msg->len = flen;
3657 #else
3658  msg->start = -1; // obsolete?
3659  msg->len = -1; // obsolete?
3660 #endif
3661  msg->zlistlen = zlistlen;
3662  int *zlist = msg->zlist;
3663  zlistlen = 0;
3664  for ( int i=0; i<myGrid.K3; ++i ) {
3665  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3666  }
3667  char *fmsg = msg->fgrid;
3668  float *qmsg = msg->qgrid;
3669  for ( int g=0; g<numGrids; ++g ) {
3670  char *f = f_arr + g*fsize;
3671  float **q = q_arr + g*fsize;
3672  for ( int i=ibegin; i<iend; ++i ) {
3673  for ( int j=jbegin; j<jend; ++j ) {
3674  *(fmsg++) = f[i*dim2+j];
3675  if( f[i*dim2+j] ) {
3676  for (int h=0; h<myGrid.order-1; ++h) {
3677  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3678  }
3679  for ( int k=0; k<zlistlen; ++k ) {
3680  *(qmsg++) = q[i*dim2+j][zlist[k]];
3681  }
3682  }
3683  }
3684  }
3685  }
3686  }
3687 
3688  msg->sequence = compute_sequence;
3689  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3690  CmiEnableUrgentSend(1);
3691 #if USE_NODE_PAR_RECEIVE
3692  msg->destElem=CkArrayIndex3D(ib,jb,0);
3693  CProxy_PmePencilMap lzm = npMgr->zm;
3694  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3695  int destnode = CmiNodeOf(destproc);
3696 
3697 #if 0
3698  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3699 #endif
3700  pmeNodeProxy[destnode].recvZGrid(msg);
3701 #if 0
3702  CmiUsePersistentHandle(NULL, 0);
3703 #endif
3704 #else
3705 #if 0
3706  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3707 #endif
3708  zPencil(ib,jb,0).recvGrid(msg);
3709 #if 0
3710  CmiUsePersistentHandle(NULL, 0);
3711 #endif
3712 #endif
3713  CmiEnableUrgentSend(0);
3714  }
3715 
3716 
3717  // if ( savedMessages ) {
3718  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3719  // }
3720 
3721 }
3722 
3723 
3724 void ComputePmeMgr::sendPencilsHelper(int iter) {
3725  nodePmeMgr->sendPencilsHelper(iter);
3726 }
3727 
3728 void NodePmeMgr::sendPencilsHelper(int iter) {
3729 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3730  ComputePmeMgr *obj = masterPmeMgr;
3731  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
3732 #else
3733  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
3734 #endif
3735 }
3736 
3737 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
3738 
3739  sendDataHelper_lattice = &lattice;
3740  sendDataHelper_sequence = sequence;
3741  sendDataHelper_sourcepe = CkMyPe();
3742 
3743 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3744  if ( offload ) {
3745  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3746 #if CMK_MULTICORE
3747  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3748  int ib = activePencils[ap].i;
3749  int jb = activePencils[ap].j;
3750  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3751  pmeProxy[destproc].sendPencilsHelper(ap);
3752 #else
3753  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3754 #endif
3755  }
3756  } else
3757 #endif
3758  {
3759  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3760  }
3761 
3762  if ( strayChargeErrors ) {
3763  strayChargeErrors = 0;
3764  iout << iERROR << "Stray PME grid charges detected: "
3765  << CkMyPe() << " sending to (x,y)";
3766  int K1 = myGrid.K1;
3767  int K2 = myGrid.K2;
3768  int dim2 = myGrid.dim2;
3769  int block1 = myGrid.block1;
3770  int block2 = myGrid.block2;
3771  for (int ib=0; ib<xBlocks; ++ib) {
3772  for (int jb=0; jb<yBlocks; ++jb) {
3773  int ibegin = ib*block1;
3774  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3775  int jbegin = jb*block2;
3776  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3777  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3778 
3779  for ( int g=0; g<numGrids; ++g ) {
3780  char *f = f_arr + g*fsize;
3781  if ( ! pencilActive[ib*yBlocks+jb] ) {
3782  for ( int i=ibegin; i<iend; ++i ) {
3783  for ( int j=jbegin; j<jend; ++j ) {
3784  if ( f[i*dim2+j] == 3 ) {
3785  f[i*dim2+j] = 2;
3786  iout << " (" << i << "," << j << ")";
3787  }
3788  }
3789  }
3790  }
3791  }
3792  }
3793  }
3794  iout << "\n" << endi;
3795  }
3796 
3797 }
3798 
3799 
3800 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
3801 
3802  int K1 = myGrid.K1;
3803  int K2 = myGrid.K2;
3804  int dim2 = myGrid.dim2;
3805  int dim3 = myGrid.dim3;
3806  int block1 = myGrid.block1;
3807  int block2 = myGrid.block2;
3808 
3809  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3810  int ib = msg->sourceNode / yBlocks;
3811  int jb = msg->sourceNode % yBlocks;
3812 
3813  int ibegin = ib*block1;
3814  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3815  int jbegin = jb*block2;
3816  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3817 
3818  int zlistlen = msg->zlistlen;
3819  int *zlist = msg->zlist;
3820  float *qmsg = msg->qgrid;
3821  int g;
3822  for ( g=0; g<numGrids; ++g ) {
3823  char *f = f_arr + g*fsize;
3824  float **q = q_arr + g*fsize;
3825  for ( int i=ibegin; i<iend; ++i ) {
3826  for ( int j=jbegin; j<jend; ++j ) {
3827  if( f[i*dim2+j] ) {
3828  f[i*dim2+j] = 0;
3829  for ( int k=0; k<zlistlen; ++k ) {
3830  q[i*dim2+j][zlist[k]] = *(qmsg++);
3831  }
3832  for (int h=0; h<myGrid.order-1; ++h) {
3833  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3834  }
3835  }
3836  }
3837  }
3838  }
3839 }
3840 
3841 
3842 void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {
3843 
3844  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3845 
3846  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3847 
3848  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3849  for (int j=first; j<=last; j++) {
3850  int pe = gridPeOrder[j]; // different order
3851  if ( ! recipPeDest[pe] && ! errors ) continue;
3852  int start = pe * bsize;
3853  int len = bsize;
3854  if ( start >= qsize ) { start = 0; len = 0; }
3855  if ( start + len > qsize ) { len = qsize - start; }
3856  int zdim = myGrid.dim3;
3857  int fstart = start / zdim;
3858  int flen = len / zdim;
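// start/len select this grid PE's slab in floats; dividing by zdim (the padded z
// dimension, myGrid.dim3) converts to whole (x,y) columns, which is the granularity
// of the f_arr occupancy flags counted below.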
3859  int fcount = 0;
3860  int i;
3861 
3862  int g;
3863  for ( g=0; g<numGrids; ++g ) {
3864  char *f = f_arr + fstart + g*fsize;
3865 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3866  if ( offload ) {
3867  int errcount = 0;
3868  for ( i=0; i<flen; ++i ) {
3869  f[i] = ffz_host[fstart+i];
3870  fcount += f[i];
3871  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3872  }
3873  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3874  } else
3875 #endif
3876  for ( i=0; i<flen; ++i ) {
3877  fcount += f[i];
3878  }
3879  if ( ! recipPeDest[pe] ) {
3880  int errfound = 0;
3881  for ( i=0; i<flen; ++i ) {
3882  if ( f[i] == 3 ) {
3883  errfound = 1;
3884  break;
3885  }
3886  }
3887  if ( errfound ) {
3888  iout << iERROR << "Stray PME grid charges detected: "
3889  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3890  int iz = -1;
3891  for ( i=0; i<flen; ++i ) {
3892  if ( f[i] == 3 ) {
3893  f[i] = 2;
3894  int jz = (i+fstart)/myGrid.K2;
3895  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3896  }
3897  }
3898  iout << "\n" << endi;
3899  }
3900  }
3901  }
3902 
3903 #ifdef NETWORK_PROGRESS
3904  CmiNetworkProgress();
3905 #endif
3906 
3907  if ( ! recipPeDest[pe] ) continue;
3908 
3909  int zlistlen = 0;
3910  for ( i=0; i<myGrid.K3; ++i ) {
3911  if ( fz_arr[i] ) ++zlistlen;
3912  }
3913 
3914  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3915  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3916 
3917  msg->sourceNode = sourcepe;
3918  msg->lattice = lattice;
3919  msg->start = fstart;
3920  msg->len = flen;
3921  msg->zlistlen = zlistlen;
3922  int *zlist = msg->zlist;
3923  zlistlen = 0;
3924  for ( i=0; i<myGrid.K3; ++i ) {
3925  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3926  }
3927  float *qmsg = msg->qgrid;
3928  for ( g=0; g<numGrids; ++g ) {
3929  char *f = f_arr + fstart + g*fsize;
3930  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3931  float **q = q_arr + fstart + g*fsize;
3932  for ( i=0; i<flen; ++i ) {
3933  if ( f[i] ) {
3934  for (int h=0; h<myGrid.order-1; ++h) {
3935  q[i][h] += q[i][myGrid.K3+h];
3936  }
3937  for ( int k=0; k<zlistlen; ++k ) {
3938  *(qmsg++) = q[i][zlist[k]];
3939  }
3940  }
3941  }
3942  }
3943 
3944  msg->sequence = compute_sequence;
3945  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3946  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3947  }
3948 
3949 }
3950 
3951 void ComputePmeMgr::sendDataHelper(int iter) {
3952  nodePmeMgr->sendDataHelper(iter);
3953 }
3954 
3955 void NodePmeMgr::sendDataHelper(int iter) {
3956 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3957  ComputePmeMgr *obj = masterPmeMgr;
3958  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
3959 #else
3960  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
3961 #endif
3962 }
3963 
3964 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
3965 
3966  sendDataHelper_lattice = &lattice;
3967  sendDataHelper_sequence = sequence;
3968  sendDataHelper_sourcepe = CkMyPe();
3969  sendDataHelper_errors = strayChargeErrors;
3970  strayChargeErrors = 0;
3971 
3972 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3973  if ( offload ) {
3974  for ( int i=0; i < numGridPes; ++i ) {
3975  int pe = gridPeOrder[i]; // different order
3976  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
3977 #if CMK_MULTICORE
3978  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3979  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
3980 #else
3981  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
3982 #endif
3983  }
3984  } else
3985 #endif
3986  {
3987  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
3988  }
3989 
3990 }
3991 
3992 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
3993 
3994  int zdim = myGrid.dim3;
3995  int flen = msg->len;
3996  int fstart = msg->start;
3997  int zlistlen = msg->zlistlen;
3998  int *zlist = msg->zlist;
3999  float *qmsg = msg->qgrid;
4000  int g;
4001  for ( g=0; g<numGrids; ++g ) {
4002  char *f = msg->fgrid + g*flen;
4003  float **q = q_arr + fstart + g*fsize;
4004  for ( int i=0; i<flen; ++i ) {
4005  if ( f[i] ) {
4006  f[i] = 0;
4007  for ( int k=0; k<zlistlen; ++k ) {
4008  q[i][zlist[k]] = *(qmsg++);
4009  }
4010  for (int h=0; h<myGrid.order-1; ++h) {
4011  q[i][myGrid.K3+h] = q[i][h];
4012  }
4013  }
4014  }
4015  }
4016 }
4017 
4018 void ComputePme::ungridForces() {
4019 
4020  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
4021 
4022  SimParameters *simParams = Node::Object()->simParameters;
4023 
4024  localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
4025  Vector *localResults = localResults_alloc.begin();
4026  Vector *gridResults;
4027 
4028  if ( alchOn || lesOn || selfOn || pairOn ) {
4029  for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
4030  gridResults = localResults + numLocalAtoms;
4031  } else {
4032  gridResults = localResults;
4033  }
4034 
4035  Vector pairForce = 0.;
4036  Lattice &lattice = patch->flags.lattice;
4037  int g = 0;
4038  if(!simParams->commOnly) {
4039  for ( g=0; g<numGrids; ++g ) {
4040 #ifdef NETWORK_PROGRESS
4041  CmiNetworkProgress();
4042 #endif
4043 
4044 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4045  if ( offload ) {
4046  int errfound = 0;
4047  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4048  // Neither isnan() nor x != x worked when testing on Cray; this does.
4049  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; } // CUDA NaN
4050  gridResults[i].x = f_data_host[3*i];
4051  gridResults[i].y = f_data_host[3*i+1];
4052  gridResults[i].z = f_data_host[3*i+2];
4053  }
4054  if ( errfound ) {
4055  int errcount = 0;
4056  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4057  float f = f_data_host[3*i];
4058  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { // CUDA NaN
4059  ++errcount;
4060  gridResults[i] = 0.;
4061  }
4062  }
4063  iout << iERROR << "Stray PME grid charges detected: "
4064  << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
4065  }
4066  } else
4067 #endif // NAMD_CUDA or NAMD_HIP
4068  {
4069  myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
4070  }
4071  scale_forces(gridResults, numGridAtoms[g], lattice);
4072 
4073  if (alchOn) {
4074  float scale = 1.;
4075  BigReal elecLambdaUp, elecLambdaDown;
4076  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4077  myMgr->alchLambda = alchLambda;
4078  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4079  myMgr->alchLambda2 = alchLambda2;
4080  elecLambdaUp = simParams->getElecLambda(alchLambda);
4081  elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
4082 
4083  if ( g == 0 ) scale = elecLambdaUp;
4084  else if ( g == 1 ) scale = elecLambdaDown;
4085  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4086 
4087  if (alchDecouple) {
4088  if ( g == 2 ) scale = 1 - elecLambdaUp;
4089  else if ( g == 3 ) scale = 1 - elecLambdaDown;
4090  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4091  }
4092  int nga = 0;
4093  if (!alchDecouple) {
4094  if (g < 2 ) {
4095  for(int i=0; i<numLocalAtoms; ++i) {
4096  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
4097  // (g=0: only partition 0, partition 1, and partition 3)
4098  // (g=1: only partition 0, partition 2, and partition 4)
4099  localResults[i] += gridResults[nga++] * scale;
4100  }
4101  }
4102  } else {
4103  for(int i=0; i<numLocalAtoms; ++i) {
4104  if ( localPartition[i] == 0 ) {
4105  // (g=2: only partition 0)
4106  localResults[i] += gridResults[nga++] * scale;
4107  }
4108  }
4109  }
4110  } else { // alchDecouple
4111  if ( g < 2 ) {
4112  for(int i=0; i<numLocalAtoms; ++i) {
4113  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4114  // g = 0: partition 0 or partition 1
4115  // g = 1: partition 0 or partition 2
4116  localResults[i] += gridResults[nga++] * scale;
4117  }
4118  }
4119  }
4120  else {
4121  for(int i=0; i<numLocalAtoms; ++i) {
4122  if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
4123  // g = 2: partition 1 only
4124  // g = 3: partition 2 only
4125  // g = 4: partition 0 only
4126  localResults[i] += gridResults[nga++] * scale;
4127  }
4128  }
4129  }
4130  }
4131  } else if ( lesOn ) {
4132  float scale = 1.;
4133  if ( alchFepOn ) {
4134  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4135  myMgr->alchLambda = alchLambda;
4136  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4137  myMgr->alchLambda2 = alchLambda2;
4138  if ( g == 0 ) scale = alchLambda;
4139  else if ( g == 1 ) scale = 1. - alchLambda;
4140  } else if ( lesOn ) {
4141  scale = 1.0 / (float)lesFactor;
4142  }
4143  int nga = 0;
4144  for(int i=0; i<numLocalAtoms; ++i) {
4145  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4146  localResults[i] += gridResults[nga++] * scale;
4147  }
4148  }
4149  } else if ( selfOn ) {
4150  PmeParticle *lgd = localGridData[g];
4151  int nga = 0;
4152  for(int i=0; i<numLocalAtoms; ++i) {
4153  if ( localPartition[i] == 1 ) {
4154  pairForce += gridResults[nga]; // should add up to almost zero
4155  localResults[i] += gridResults[nga++];
4156  }
4157  }
4158  } else if ( pairOn ) {
4159  if ( g == 0 ) {
4160  int nga = 0;
4161  for(int i=0; i<numLocalAtoms; ++i) {
4162  if ( localPartition[i] == 1 ) {
4163  pairForce += gridResults[nga];
4164  }
4165  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
4166  localResults[i] += gridResults[nga++];
4167  }
4168  }
4169  } else if ( g == 1 ) {
4170  int nga = 0;
4171  for(int i=0; i<numLocalAtoms; ++i) {
4172  if ( localPartition[i] == g ) {
4173  pairForce -= gridResults[nga]; // should add up to almost zero
4174  localResults[i] -= gridResults[nga++];
4175  }
4176  }
4177  } else {
4178  int nga = 0;
4179  for(int i=0; i<numLocalAtoms; ++i) {
4180  if ( localPartition[i] == g ) {
4181  localResults[i] -= gridResults[nga++];
4182  }
4183  }
4184  }
4185  }
4186  }
4187  }
4188 
4189  Vector *results_ptr = localResults;
4190 
4191  // add in forces
4192  {
4193  Results *r = forceBox->open();
4194  Force *f = r->f[Results::slow];
4195  int numAtoms = patch->getNumAtoms();
4196 
4197  if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
4198  for(int i=0; i<numAtoms; ++i) {
4199  f[i].x += results_ptr->x;
4200  f[i].y += results_ptr->y;
4201  f[i].z += results_ptr->z;
4202  ++results_ptr;
4203  }
4204  }
4205  forceBox->close(&r);
4206  }
4207 
4208  if ( pairOn || selfOn ) {
4209  ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
4210  }
4211 
4212 }
4213 
4214 void ComputePmeMgr::submitReductions() {
4215 
4216  SimParameters *simParams = Node::Object()->simParameters;
4217 
4218  for ( int g=0; g<numGrids; ++g ) {
4219  float scale = 1.;
4220  if (alchOn) {
4221  BigReal elecLambdaUp, elecLambdaDown;
4222  // alchLambda set on each step in ComputePme::ungridForces()
4223  if ( alchLambda < 0 || alchLambda > 1 ) {
4224  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4225  }
4226  elecLambdaUp = simParams->getElecLambda(alchLambda);
4227  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4228  if ( g == 0 ) scale = elecLambdaUp;
4229  else if ( g == 1 ) scale = elecLambdaDown;
4230  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4231  if (alchDecouple) {
4232  if ( g == 2 ) scale = 1-elecLambdaUp;
4233  else if ( g == 3 ) scale = 1-elecLambdaDown;
4234  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4235  }
4236  } else if ( lesOn ) {
4237  scale = 1.0 / lesFactor;
4238  } else if ( pairOn ) {
4239  scale = ( g == 0 ? 1. : -1. );
4240  }
4241  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4242  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4243  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4244  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4245  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4246  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4247  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4248  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4249  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4250  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
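// evir[g][1..6] holds the six independent components (xx,xy,xz,yy,yz,zz) of the
// symmetric slow-force virial; the three off-diagonal values above are mirrored
// into both triangles of the tensor.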
4251 
4252  float scale2 = 0.;
4253 
4254  // why is this declared/defined again here?
4255  SimParameters *simParams = Node::Object()->simParameters;
4256 
4257  if (alchFepOn) {
4258  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4259  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4260  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4261  if ( g == 0 ) scale2 = elecLambda2Up;
4262  else if ( g == 1 ) scale2 = elecLambda2Down;
4263  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4264  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4265  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4266  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4267  }
4268  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4269 
4270  if (alchThermIntOn) {
4271 
4272  // no decoupling:
4273  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4274  // (interactions between all atoms [partition 0 OR partition 1],
4275  // minus all [within partition 0])
4276  // U = elecLambdaUp * (U[0] - U[2])
4277  // dU/dl = U[0] - U[2];
4278 
4279  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4280  // (interactions between all atoms [partition 0 OR partition 2],
4281  // minus all [within partition 0])
4282  // U = elecLambdaDown * (U[1] - U[2])
4283  // dU/dl = U[1] - U[2];
4284 
4285  // alchDecouple:
4286  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4287  // (interactions between all atoms [partition 0 OR partition 1]
4288  // minus all [within partition 1] minus all [within partition 0]
4289  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4290  // dU/dl = U[0] - U[2] - U[4];
4291 
4292  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4293  // (interactions between all atoms [partition 0 OR partition 2]
4294  // minus all [within partition 2] minus all [within partition 0]
4295  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4296  // dU/dl = U[1] - U[3] - U[4];
4297 
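// As a concrete check of the no-decoupling case above: with U_total =
// elecLambdaUp*(U[0]-U[2]) + elecLambdaDown*(U[1]-U[2]) + U[2], we get
// dU/d(elecLambdaUp) = U[0] - U[2], which is exactly what the TI_1 reduction
// below accumulates (+evir[0][0] at g==0, -evir[2][0] at g==2).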
4298 
4299  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4300  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4301  if (!alchDecouple) {
4302  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4303  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4304  }
4305  else { // alchDecouple
4306  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4307  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4308  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4309  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4310  }
4311  }
4312  }
4313 
4314  alchLambda = -1.; // illegal value to catch if not updated
4315 
4316  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4317  reduction->submit();
4318 
4319  for ( int i=0; i<heldComputes.size(); ++i ) {
4320  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4321  }
4322  heldComputes.resize(0);
4323 }
4324 
4325 #if USE_TOPOMAP
4326 
4327 #define NPRIMES 8
4328 const static unsigned int NAMDPrimes[] = {
4329  3,
4330  5,
4331  7,
4332  11,
4333  13,
4334  17,
4335  19,
4336  23,
4337  29,
4338  31,
4339  37,
4340  59,
4341  73,
4342  93,
4343  113,
4344  157,
4345  307,
4346  617,
4347  1217 // This should be enough for 64K nodes of BGL.
4348 };
4349 
4350 #include "RecBisection.h"
4351 
4352 /***-----------------------------------------------------**********
4353  The Orthogonal Recursive Bisection strategy, which allocates PME
4354  objects close to the patches they communicate with, and at the
4355  same time spreads them around the grid.
4356 ****----------------------------------------------------------****/
4357 
4358 bool generateBGLORBPmePeList(int *pemap, int numPes,
4359  int *block_pes, int nbpes) {
4360 
4361  PatchMap *pmap = PatchMap::Object();
4362  int *pmemap = new int [CkNumPes()];
4363 
4364  if (pemap == NULL)
4365  return false;
4366 
4367  TopoManager tmgr;
4368 
4369  memset(pmemap, 0, sizeof(int) * CkNumPes());
4370 
4371  for(int count = 0; count < CkNumPes(); count++) {
4372  if(count < nbpes)
4373  pmemap[block_pes[count]] = 1;
4374 
4375  if(pmap->numPatchesOnNode(count)) {
4376  pmemap[count] = 1;
4377 
4378  //Assumes an XYZT mapping !!
4379  if(tmgr.hasMultipleProcsPerNode()) {
4380  pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
4381  }
4382  }
4383  }
4384 
4385  if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
4386  //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
4387  return false;
4388 
4389  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4390  Node *node = nd.ckLocalBranch();
4391  SimParameters *simParams = node->simParameters;
4392 
4393  //first split PME processors into patch groups
4394 
4395  int xsize = 0, ysize = 0, zsize = 0;
4396 
4397  xsize = tmgr.getDimNX();
4398  ysize = tmgr.getDimNY();
4399  zsize = tmgr.getDimNZ();
4400 
4401  int nx = xsize, ny = ysize, nz = zsize;
4402  DimensionMap dm;
4403 
4404  dm.x = 0;
4405  dm.y = 1;
4406  dm.z = 2;
4407 
4408  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
4409 
4410  //group size processors have to be allocated to each YZ plane
4411  int group_size = numPes/nx;
4412  if(numPes % nx)
4413  group_size ++;
4414 
4415  int my_prime = NAMDPrimes[0];
4416  int density = (ny * nz)/group_size + 1;
4417  int count = 0;
4418 
4419  // Choose a suitable prime number
4420  for(count = 0; count < NPRIMES; count ++) {
4421  //Find a prime just greater than the density
4422  if(density < NAMDPrimes[count]) {
4423  my_prime = NAMDPrimes[count];
4424  break;
4425  }
4426  }
4427 
4428  if(count == NPRIMES)
4429  my_prime = NAMDPrimes[NPRIMES-1];
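// A stride coprime to ny*nz visits every slot in the YZ plane before repeating;
// choosing a prime just above the required density makes that overwhelmingly likely
// and spreads PME pencils away from each other, with the linear probe further below
// handling any remaining collisions.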
4430 
4431  //int gcount = numPes/2;
4432  int gcount = 0;
4433  int npme_pes = 0;
4434 
4435  int coord[3];
4436 
4437  for(int x = 0; x < nx; x++) {
4438  coord[0] = (x + nx/2)%nx;
4439 
4440  for(count=0; count < group_size && npme_pes < numPes; count++) {
4441  int dest = (count + 1) * my_prime;
4442  dest = dest % (ny * nz);
4443 
4444  coord[2] = dest / ny;
4445  coord[1] = dest - coord[2] * ny;
4446 
4447  //Locate where in the actual grid the processor is
4448  int destPe = coord[dm.x] + coord[dm.y] * xsize +
4449  coord[dm.z] * xsize* ysize;
4450 
4451  if(pmemap[destPe] == 0) {
4452  pemap[gcount++] = destPe;
4453  pmemap[destPe] = 1;
4454 
4455  if(tmgr.hasMultipleProcsPerNode())
4456  pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
4457 
4458  npme_pes ++;
4459  }
4460  else {
4461  for(int pos = 1; pos < ny * nz; pos++) {
4462 
4463  coord[2] += pos / ny;
4464  coord[1] += pos % ny;
4465 
4466  coord[2] = coord[2] % nz;
4467  coord[1] = coord[1] % ny;
4468 
4469  int newdest = coord[dm.x] + coord[dm.y] * xsize +
4470  coord[dm.z] * xsize * ysize;
4471 
4472  if(pmemap[newdest] == 0) {
4473  pemap[gcount++] = newdest;
4474  pmemap[newdest] = 1;
4475 
4476  if(tmgr.hasMultipleProcsPerNode())
4477  pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
4478 
4479  npme_pes ++;
4480  break;
4481  }
4482  }
4483  }
4484  }
4485 
4486  if(gcount == numPes)
4487  gcount = 0;
4488 
4489  if(npme_pes >= numPes)
4490  break;
4491  }
4492 
4493  delete [] pmemap;
4494 
4495  if(npme_pes != numPes)
4496  //NAMD_bug("ORB PME allocator failed\n");
4497  return false;
4498 
4499  return true;
4500 }
4501 
4502 #endif
4503 
4504 template <class T> class PmePencil : public T {
4505 public:
4506  PmePencil() {
4507  data = 0;
4508  work = 0;
4509  send_order = 0;
4510  needs_reply = 0;
4511 #if USE_PERSISTENT
4512  trans_handle = untrans_handle = ungrid_handle = NULL;
4513 #endif
4514  }
4515  ~PmePencil() {
4516 #ifdef NAMD_FFTW
4517  fftwf_free(data);
4518 #endif
4519  delete [] work;
4520  delete [] send_order;
4521  delete [] needs_reply;
4522  }
4523  void base_init(PmePencilInitMsg *msg) {
4524  imsg=0;
4525  imsgb=0;
4526  hasData=0;
4527  initdata = msg->data;
4528  }
4529  void order_init(int nBlocks) {
4530  send_order = new int[nBlocks];
4531  for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
4532  if ( Node::Object()->simParameters->PMESendOrder ) {
4534  } else {
4535  Random rand(CkMyPe());
4536  rand.reorder(send_order,nBlocks);
4537  }
4538  needs_reply = new int[nBlocks];
4540  }
4541  PmePencilInitMsgData initdata;
4542  Lattice lattice;
4543  PmeReduction evir;
4544  int sequence; // used for priorities
4545 #ifndef CmiMemoryAtomicType
4546  typedef int AtomicInt;
4547 #else
4548  typedef CmiMemoryAtomicInt AtomicInt;
4549 #endif
4550  AtomicInt imsg; // used in sdag code
4551  AtomicInt imsgb; // Node par uses distinct counter for back path
4552  int hasData; // used in message elimination
4553  int offload;
4554  float *data;
4555  float *work;
4556  int *send_order;
4557  int *needs_reply;
4558 #if USE_PERSISTENT
4559  PersistentHandle *trans_handle;
4560  PersistentHandle *untrans_handle;
4561  PersistentHandle *ungrid_handle;
4562 #endif
4563 };
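// The pencil classes below implement the 3D FFT as three 1D stages:
// PmeZPencil (real-to-complex along z) -> send_trans -> PmeYPencil (complex along y)
// -> send_trans -> PmeXPencil (complex along x, plus the reciprocal-space work in
// pme_kspace()), then the reverse path via send_untrans back to PmeZPencil, which
// returns real-space data to the patches with send_ungrid().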
4564 
4565 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
4566 public:
4567  PmeZPencil_SDAG_CODE
4568  PmeZPencil() { __sdag_init(); setMigratable(false); }
4569  PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
4570  ~PmeZPencil() {
4571  #ifdef NAMD_FFTW
4572  #ifdef NAMD_FFTW_3
4573  delete [] forward_plans;
4574  delete [] backward_plans;
4575  #endif
4576  #endif
4577  }
4578  void fft_init();
4579  void recv_grid(const PmeGridMsg *);
4580  void forward_fft();
4581  void send_trans();
4582  void send_subset_trans(int fromIdx, int toIdx);
4583  void recv_untrans(const PmeUntransMsg *);
4584  void recvNodeAck(PmeAckMsg *);
4585  void node_process_untrans(PmeUntransMsg *);
4586  void node_process_grid(PmeGridMsg *);
4587  void backward_fft();
4588  void send_ungrid(PmeGridMsg *);
4589  void send_all_ungrid();
4590  void send_subset_ungrid(int fromIdx, int toIdx);
4591 private:
4592  ResizeArray<PmeGridMsg *> grid_msgs;
4593  ResizeArray<int> work_zlist;
4594 #ifdef NAMD_FFTW
4595 #ifdef NAMD_FFTW_3
4596  fftwf_plan forward_plan, backward_plan;
4597 
4598  //for ckloop usage
4599  int numPlans;
4600  fftwf_plan *forward_plans, *backward_plans;
4601 #else
4602  rfftwnd_plan forward_plan, backward_plan;
4603 #endif
4604 #endif
4605 
4606  int nx, ny;
4607 #if USE_PERSISTENT
4608  void setup_persistent() {
4609  int hd = 1;// ( hasData ? 1 : 0 );
4610  int zBlocks = initdata.zBlocks;
4611  int block3 = initdata.grid.block3;
4612  int dim3 = initdata.grid.dim3;
4613  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4614  CmiAssert(yPencil_local);
4615  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
4616  for ( int isend=0; isend<zBlocks; ++isend ) {
4617  int kb = send_order[isend];
4618  int nz1 = block3;
4619  if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
4620  int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
4621  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
4622  int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
4623  int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
4624  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4625  }
4626  }
4627 
4628  void setup_ungrid_persistent()
4629  {
4630  ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
4631  int limsg;
4632  for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
4633  int peer = grid_msgs[limsg]->sourceNode;
4634  //ungrid_handle[limsg] = CmiCreatePersistent(peer, 0);
4635  }
4636  imsg = limsg;
4637  }
4638 #endif
4639 };
4640 
4641 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
4642 public:
4643  PmeYPencil_SDAG_CODE
4644  PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
4645  PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
4646  void fft_init();
4647  void recv_trans(const PmeTransMsg *);
4648  void forward_fft();
4649  void forward_subset_fft(int fromIdx, int toIdx);
4650  void send_trans();
4651  void send_subset_trans(int fromIdx, int toIdx);
4652  void recv_untrans(const PmeUntransMsg *);
4653  void node_process_trans(PmeTransMsg *);
4654  void recvNodeAck(PmeAckMsg *);
4655  void node_process_untrans(PmeUntransMsg *);
4656  void backward_fft();
4657  void backward_subset_fft(int fromIdx, int toIdx);
4658  void send_untrans();
4659  void send_subset_untrans(int fromIdx, int toIdx);
4660 private:
4661 #ifdef NAMD_FFTW
4662 #ifdef NAMD_FFTW_3
4663  fftwf_plan forward_plan, backward_plan;
4664 #else
4665  fftw_plan forward_plan, backward_plan;
4666 #endif
4667 #endif
4668 
4669  int nx, nz;
4670 #if USE_PERSISTENT
4671  void setup_persistent() {
4672  int yBlocks = initdata.yBlocks;
4673  int block2 = initdata.grid.block2;
4674  int K2 = initdata.grid.K2;
4675  int hd = 1;
4676  CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
4677  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4678  for ( int isend=0; isend<yBlocks; ++isend ) {
4679  int jb = send_order[isend];
4680  int ny1 = block2;
4681  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4682  int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
4683  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4684  int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
4685  int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
4686  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4687  }
4688 
4689  CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
4690  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4691  for ( int isend=0; isend<yBlocks; ++isend ) {
4692  int jb = send_order[isend];
4693  int ny1 = block2;
4694  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4695  int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
4696  int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
4697  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4698  int compress_size = sizeof(float)*nx*ny1*nz*2;
4699  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4700  }
4701  }
4702 #endif
4703 };
4704 
4705 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
4706 public:
4707  PmeXPencil_SDAG_CODE
4708  PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
4709  PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
4710  ~PmeXPencil() {
4711  #ifdef NAMD_FFTW
4712  #ifdef NAMD_FFTW_3
4713  delete [] forward_plans;
4714  delete [] backward_plans;
4715  #endif
4716  #endif
4717  }
4718  void fft_init();
4719  void recv_trans(const PmeTransMsg *);
4720  void forward_fft();
4721  void pme_kspace();
4722  void backward_fft();
4723  void send_untrans();
4724  void send_subset_untrans(int fromIdx, int toIdx);
4725  void node_process_trans(PmeTransMsg *);
4726 #ifdef NAMD_FFTW
4727 #ifdef NAMD_FFTW_3
4728  fftwf_plan forward_plan, backward_plan;
4729 
4730  int numPlans;
4731  fftwf_plan *forward_plans, *backward_plans;
4732 #else
4733  fftw_plan forward_plan, backward_plan;
4734 #endif
4735 #endif
4736  int ny, nz;
4737  int recipEvirPe;
4738  void evir_init();
4739  PmeKSpace *myKSpace;
4740 #if USE_PERSISTENT
4741  void setup_persistent() {
4742  int xBlocks = initdata.xBlocks;
4743  int block1 = initdata.grid.block1;
4744  int K1 = initdata.grid.K1;
4745  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4746  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
4747  for ( int isend=0; isend<xBlocks; ++isend ) {
4748  int ib = send_order[isend];
4749  int nx1 = block1;
4750  if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
4751  int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
4752  int size = sizeof(PmeUntransMsg) +
4753  sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4754  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4755  int compress_size = sizeof(float)*nx1*ny*nz*2;
4756  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4757  }
4758  }
4759 #endif
4760 
4761 };
4762 
4763 void PmeXPencil::evir_init() {
4764  recipEvirPe = findRecipEvirPe();
4765  initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
4766 }
4767 
4768 void PmeZPencil::fft_init() {
4769  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4770  Node *node = nd.ckLocalBranch();
4771  SimParameters *simParams = node->simParameters;
4772 
4773 #if USE_NODE_PAR_RECEIVE
4774  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
4775 #endif
4776 
4777  int K1 = initdata.grid.K1;
4778  int K2 = initdata.grid.K2;
4779  int K3 = initdata.grid.K3;
4780  int dim3 = initdata.grid.dim3;
4781  int block1 = initdata.grid.block1;
4782  int block2 = initdata.grid.block2;
4783 
4784  nx = block1;
4785  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4786  ny = block2;
4787  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
4788 
4789 #ifdef NAMD_FFTW
4790  CmiLock(ComputePmeMgr::fftw_plan_lock);
4791 
4792  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
4793  work = new float[dim3];
4794 
4795  order_init(initdata.zBlocks);
4796 
4797 #ifdef NAMD_FFTW_3
4798  /* need array of sizes for the how many */
4799 
4800  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4801  int sizeLines=nx*ny;
4802  int planLineSizes[1];
4803  planLineSizes[0]=K3;
4804  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
4805  int ndimHalf=ndim/2;
4806  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
4807  (float *) data, NULL, 1,
4808  ndim,
4809  (fftwf_complex *) data, NULL, 1,
4810  ndimHalf,
4811  fftwFlags);
4812 
4813  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
4814  (fftwf_complex *) data, NULL, 1,
4815  ndimHalf,
4816  (float *) data, NULL, 1,
4817  ndim,
4818  fftwFlags);
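// Layout assumed by the two plans above: each of the nx*ny z-lines occupies ndim =
// dim3 padded floats (dim3 >= K3+2), so the K3/2+1 complex outputs of the in-place
// r2c transform fit where the K3 real inputs were; ndimHalf is the same line stride
// counted in fftwf_complex elements.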
4819 #if CMK_SMP && USE_CKLOOP
4820  if(simParams->useCkLoop) {
4821  // How many FFT plans should be created? This is a grain-size tradeoff.
4822  // Currently min(nx, ny) is chosen as the coarse grain.
4823  numPlans = (nx<=ny?nx:ny);
4824  if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
4825  if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
4826  int howmany = sizeLines/numPlans;
4827  forward_plans = new fftwf_plan[numPlans];
4828  backward_plans = new fftwf_plan[numPlans];
4829  for(int i=0; i<numPlans; i++) {
4830  int dimStride = i*ndim*howmany;
4831  int dimHalfStride = i*ndimHalf*howmany;
4832  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
4833  ((float *)data)+dimStride, NULL, 1,
4834  ndim,
4835  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4836  ndimHalf,
4837  fftwFlags);
4838 
4839  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
4840  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4841  ndimHalf,
4842  ((float *)data)+dimStride, NULL, 1,
4843  ndim,
4844  fftwFlags);
4845  }
4846  }else
4847 #endif
4848  {
4849  forward_plans = NULL;
4850  backward_plans = NULL;
4851  }
4852 #else
4853  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
4854  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4855  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4856  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
4857  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4858  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4859 #endif
4860  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4861 #else
4862  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4863 #endif
4864 
4865 #if USE_NODE_PAR_RECEIVE
4866  evir = 0.;
4867  memset(data, 0, sizeof(float) * nx*ny*dim3);
4868 #endif
4869 }
4870 
4871 void PmeYPencil::fft_init() {
4872  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4873  Node *node = nd.ckLocalBranch();
4874  SimParameters *simParams = node->simParameters;
4875 
4876 #if USE_NODE_PAR_RECEIVE
4877  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
4878 #endif
4879 
4880  int K1 = initdata.grid.K1;
4881  int K2 = initdata.grid.K2;
4882  int dim2 = initdata.grid.dim2;
4883  int dim3 = initdata.grid.dim3;
4884  int block1 = initdata.grid.block1;
4885  int block3 = initdata.grid.block3;
4886 
4887  nx = block1;
4888  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4889  nz = block3;
4890  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
4891 
4892 #ifdef NAMD_FFTW
4893  CmiLock(ComputePmeMgr::fftw_plan_lock);
4894 
4895  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
4896  work = new float[2*K2];
4897 
4898  order_init(initdata.yBlocks);
4899 
4900 #ifdef NAMD_FFTW_3
4901  /* need array of sizes for the dimensions */
4902  /* ideally this should be implementable as a single multidimensional
4903  * plan, but that has proven tricky to implement, so we maintain the
4904  * loop of 1d plan executions. */
4905  int sizeLines=nz;
4906  int planLineSizes[1];
4907  planLineSizes[0]=K2;
4908  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4909  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4910  (fftwf_complex *) data, NULL, sizeLines, 1,
4911  (fftwf_complex *) data, NULL, sizeLines, 1,
4912  FFTW_FORWARD,
4913  fftwFlags);
4914  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4915  (fftwf_complex *) data, NULL, sizeLines, 1,
4916  (fftwf_complex *) data, NULL, sizeLines, 1,
4917  FFTW_BACKWARD,
4918  fftwFlags);
4919  CkAssert(forward_plan != NULL);
4920  CkAssert(backward_plan != NULL);
4921 #else
4922  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
4923  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4924  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4925  nz, (fftw_complex *) work, 1);
4926  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
4927  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4928  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4929  nz, (fftw_complex *) work, 1);
4930 #endif
4931  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4932 #else
4933  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4934 #endif
4935 
4936 #if USE_NODE_PAR_RECEIVE
4937  evir = 0;
4938  CmiMemoryWriteFence();
4939 #endif
4940 }
4941 
4942 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
4943 {
4944  if ( msg->hasData ) hasData = 1;
4945  needs_reply[msg->sourceNode] = msg->hasData;
4946  recv_trans(msg);
4947  int limsg;
4948  CmiMemoryAtomicFetchAndInc(imsg,limsg);
4949  if(limsg+1 == initdata.yBlocks)
4950  {
4951  if ( hasData ) {
4952  forward_fft();
4953  }
4954  send_trans();
4955  imsg=0;
4956  CmiMemoryWriteFence();
4957  }
4958 }
4959 
4960 void PmeYPencil::recvNodeAck(PmeAckMsg *msg) {
4961  delete msg;
4962  node_process_untrans(0);
4963 }
4964 
4965 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
4966 {
4967  if ( msg ) {
4968  if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
4969  recv_untrans(msg);
4970  } else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
4971  int limsg;
4972  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
4973  if(limsg+1 == initdata.yBlocks)
4974  {
4975  if ( hasData ) {
4976  backward_fft();
4977  }
4978  hasData=0;
4979  imsgb=0;
4980  CmiMemoryWriteFence();
4981  send_untrans();
4982  }
4983 }
4984 
4985 #define DEBUG_NODE_PAR_RECV 0
4986 
4987 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
4988  // CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
4989  PmeXPencil *target=xPencilObj.get(msg->destElem);
4990 #if DEBUG_NODE_PAR_RECV
4991  if(target == NULL)
4992  CkAbort("xpencil in recvXTrans not found, debug registration");
4993 #endif
4994  target->node_process_trans(msg);
4995  delete msg;
4996 }
4997 
4998 
4999 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
5000  // CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5001  PmeYPencil *target=yPencilObj.get(msg->destElem);
5002 #if DEBUG_NODE_PAR_RECV
5003  if(target == NULL)
5004  CkAbort("ypencil in recvYTrans not found, debug registration");
5005 #endif
5006  target->node_process_trans(msg);
5007  delete msg;
5008  }
5009 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
5010  // CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5011  PmeYPencil *target=yPencilObj.get(msg->destElem);
5012 #if DEBUG_NODE_PAR_RECV
5013  if(target == NULL)
5014  CkAbort("ypencil in recvYUntrans not found, debug registration");
5015 #endif
5016  target->node_process_untrans(msg);
5017  delete msg;
5018  }
5019 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
5020  //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5021  PmeZPencil *target=zPencilObj.get(msg->destElem);
5022 #if DEBUG_NODE_PAR_RECV
5023  if(target == NULL)
5024  CkAbort("zpencil in recvZUntrans not found, debug registration");
5025 #endif
5026  target->node_process_untrans(msg);
5027  delete msg;
5028 }
5029 
5030 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
5031  //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5032  PmeZPencil *target=zPencilObj.get(msg->destElem);
5033 #if DEBUG_NODE_PAR_RECV
5034  if(target == NULL){
5035  CkAbort("zpencil in recvZGrid not found, debug registration");
5036  }
5037 #endif
5038  target->node_process_grid(msg); // msg is stored inside node_process_grid
5039 }
5040 
5041 void PmeXPencil::fft_init() {
5042  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
5043  Node *node = nd.ckLocalBranch();
5044  SimParameters *simParams = node->simParameters;
5045 #if USE_NODE_PAR_RECEIVE
5046  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
5047 #endif
5048 
5049  int K1 = initdata.grid.K1;
5050  int K2 = initdata.grid.K2;
5051  int dim3 = initdata.grid.dim3;
5052  int block2 = initdata.grid.block2;
5053  int block3 = initdata.grid.block3;
5054 
5055  ny = block2;
5056  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
5057  nz = block3;
5058  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
5059 
5060 #ifdef NAMD_FFTW
5061  CmiLock(ComputePmeMgr::fftw_plan_lock);
5062 
5063  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
5064  work = new float[2*K1];
5065 
5066  order_init(initdata.xBlocks);
5067 
5068 #ifdef NAMD_FFTW_3
5069  /* need array of sizes for the how many */
5070  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
5071  int sizeLines=ny*nz;
5072  int planLineSizes[1];
5073  planLineSizes[0]=K1;
5074  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5075  (fftwf_complex *) data, NULL, sizeLines, 1,
5076  (fftwf_complex *) data, NULL, sizeLines, 1,
5077  FFTW_FORWARD,
5078  fftwFlags);
5079  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5080  (fftwf_complex *) data, NULL, sizeLines, 1,
5081  (fftwf_complex *) data, NULL, sizeLines, 1,
5082  FFTW_BACKWARD,
5083  fftwFlags);
5084 
5085 #if CMK_SMP && USE_CKLOOP
5086  if(simParams->useCkLoop) {
5087  // How many FFT plans should be created? This is a grain-size tradeoff.
5088  // Currently min(ny, nz) is chosen as the coarse grain.
5089  numPlans = (ny<=nz?ny:nz);
5090  // limit attempted parallelism due to false sharing
5091  //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
5092  //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
5093  if ( sizeLines/numPlans < 4 ) numPlans = 1;
5094  int howmany = sizeLines/numPlans;
5095  forward_plans = new fftwf_plan[numPlans];
5096  backward_plans = new fftwf_plan[numPlans];
5097  for(int i=0; i<numPlans; i++) {
5098  int curStride = i*howmany;
5099  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5100  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5101  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5102  FFTW_FORWARD,
5103  fftwFlags);
5104 
5105  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5106  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5107  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5108  FFTW_BACKWARD,
5109  fftwFlags);
5110  }
5111  }else
5112 #endif
5113  {
5114  forward_plans = NULL;
5115  backward_plans = NULL;
5116  }
5117 #else
5118  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
5119  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5120  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5121  ny*nz, (fftw_complex *) work, 1);
5122  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
5123  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5124  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5125  ny*nz, (fftw_complex *) work, 1);
5126 #endif
5127  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
5128 #else
5129  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
5130 #endif
5131 
5132  myKSpace = new PmeKSpace(initdata.grid,
5133  thisIndex.y*block2, thisIndex.y*block2 + ny,
5134  thisIndex.z*block3, thisIndex.z*block3 + nz);
5135 
5136 }
5137 
5138 // #define FFTCHECK // run a grid of integers through the fft
5139 // #define ZEROCHECK // check for suspicious zeros in fft
5140 
5141 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
5142 
5143  int dim3 = initdata.grid.dim3;
5144  if ( imsg == 0 ) {
5145  lattice = msg->lattice;
5146  sequence = msg->sequence;
5147 #if ! USE_NODE_PAR_RECEIVE
5148  memset(data, 0, sizeof(float)*nx*ny*dim3);
5149 #endif
5150  }
5151 
5152  if ( ! msg->hasData ) return;
5153 
5154  int zlistlen = msg->zlistlen;
5155 #ifdef NAMD_KNL
5156  int * __restrict msg_zlist = msg->zlist;
5157  int * __restrict zlist = work_zlist.begin();
5158  __assume_aligned(zlist,64);
5159  for ( int k=0; k<zlistlen; ++k ) {
5160  zlist[k] = msg_zlist[k];
5161  }
5162 #else
5163  int * __restrict zlist = msg->zlist;
5164 #endif
5165  char * __restrict fmsg = msg->fgrid;
5166  float * __restrict qmsg = msg->qgrid;
5167  float * __restrict d = data;
5168  int numGrids = 1; // pencil FFT doesn't support multiple grids
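// Scatter-add the compressed message into this pencil's slab: fgrid holds one
// occupancy flag per (i,j) column, and for each occupied column qgrid supplies
// charge values only for the z indices listed in zlist.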
5169  for ( int g=0; g<numGrids; ++g ) {
5170  for ( int i=0; i<nx; ++i ) {
5171  for ( int j=0; j<ny; ++j, d += dim3 ) {
5172  if( *(fmsg++) ) {
5173  #pragma ivdep
5174  for ( int k=0; k<zlistlen; ++k ) {
5175  d[zlist[k]] += *(qmsg++);
5176  }
5177  }
5178  }
5179  }
5180  }
5181 }
5182 
5183 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
5184 #ifdef NAMD_FFTW
5185 #ifdef NAMD_FFTW_3
5186  fftwf_plan *plans = (fftwf_plan *)param;
5187  for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
5188 #endif
5189 #endif
5190 }
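// PmeXZPencilFFT is the work function handed to CkLoop below: CkLoop_Parallelize
// splits the index range [0, numPlans-1] across the PEs of this SMP node, and each
// worker executes its contiguous chunk of the per-slab FFTW plans.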
5191 
5192 void PmeZPencil::forward_fft() {
5193  evir = 0.;
5194 #ifdef FFTCHECK
5195  int dim3 = initdata.grid.dim3;
5196  int K3 = initdata.grid.K3;
5197  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
5198  float *d = data;
5199  for ( int i=0; i<nx; ++i ) {
5200  for ( int j=0; j<ny; ++j, d += dim3 ) {
5201  for ( int k=0; k<dim3; ++k ) {
5202  d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
5203  }
5204  }
5205  }
5206 #endif
5207 #ifdef NAMD_FFTW
5208 #ifdef MANUAL_DEBUG_FFTW3
5209  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5210 #endif
5211 #ifdef NAMD_FFTW_3
5212 #if CMK_SMP && USE_CKLOOP
5213  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5214  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5215  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5216  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5217  // the CkLoop call below is a parallel transformation of the loop above
5218  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5219  return;
5220  }
5221 #endif
5222  fftwf_execute(forward_plan);
5223 #else
5224  rfftwnd_real_to_complex(forward_plan, nx*ny,
5225  data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
5226 #endif
5227 #ifdef MANUAL_DEBUG_FFTW3
5228  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5229 #endif
5230 
5231 #endif
5232 #ifdef ZEROCHECK
5233  int dim3 = initdata.grid.dim3;
5234  int K3 = initdata.grid.K3;
5235  float *d = data;
5236  for ( int i=0; i<nx; ++i ) {
5237  for ( int j=0; j<ny; ++j, d += dim3 ) {
5238  for ( int k=0; k<dim3; ++k ) {
5239  if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
5240  thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
5241  }
5242  }
5243  }
5244 #endif
5245 }
5246 
5247 /* A single task for partitioned PmeZPencil::send_trans work */
5248 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5249  PmeZPencil *zpencil = (PmeZPencil *)param;
5250  zpencil->send_subset_trans(first, last);
5251 }
5252 
5253 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
5254  int zBlocks = initdata.zBlocks;
5255  int block3 = initdata.grid.block3;
5256  int dim3 = initdata.grid.dim3;
5257  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5258  int kb = send_order[isend];
5259  int nz = block3;
5260  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5261  int hd = ( hasData ? 1 : 0 );
5262  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5263  msg->lattice = lattice;
5264  msg->sourceNode = thisIndex.y;
5265  msg->hasData = hasData;
5266  msg->nx = ny;
5267  if ( hasData ) {
5268  float *md = msg->qgrid;
5269  const float *d = data;
5270  for ( int i=0; i<nx; ++i ) {
5271  for ( int j=0; j<ny; ++j, d += dim3 ) {
5272  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5273  *(md++) = d[2*k];
5274  *(md++) = d[2*k+1];
5275  }
5276  }
5277  }
5278  }
5279  msg->sequence = sequence;
5280  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5281 
5282  CmiEnableUrgentSend(1);
5283 #if USE_NODE_PAR_RECEIVE
5284  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5285 #if Y_PERSIST
5286  CmiUsePersistentHandle(&trans_handle[isend], 1);
5287 #endif
5288  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5289 #if Y_PERSIST
5290  CmiUsePersistentHandle(NULL, 0);
5291 #endif
5292 #else
5293 #if Y_PERSIST
5294  CmiUsePersistentHandle(&trans_handle[isend], 1);
5295 #endif
5296  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5297 #if Y_PERSIST
5298  CmiUsePersistentHandle(NULL, 0);
5299 #endif
5300 #endif
5301  CmiEnableUrgentSend(0);
5302  }
5303 }
5304 
5305 void PmeZPencil::send_trans() {
5306 #if USE_PERSISTENT
5307  if (trans_handle == NULL) setup_persistent();
5308 #endif
5309 #if CMK_SMP && USE_CKLOOP
5310  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5311  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5312  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5319  //send_subset_trans(0, initdata.zBlocks-1);
5320  CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
5321  return;
5322  }
5323 #endif
5324  int zBlocks = initdata.zBlocks;
5325  int block3 = initdata.grid.block3;
5326  int dim3 = initdata.grid.dim3;
5327  for ( int isend=0; isend<zBlocks; ++isend ) {
5328  int kb = send_order[isend];
5329  int nz = block3;
5330  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5331  int hd = ( hasData ? 1 : 0 );
5332  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5333  msg->lattice = lattice;
5334  msg->sourceNode = thisIndex.y;
5335  msg->hasData = hasData;
5336  msg->nx = ny;
5337  if ( hasData ) {
5338  float *md = msg->qgrid;
5339  const float *d = data;
5340  for ( int i=0; i<nx; ++i ) {
5341  for ( int j=0; j<ny; ++j, d += dim3 ) {
5342  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5343  *(md++) = d[2*k];
5344  *(md++) = d[2*k+1];
5345  }
5346  }
5347  }
5348  }
5349  msg->sequence = sequence;
5350  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5351 
5352  CmiEnableUrgentSend(1);
5353 #if USE_NODE_PAR_RECEIVE
5354  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5355 #if Y_PERSIST
5356  CmiUsePersistentHandle(&trans_handle[isend], 1);
5357 #endif
5358  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5359 #if Y_PERSIST
5360  CmiUsePersistentHandle(NULL, 0);
5361 #endif
5362 #else
5363 #if Y_PERSIST
5364  CmiUsePersistentHandle(&trans_handle[isend], 1);
5365 #endif
5366  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5367 #if Y_PERSIST
5368  CmiUsePersistentHandle(NULL, 0);
5369 #endif
5370 #endif
5371  CmiEnableUrgentSend(0);
5372  }
5373 }
5374 
5375 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
5376  if ( imsg == 0 ) {
5377  lattice = msg->lattice;
5378  sequence = msg->sequence;
5379  }
5380  int block2 = initdata.grid.block2;
5381  int K2 = initdata.grid.K2;
5382  int jb = msg->sourceNode;
5383  int ny = msg->nx;
5384  if ( msg->hasData ) {
5385  const float *md = msg->qgrid;
5386  float *d = data;
5387  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5388  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5389  for ( int k=0; k<nz; ++k ) {
5390 #ifdef ZEROCHECK
5391  if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
5392  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5393 #endif
5394  d[2*(j*nz+k)] = *(md++);
5395  d[2*(j*nz+k)+1] = *(md++);
5396  }
5397  }
5398  }
5399  } else {
5400  float *d = data;
5401  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5402  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5403  for ( int k=0; k<nz; ++k ) {
5404  d[2*(j*nz+k)] = 0;
5405  d[2*(j*nz+k)+1] = 0;
5406  }
5407  }
5408  }
5409  }
5410 }
5411 
5412 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
5413  PmeYPencil *ypencil = (PmeYPencil *)param;
5414  ypencil->forward_subset_fft(first, last);
5415 }
5416 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
5417 #ifdef NAMD_FFTW
5418 #ifdef NAMD_FFTW_3
5419  for(int i=fromIdx; i<=toIdx; i++){
5420  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5421  * nz * initdata.grid.K2,
5422  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5423  }
5424 #endif
5425 #endif
5426 }
5427 
5428 void PmeYPencil::forward_fft() {
5429  evir = 0.;
5430 #ifdef NAMD_FFTW
5431 #ifdef MANUAL_DEBUG_FFTW3
5432  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5433 #endif
5434 
5435 #ifdef NAMD_FFTW_3
5436 #if CMK_SMP && USE_CKLOOP
5437  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5438  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5439  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5440  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
5441  return;
5442  }
5443 #endif
5444  //the CkLoop call above parallelizes the serial loop below
5445  for ( int i=0; i<nx; ++i ) {
5446  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5447  * nz * initdata.grid.K2,
5448  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5449  }
5450 #else
5451  for ( int i=0; i<nx; ++i ) {
5452  fftw(forward_plan, nz,
5453  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
5454  nz, 1, (fftw_complex *) work, 1, 0);
5455  }
5456 #endif
5457 #ifdef MANUAL_DEBUG_FFTW3
5458  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5459 #endif
5460 
5461 #endif
5462 }
5463 
5464 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5465  PmeYPencil *ypencil = (PmeYPencil *)param;
5466  ypencil->send_subset_trans(first, last);
5467 }
5468 
5469 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
5470  int yBlocks = initdata.yBlocks;
5471  int block2 = initdata.grid.block2;
5472  int K2 = initdata.grid.K2;
5473  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5474  int jb = send_order[isend];
5475  int ny = block2;
5476  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5477  int hd = ( hasData ? 1 : 0 );
5478  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5479  msg->lattice = lattice;
5480  msg->sourceNode = thisIndex.x;
5481  msg->hasData = hasData;
5482  msg->nx = nx;
5483  if ( hasData ) {
5484  float *md = msg->qgrid;
5485  const float *d = data;
5486  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5487  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5488  for ( int k=0; k<nz; ++k ) {
5489  *(md++) = d[2*(j*nz+k)];
5490  *(md++) = d[2*(j*nz+k)+1];
5491  #ifdef ZEROCHECK
5492  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5493  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5494  #endif
5495  }
5496  }
5497  }
5498  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5499  thisIndex.x, jb, thisIndex.z);
5500  }
5501  msg->sequence = sequence;
5502  SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
5503  CmiEnableUrgentSend(1);
5504 #if USE_NODE_PAR_RECEIVE
5505  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5506 #if X_PERSIST
5507  CmiUsePersistentHandle(&trans_handle[isend], 1);
5508 #endif
5509  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5510 #if X_PERSIST
5511  CmiUsePersistentHandle(NULL, 0);
5512 #endif
5513 #else
5514 #if X_PERSIST
5515  CmiUsePersistentHandle(&trans_handle[isend], 1);
5516 #endif
5517  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5518 #if X_PERSIST
5519  CmiUsePersistentHandle(NULL, 0);
5520 #endif
5521 #endif
5522  CmiEnableUrgentSend(0);
5523  }
5524 }
5525 
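// Transpose Y -> X: split the K2 dimension into yBlocks blocks and send
// each block to the X pencil that owns it.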
5526 void PmeYPencil::send_trans() {
5527 #if USE_PERSISTENT
5528  if (trans_handle == NULL) setup_persistent();
5529 #endif
5530 #if CMK_SMP && USE_CKLOOP
5531  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5532  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5533  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5540  //send_subset_trans(0, initdata.yBlocks-1);
5541  CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
5542  return;
5543  }
5544 #endif
5545  int yBlocks = initdata.yBlocks;
5546  int block2 = initdata.grid.block2;
5547  int K2 = initdata.grid.K2;
5548  for ( int isend=0; isend<yBlocks; ++isend ) {
5549  int jb = send_order[isend];
5550  int ny = block2;
5551  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5552  int hd = ( hasData ? 1 : 0 );
5553  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5554  msg->lattice = lattice;
5555  msg->sourceNode = thisIndex.x;
5556  msg->hasData = hasData;
5557  msg->nx = nx;
5558  if ( hasData ) {
5559  float *md = msg->qgrid;
5560  const float *d = data;
5561  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5562  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5563  for ( int k=0; k<nz; ++k ) {
5564  *(md++) = d[2*(j*nz+k)];
5565  *(md++) = d[2*(j*nz+k)+1];
5566 #ifdef ZEROCHECK
5567  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5568  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5569 #endif
5570  }
5571  }
5572  }
5573  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5574  thisIndex.x, jb, thisIndex.z);
5575  }
5576  msg->sequence = sequence;
5577  SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
5578  CmiEnableUrgentSend(1);
5579 #if USE_NODE_PAR_RECEIVE
5580  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5581 #if X_PERSIST
5582  CmiUsePersistentHandle(&trans_handle[isend], 1);
5583 #endif
5584  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5585 #if X_PERSIST
5586  CmiUsePersistentHandle(NULL, 0);
5587 #endif
5588 #else
5589 #if X_PERSIST
5590  CmiUsePersistentHandle(&trans_handle[isend], 1);
5591 #endif
5592  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5593 #if X_PERSIST
5594  CmiUsePersistentHandle(NULL, 0);
5595 #endif
5596 
5597 #endif
5598  CmiEnableUrgentSend(0);
5599  }
5600 }
5601 
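// Node-level receive for the Y -> X transpose: count arrivals atomically;
// when all xBlocks messages are in, run the x FFT, the k-space evaluation,
// and the backward x FFT, then start the untranspose.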
5602 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
5603 {
5604  if(msg->hasData) hasData=1;
5605  needs_reply[msg->sourceNode] = msg->hasData;
5606  recv_trans(msg);
5607  int limsg;
5608  CmiMemoryAtomicFetchAndInc(imsg,limsg);
5609  if(limsg+1 == initdata.xBlocks)
5610  {
5611  if(hasData){
5612  forward_fft();
5613  pme_kspace();
5614  backward_fft();
5615  }
5616  send_untrans();
5617  imsg=0;
5618  CmiMemoryWriteFence();
5619  }
5620 }
5621 
5622 void PmeXPencil::recv_trans(const PmeTransMsg * msg) {
5623  if ( imsg == 0 ) {
5624  lattice = msg->lattice;
5625  sequence = msg->sequence;
5626  }
5627  int block1 = initdata.grid.block1;
5628  int K1 = initdata.grid.K1;
5629  int ib = msg->sourceNode;
5630  int nx = msg->nx;
5631  if ( msg->hasData ) {
5632  const float *md = msg->qgrid;
5633  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5634  float *d = data + i*ny*nz*2;
5635  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5636  for ( int k=0; k<nz; ++k ) {
5637 #ifdef ZEROCHECK
5638  if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
5639  ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
5640 #endif
5641  d[2*k] = *(md++);
5642  d[2*k+1] = *(md++);
5643  }
5644  }
5645  }
5646  } else {
5647  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5648  float *d = data + i*ny*nz*2;
5649  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5650  for ( int k=0; k<nz; ++k ) {
5651  d[2*k] = 0;
5652  d[2*k+1] = 0;
5653  }
5654  }
5655  }
5656  }
5657 }
5658 
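// Forward FFT along x; with FFTW3 a single plan covers the whole pencil
// unless CkLoop splits the work across per-slab plans.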
5659 void PmeXPencil::forward_fft() {
5660 #ifdef NAMD_FFTW
5661 
5662 #ifdef MANUAL_DEBUG_FFTW3
5663  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5664 #endif
5665 
5666 #ifdef NAMD_FFTW_3
5667 #if CMK_SMP && USE_CKLOOP
5668  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5669  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5670  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5671  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5672  //the CkLoop call below parallelizes the commented-out loop above
5673  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5674  return;
5675  }
5676 #endif
5677  fftwf_execute(forward_plan);
5678 #else
5679  fftw(forward_plan, ny*nz,
5680  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5681 #endif
5682 #ifdef MANUAL_DEBUG_FFTW3
5683  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5684 #endif
5685 
5686 #endif
5687 }
5688 
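// Reciprocal-space stage: with the full 3D FFT now resident in this X
// pencil, evaluate the PME energy and virial via PmeKSpace.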
5689 void PmeXPencil::pme_kspace() {
5690 
5691  evir = 0.;
5692 
5693 #ifdef FFTCHECK
5694  return;
5695 #endif
5696 
5697  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
5698 
5699  int useCkLoop = 0;
5700 #if CMK_SMP && USE_CKLOOP
5701  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
5702  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
5703  useCkLoop = 1;
5704  }
5705 #endif
5706 
5707  int numGrids = 1;
5708  for ( int g=0; g<numGrids; ++g ) {
5709  evir[0] = myKSpace->compute_energy(data+0*g,
5710  lattice, ewaldcof, &(evir[1]), useCkLoop);
5711  }
5712 
5713 #if USE_NODE_PAR_RECEIVE
5714  CmiMemoryWriteFence();
5715 #endif
5716 }
5717 
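// Backward FFT along x, the inverse of forward_fft() above.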
5718 void PmeXPencil::backward_fft() {
5719 #ifdef NAMD_FFTW
5720 #ifdef MANUAL_DEBUG_FFTW3
5721  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5722 #endif
5723 
5724 #ifdef NAMD_FFTW_3
5725 #if CMK_SMP && USE_CKLOOP
5726  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5727  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
5728  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5729  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
5730  //the CkLoop call below parallelizes the commented-out loop above
5731  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5732  return;
5733  }
5734 #endif
5735  fftwf_execute(backward_plan);
5736 #else
5737  fftw(backward_plan, ny*nz,
5738  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5739 #endif
5740 #ifdef MANUAL_DEBUG_FFTW3
5741  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5742 #endif
5743 #endif
5744 }
5745 
5746 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
5747  PmeXPencil *xpencil = (PmeXPencil *)param;
5748  xpencil->send_subset_untrans(first, last);
5749 }
5750 
5751 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx){
5752  int xBlocks = initdata.xBlocks;
5753  int block1 = initdata.grid.block1;
5754  int K1 = initdata.grid.K1;
5755 
5756  for(int isend=fromIdx; isend<=toIdx; isend++) {
5757  int ib = send_order[isend];
5758  if ( ! needs_reply[ib] ) {
5759  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5760  CmiEnableUrgentSend(1);
5761  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5762 #if USE_NODE_PAR_RECEIVE
5763  initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
5764 #else
5765  initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
5766 #endif
5767  CmiEnableUrgentSend(0);
5768  continue;
5769  }
5770  int nx = block1;
5771  if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5772  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
5773  msg->sourceNode = thisIndex.y;
5774  msg->ny = ny;
5775  float *md = msg->qgrid;
5776  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5777  float *d = data + i*ny*nz*2;
5778  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5779  for ( int k=0; k<nz; ++k ) {
5780  *(md++) = d[2*k];
5781  *(md++) = d[2*k+1];
5782  }
5783  }
5784  }
5785  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5786  CmiEnableUrgentSend(1);
5787 #if USE_NODE_PAR_RECEIVE
5788  msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5789 #if Y_PERSIST
5790  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5791 #endif
5792  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
5793 #if Y_PERSIST
5794  CmiUsePersistentHandle(NULL, 0);
5795 #endif
5796 #else
5797 #if Y_PERSIST
5798  // CmiUsePersistentHandle(&untrans_handle[isend], 1);
5799 #endif
5800  initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
5801 #if Y_PERSIST
5802  // CmiUsePersistentHandle(NULL, 0);
5803 #endif
5804 #endif
5805  CmiEnableUrgentSend(0);
5806  }
5807 }
5808 
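// Untranspose X -> Y: first report energy/virial to the reduction PE,
// then send each Y pencil its x-block of data, or just an ack for blocks
// that need no reply.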
5809 void PmeXPencil::send_untrans() {
5810 
5811  { // send energy and virial
5812  int numGrids = 1;
5813  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
5814  newmsg->evir[0] = evir;
5815  SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
5816  CmiEnableUrgentSend(1);
5817  initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
5818  CmiEnableUrgentSend(0);
5819  }
5820 
5821 #if USE_PERSISTENT
5822  if (untrans_handle == NULL) setup_persistent();
5823 #endif
5824 #if CMK_SMP && USE_CKLOOP
5825  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5826  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
5827  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5828  int xBlocks = initdata.xBlocks;
5829 
5830 #if USE_NODE_PAR_RECEIVE
5831  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
5832  CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
5833 #else
5834  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
5835  CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
5836 #endif
5837  return;
5838  }
5839 #endif
5840  int xBlocks = initdata.xBlocks;
5841  int block1 = initdata.grid.block1;
5842  int K1 = initdata.grid.K1;
5843  for ( int isend=0; isend<xBlocks; ++isend ) {
5844  int ib = send_order[isend];
5845  if ( ! needs_reply[ib] ) {
5846  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5847  CmiEnableUrgentSend(1);
5848  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5849 #if USE_NODE_PAR_RECEIVE
5850  initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
5851 #else
5852  initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
5853 #endif
5854  CmiEnableUrgentSend(0);
5855  continue;
5856  }
5857  int nx = block1;
5858  if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5859  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
5860  msg->sourceNode = thisIndex.y;
5861  msg->ny = ny;
5862  float *md = msg->qgrid;
5863  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5864  float *d = data + i*ny*nz*2;
5865  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5866  for ( int k=0; k<nz; ++k ) {
5867  *(md++) = d[2*k];
5868  *(md++) = d[2*k+1];
5869  }
5870  }
5871  }
5872  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5873 
5874  CmiEnableUrgentSend(1);
5875 #if USE_NODE_PAR_RECEIVE
5876  msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5877 #if Y_PERSIST
5878  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5879 #endif
5880  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
5881 #if Y_PERSIST
5882  CmiUsePersistentHandle(NULL, 0);
5883 #endif
5884 #else
5885 #if Y_PERSIST
5886  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5887 #endif
5888  initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
5889 #if Y_PERSIST
5890  CmiUsePersistentHandle(NULL, 0);
5891 #endif
5892 #endif
5893  CmiEnableUrgentSend(0);
5894  }
5895 }
5896 
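// Copy one untransposed x-block arriving from an X pencil back into this
// Y pencil's (nx x K2 x nz) slab.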
5897 void PmeYPencil::recv_untrans(const PmeUntransMsg * msg) {
5898  int block2 = initdata.grid.block2;
5899  int K2 = initdata.grid.K2;
5900  int jb = msg->sourceNode;
5901  int ny = msg->ny;
5902  const float *md = msg->qgrid;
5903  float *d = data;
5904  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5905 #if CMK_BLUEGENEL
5906  CmiNetworkProgress();
5907 #endif
5908  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5909  for ( int k=0; k<nz; ++k ) {
5910 #ifdef ZEROCHECK
5911  if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
5912  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5913 #endif
5914  d[2*(j*nz+k)] = *(md++);
5915  d[2*(j*nz+k)+1] = *(md++);
5916  }
5917  }
5918  }
5919 }
5920 
5921 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
5922  PmeYPencil *ypencil = (PmeYPencil *)param;
5923  ypencil->backward_subset_fft(first, last);
5924 }
5925 
5926 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
5927 #ifdef NAMD_FFTW
5928 #ifdef NAMD_FFTW_3
5929  for(int i=fromIdx; i<=toIdx; i++){
5930  fftwf_execute_dft(backward_plan,
5931  ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
5932  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5933  }
5934 #endif
5935 #endif
5936 }
5937 
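// Backward FFT along y, one transform per x-slab, mirroring forward_fft().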
5938 void PmeYPencil::backward_fft() {
5939 #ifdef NAMD_FFTW
5940 #ifdef MANUAL_DEBUG_FFTW3
5941  dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5942 #endif
5943 
5944 #ifdef NAMD_FFTW_3
5945 #if CMK_SMP && USE_CKLOOP
5946  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5947  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
5948  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5949  CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
5950  return;
5951  }
5952 #endif
5953  //the CkLoop call above parallelizes the serial loop below
5954  for ( int i=0; i<nx; ++i ) {
5955 #if CMK_BLUEGENEL
5956  CmiNetworkProgress();
5957 #endif
5958  fftwf_execute_dft(backward_plan,
5959  ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
5960  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5961  }
5962 #else
5963  for ( int i=0; i<nx; ++i ) {
5964 #if CMK_BLUEGENEL
5965  CmiNetworkProgress();
5966 #endif
5967  fftw(backward_plan, nz,
5968  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
5969  nz, 1, (fftw_complex *) work, 1, 0);
5970  }
5971 #endif
5972 
5973 #ifdef MANUAL_DEBUG_FFTW3
5974  dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5975 #endif
5976 
5977 #endif
5978 }
5979 
5980 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
5981  PmeYPencil *ypencil = (PmeYPencil *)param;
5982  ypencil->send_subset_untrans(first, last);
5983 }
5984 
5985 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx){
5986  int yBlocks = initdata.yBlocks;
5987  int block2 = initdata.grid.block2;
5988  int K2 = initdata.grid.K2;
5989 
5990  for(int isend=fromIdx; isend<=toIdx; isend++) {
5991  int jb = send_order[isend];
5992  if ( ! needs_reply[jb] ) {
5993  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5994  CmiEnableUrgentSend(1);
5995  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
5996 #if USE_NODE_PAR_RECEIVE
5997  initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
5998 #else
5999  initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
6000 #endif
6001  CmiEnableUrgentSend(0);
6002  continue;
6003  }
6004  int ny = block2;
6005  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
6006  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
6007  msg->sourceNode = thisIndex.z;
6008  msg->ny = nz;
6009  float *md = msg->qgrid;
6010  const float *d = data;
6011  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
6012  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
6013  for ( int k=0; k<nz; ++k ) {
6014  *(md++) = d[2*(j*nz+k)];
6015  *(md++) = d[2*(j*nz+k)+1];
6016  }
6017  }
6018  }
6019  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6020  CmiEnableUrgentSend(1);
6021 #if USE_NODE_PAR_RECEIVE
6022  msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
6023  // CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
6024 #if Z_PERSIST
6025  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6026 #endif
6027  initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
6028 #if Z_PERSIST
6029  CmiUsePersistentHandle(NULL, 0);
6030 #endif
6031 #else
6032 #if Z_PERSIST
6033  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6034 #endif
6035  initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
6036 #if Z_PERSIST
6037  CmiUsePersistentHandle(NULL, 0);
6038 #endif
6039 #endif
6040  CmiEnableUrgentSend(0);
6041  }
6042 }
6043 
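// Untranspose Y -> Z: send each Z pencil its y-block of data, or just an
// ack for blocks that need no reply.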
6044 void PmeYPencil::send_untrans() {
6045 #if USE_PERSISTENT
6046  if (untrans_handle == NULL) setup_persistent();
6047 #endif
6048 #if CMK_SMP && USE_CKLOOP
6049  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6050  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
6051  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
6052  int yBlocks = initdata.yBlocks;
6053 
6054 #if USE_NODE_PAR_RECEIVE
6055  //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
6056  CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 1);
6057 #else
6058  //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
6059  CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
6060 #endif
6061  return;
6062  }
6063 #endif
6064  int yBlocks = initdata.yBlocks;
6065  int block2 = initdata.grid.block2;
6066  int K2 = initdata.grid.K2;
6067  for ( int isend=0; isend<yBlocks; ++isend ) {
6068  int jb = send_order[isend];
6069  if ( ! needs_reply[jb] ) {
6070  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
6071  CmiEnableUrgentSend(1);
6072  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6073 #if USE_NODE_PAR_RECEIVE
6074  initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
6075 #else
6076  initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
6077 #endif
6078  CmiEnableUrgentSend(0);
6079  continue;
6080  }
6081  int ny = block2;
6082  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
6083  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
6084  msg->sourceNode = thisIndex.z;
6085  msg->ny = nz;
6086  float *md = msg->qgrid;
6087  const float *d = data;
6088  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
6089  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
6090  for ( int k=0; k<nz; ++k ) {
6091  *(md++) = d[2*(j*nz+k)];
6092  *(md++) = d[2*(j*nz+k)+1];
6093  }
6094  }
6095  }
6096  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6097 
6098  CmiEnableUrgentSend(1);
6099 #if USE_NODE_PAR_RECEIVE
6100  msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
6101  // CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
6102 #if Z_PERSIST
6103  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6104 #endif
6105  initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
6106 #if Z_PERSIST
6107  CmiUsePersistentHandle(NULL, 0);
6108 #endif
6109 #else
6110 #if Z_PERSIST
6111  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6112 #endif
6113  initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
6114 #if Z_PERSIST
6115  CmiUsePersistentHandle(NULL, 0);
6116 #endif
6117 #endif
6118  CmiEnableUrgentSend(0);
6119  }
6120 
6121 #if USE_NODE_PAR_RECEIVE
6122  evir = 0.;
6123  CmiMemoryWriteFence();
6124 #endif
6125 }
6126 
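// Copy one untransposed block from a Y pencil back into this Z pencil;
// note msg->sourceNode carries the sender's z index (kb) and msg->ny the
// block's z extent.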
6127 void PmeZPencil::recv_untrans(const PmeUntransMsg * msg) {
6128 #if ! USE_NODE_PAR_RECEIVE
6129  if(imsg==0) evir=0.;
6130 #endif
6131 
6132  int block3 = initdata.grid.block3;
6133  int dim3 = initdata.grid.dim3;
6134  int kb = msg->sourceNode;
6135  int nz = msg->ny;
6136  const float *md = msg->qgrid;
6137  float *d = data;
6138  for ( int i=0; i<nx; ++i ) {
6139 #if CMK_BLUEGENEL
6140  CmiNetworkProgress();
6141 #endif
6142  for ( int j=0; j<ny; ++j, d += dim3 ) {
6143  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
6144 #ifdef ZEROCHECK
6145  if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
6146  thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
6147 #endif
6148  d[2*k] = *(md++);
6149  d[2*k+1] = *(md++);
6150  }
6151  }
6152  }
6153 }
6154 
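// Final complex-to-real FFT along z; with FFTCHECK defined, the
// round-tripped grid is compared against the synthetic test pattern.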
6155 void PmeZPencil::backward_fft() {
6156 #ifdef NAMD_FFTW
6157 #ifdef MANUAL_DEBUG_FFTW3
6158  dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
6159 #endif
6160 #ifdef NAMD_FFTW_3
6161 #if CMK_SMP && USE_CKLOOP
6162  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6163  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
6164  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
6165  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
6166  //the CkLoop call below parallelizes the commented-out loop above
6167  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
6168  return;
6169  }
6170 #endif
6171  fftwf_execute(backward_plan);
6172 #else
6173  rfftwnd_complex_to_real(backward_plan, nx*ny,
6174  (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
6175 #endif
6176 #ifdef MANUAL_DEBUG_FFTW3
6177  dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
6178 #endif
6179 
6180 #endif
6181 
6182 #if CMK_BLUEGENEL
6183  CmiNetworkProgress();
6184 #endif
6185 
6186 #ifdef FFTCHECK
6187  int dim3 = initdata.grid.dim3;
6188  int K1 = initdata.grid.K1;
6189  int K2 = initdata.grid.K2;
6190  int K3 = initdata.grid.K3;
6191  float scale = 1. / (1. * K1 * K2 * K3);
6192  float maxerr = 0.;
6193  float maxstd = 0.;
6194  int mi, mj, mk; mi = mj = mk = -1;
6195  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
6196  const float *d = data;
6197  for ( int i=0; i<nx; ++i ) {
6198  for ( int j=0; j<ny; ++j, d += dim3 ) {
6199  for ( int k=0; k<K3; ++k ) {
6200  float std = 10. * (10. * (10. * std_base + i) + j) + k;
6201  float err = scale * d[k] - std;
6202  if ( fabsf(err) > fabsf(maxerr) ) {
6203  maxerr = err;
6204  maxstd = std;
6205  mi = i; mj = j; mk = k;
6206  }
6207  }
6208  }
6209  }
6210  CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
6211  thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
6212 #endif
6213 
6214 }
6215 
6216 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
6217  //CkLoop's interface allows at most 3 user parameters, so the pencil
6218  //pointer can be passed directly; no new parameter list is needed. -Chao Mei
6219  PmeZPencil *zpencil = (PmeZPencil *)param;
6220  zpencil->send_subset_ungrid(first, last);
6221 }
6222 
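// Return grid data to every client whose PmeGridMsg was collected in
// recv_grid, optionally parallelizing the sends with CkLoop.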
6223 void PmeZPencil::send_all_ungrid() {
6224 
6225 #if CMK_SMP && USE_CKLOOP
6226  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6227  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
6228  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
6229  //TODO: determine the best value for numChunks.
6230  CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
6231  return;
6232  }
6233 #endif
6234  send_subset_ungrid(0, grid_msgs.size()-1);
6235 }
6236 
6237 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx){
6238  for (int limsg=fromIdx; limsg <=toIdx; ++limsg ) {
6239  PmeGridMsg *msg = grid_msgs[limsg];
6240  send_ungrid(msg);
6241  }
6242 }
6243 
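// Repack one client's message with the back-transformed charges (only
// the z entries listed in zlist) and send it home; a client that sent no
// data just gets an ack.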
6244 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
6245 
6246 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
6247  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
6248 #else
6249  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
6250 #endif
6251 
6252  int pe = msg->sourceNode;
6253  if ( ! msg->hasData ) {
6254  delete msg;
6255  PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
6256  SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
6257  CmiEnableUrgentSend(1);
6258  initdata.pmeProxy[pe].recvAck(ackmsg);
6259  CmiEnableUrgentSend(0);
6260  return;
6261  }
6262  if ( ! hasData ) NAMD_bug("PmeZPencil::send_ungrid msg->hasData but not pencil->hasData");
6263  msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
6264  int dim3 = initdata.grid.dim3;
6265  int zlistlen = msg->zlistlen;
6266  int *zlist = msg->zlist;
6267  char *fmsg = msg->fgrid;
6268  float *qmsg = msg->qgrid;
6269  float *d = data;
6270  int numGrids = 1; // pencil FFT doesn't support multiple grids
6271  for ( int g=0; g<numGrids; ++g ) {
6272 #if CMK_BLUEGENEL
6273  CmiNetworkProgress();
6274 #endif
6275  for ( int i=0; i<nx; ++i ) {
6276  for ( int j=0; j<ny; ++j, d += dim3 ) {
6277  if( *(fmsg++) ) {
6278  for ( int k=0; k<zlistlen; ++k ) {
6279  *(qmsg++) = d[zlist[k]];
6280  }
6281  }
6282  }
6283  }
6284  }
6285  SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
6286  CmiEnableUrgentSend(1);
6287 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
6288  if ( offload ) {
6289  initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
6290  } else
6291 #endif
6292  initdata.pmeProxy[pe].recvUngrid(msg);
6293  CmiEnableUrgentSend(0);
6294 }
6295 
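// Node-level receive for incoming charge-grid messages: accumulate under
// the node lock; when the last message arrives, run the forward z FFT and
// start the Z -> Y transpose.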
6296 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
6297 {
6298 #if USE_NODE_PAR_RECEIVE
6299  CmiLock(ComputePmeMgr::fftw_plan_lock);
6300  CmiMemoryReadFence();
6301 #endif
6302  recv_grid(msg);
6303  if(msg->hasData) hasData=msg->hasData;
6304  int limsg;
6305  CmiMemoryAtomicFetchAndInc(imsg,limsg);
6306  grid_msgs[limsg] = msg;
6307  // CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);
6308  if(limsg+1 == grid_msgs.size())
6309  {
6310 
6311  if (hasData)
6312  {
6313  forward_fft();
6314  }
6315  send_trans();
6316  imsg=0;
6317  CmiMemoryWriteFence();
6318  // CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
6319  }
6320 #if USE_NODE_PAR_RECEIVE
6321  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
6322  CmiMemoryWriteFence();
6323 #endif
6324 }
6325 
6326 void PmeZPencil::recvNodeAck(PmeAckMsg *msg) {
6327  delete msg;
6328  node_process_untrans(NULL);
6329 }
6330 
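// Node-level receive for the returning untranspose: once all zBlocks
// messages (or acks) are in, run the backward z FFT, ship results back to
// clients, and reset pencil state for the next step.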
6331 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
6332 {
6333  if ( msg ) {
6334  if ( ! hasData ) NAMD_bug("PmeZPencil::node_process_untrans non-null msg but not hasData");
6335  recv_untrans(msg);
6336  } else if ( hasData ) NAMD_bug("PmeZPencil::node_process_untrans hasData but null msg");
6337 #if USE_NODE_PAR_RECEIVE
6338  CmiMemoryWriteFence();
6339  CmiLock(ComputePmeMgr::fftw_plan_lock);
6340 #endif
6341  int limsg;
6342  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
6343  if(limsg+1 == initdata.zBlocks)
6344  {
6345 #if USE_NODE_PAR_RECEIVE
6346  CmiMemoryReadFence();
6347 #endif
6348  if(hasData) {
6349  backward_fft();
6350  }
6351  send_all_ungrid();
6352  hasData=0;
6353  imsgb=0;
6354  evir = 0;
6355  memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3);
6356  CmiMemoryWriteFence();
6357  // CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
6358  }
6359 #if USE_NODE_PAR_RECEIVE
6360  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
6361 #endif
6362 }
6363 
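// Decide how many charge grids this simulation needs: extra grids for
// alchemical FEP/TI (plus decoupling), one per LES copy, or up to three
// for pair-interaction runs.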
6364 void ComputePme::select(void)
6365 {
6366  if ( CkMyRank() ) return;
6367 
6368  SimParameters *simParams = Node::Object()->simParameters;
6369 
6370  alchOn = simParams->alchOn;
6371  alchFepOn = simParams->alchFepOn;
6372  alchThermIntOn = simParams->alchThermIntOn;
6373  alchDecouple = alchOn && simParams->alchDecouple;
6374  alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
6375  lesOn = simParams->lesOn;
6376  lesFactor = simParams->lesFactor;
6377  pairOn = simParams->pairInteractionOn;
6378  selfOn = simParams->pairInteractionSelf;
6379 
6380  if ( alchOn ) {
6381  numGrids = 2;
6382  if (alchDecouple) numGrids += 2;
6383  if (alchElecLambdaStart || alchThermIntOn) numGrids++;
6384  } else if ( lesOn ) {
6385  numGrids = lesFactor;
6386  } else if ( pairOn ) {
6387  if ( selfOn ) pairOn = 0; // make pairOn and selfOn exclusive
6388  numGrids = (selfOn ? 1 : 3);
6389  } else {
6390  numGrids = 1;
6391  }
6392 
6393 }
6394 
6395 #include "ComputePmeMgr.def.h"
6396 