NAMD
ComputePme.C
#ifdef NAMD_FFTW
//#define MANUAL_DEBUG_FFTW3 1
#ifdef NAMD_FFTW_3
#include <fftw3.h>
#else
// fftw2 doesn't have these defined
#define fftwf_malloc fftw_malloc
#define fftwf_free fftw_free
#ifdef NAMD_FFTW_NO_TYPE_PREFIX
#include <fftw.h>
#include <rfftw.h>
#else
#include <sfftw.h>
#include <srfftw.h>
#endif
#endif
#endif

#include <vector>
#include <algorithm>
#include <deque>
using namespace std;

#include "InfoStream.h"
#include "Node.h"
#include "PatchMap.h"
#include "PatchMap.inl"
#include "AtomMap.h"
#include "ComputePme.h"
#include "ComputePmeMgr.decl.h"
#include "PmeBase.inl"
#include "PmeRealSpace.h"
#include "PmeKSpace.h"
#include "ComputeNonbondedUtil.h"
#include "PatchMgr.h"
#include "Molecule.h"
#include "ReductionMgr.h"
#include "ComputeMgr.h"
#include "ComputeMgr.decl.h"
// #define DEBUGM
#define MIN_DEBUG_LEVEL 3
#include "Debug.h"
#include "SimParameters.h"
#include "WorkDistrib.h"
#include "varsizemsg.h"
#include "Random.h"
#include "ckhashtable.h"
#include "Priorities.h"
#include "CudaUtils.h"
#include "ComputeMoa.h"
#include "ComputeMoaMgr.decl.h"

//#define USE_RANDOM_TOPO 1

//#define USE_TOPO_SFC 1
//#define USE_CKLOOP 1
//#include "TopoManager.h"

#include "DeviceCUDA.h"
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
void cuda_errcheck(const char *msg) {
  cudaError_t err;
  if ((err = cudaGetLastError()) != cudaSuccess) {
    char host[128];
    gethostname(host, 128);  host[127] = 0;
    char devstr[128] = "";
    int devnum;
    if ( cudaGetDevice(&devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d", devnum);
    }
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d pci %x:%x:%x", devnum,
        deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    }
    char errmsg[1024];
    sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
    NAMD_die(errmsg);
  }
}
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include "HipDefines.h"
#include <hip/hip_runtime.h>
#endif
void cuda_errcheck(const char *msg);
#ifdef WIN32
#define __thread __declspec(thread)
#endif
extern __thread DeviceCUDA *deviceCUDA;
#endif

#include "ComputePmeCUDAKernel.h"

#ifndef SQRT_PI
#define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
#endif

#if CMK_PERSISTENT_COMM
#define USE_PERSISTENT 1
#endif

#if USE_PERSISTENT
#define Z_PERSIST 1
#define Y_PERSIST 1
#define X_PERSIST 1
#endif

#if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
#define USE_NODE_PAR_RECEIVE 1
#endif

int ComputePmeUtil::numGrids;
Bool ComputePmeUtil::alchOn;
Bool ComputePmeUtil::alchFepOn;
Bool ComputePmeUtil::alchThermIntOn;
Bool ComputePmeUtil::alchDecouple;
BigReal ComputePmeUtil::alchElecLambdaStart;
Bool ComputePmeUtil::lesOn;
int ComputePmeUtil::lesFactor;
Bool ComputePmeUtil::pairOn;
Bool ComputePmeUtil::selfOn;

char *pencilPMEProcessors;

class PmeAckMsg : public CMessage_PmeAckMsg {
};

class PmeGridMsg : public CMessage_PmeGridMsg {
public:

  int sourceNode;
  int sequence;
  int hasData;
  Lattice lattice;
  int start;
  int len;
  int zlistlen;
  int *zlist;
  char *fgrid;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeTransMsg : public CMessage_PmeTransMsg {
public:

  int sourceNode;
  int sequence;
  int hasData;
  Lattice lattice;
  int x_start;
  int nx;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
public:
  PmeTransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

class PmeUntransMsg : public CMessage_PmeUntransMsg {
public:

  int sourceNode;
  int y_start;
  int ny;
  float *qgrid;
  CkArrayIndex3D destElem;
};

class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
public:
  PmeUntransMsg *msg;
  int *count;
  CmiNodeLock lock;
};

class PmeEvirMsg : public CMessage_PmeEvirMsg {
public:
  PmeReduction *evir;
};

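// PmePencilMap: a Charm++ array map for the 2-D pencil arrays.  ia and ib
// select which two of the three array indices are meaningful for a given
// pencil orientation, and data[] is a flat table of PE assignments that
// procNum() indexes directly.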
class PmePencilMap : public CBase_PmePencilMap {
public:
  PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
    : ia(i_a), ib(i_b), nb(n_b),
      size(n), data(newcopyint(n,d)) {
  }
  virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
    //Return an ``arrayHdl'', given some information about the array
    return 0;
  }
  virtual int procNum(int, const CkArrayIndex &i) {
    //Return the home processor number for this element of this array
    return data[ i.data()[ia] * nb + i.data()[ib] ];
  }
  virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
    int mype = CkMyPe();
    for ( int i=0; i < size; ++i ) {
      if ( data[i] == mype ) {
        CkArrayIndex3D ai(0,0,0);
        ai.data()[ia] = i / nb;
        ai.data()[ib] = i % nb;
        if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
        if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
        mgr->insertInitial(ai,msg);
        msg = 0;
      }
    }
    mgr->doneInserting();
    if ( msg ) CkFreeMsg(msg);
  }
private:
  const int ia, ib, nb, size;
  const int* const data;
  static int* newcopyint(int n, int *d) {
    int *newd = new int[n];
    memcpy(newd, d, n*sizeof(int));
    return newd;
  }
};

// use this idiom since messages don't have copy constructors
struct PmePencilInitMsgData {
  PmeGrid grid;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CProxy_ComputePmeMgr pmeProxy;
  CProxy_NodePmeMgr pmeNodeProxy;
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;
};

class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
public:
  PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
  PmePencilInitMsgData data;
};


struct LocalPmeInfo {
  int nx, x_start;
  int ny_after_transpose, y_start_after_transpose;
};

struct NodePmeInfo {
  int npe, pe_start, real_node;
};

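// Pick a PE that owns patches to receive the reciprocal-space energy and
// virial, searching outward: this PE first, then the other PEs of this
// SMP node, then the physical node, and finally all PEs.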
static int findRecipEvirPe() {
  PatchMap *patchMap = PatchMap::Object();
  {
    int mype = CkMyPe();
    if ( patchMap->numPatchesOnNode(mype) ) {
      return mype;
    }
  }
  {
    int node = CmiMyNode();
    int firstpe = CmiNodeFirst(node);
    int nodeSize = CmiNodeSize(node);
    int myrank = CkMyRank();
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = firstpe + (myrank+i)%nodeSize;
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  {
    int *pelist;
    int nodeSize;
    CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
    int myrank = 0;  // CkMyPe() is always in pelist, but initialize defensively
    for ( int i=0; i<nodeSize; ++i ) {
      if ( pelist[i] == CkMyPe() ) myrank = i;
    }
    for ( int i=0; i<nodeSize; ++i ) {
      int pe = pelist[(myrank+i)%nodeSize];
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  {
    int mype = CkMyPe();
    int npes = CkNumPes();
    for ( int i=0; i<npes; ++i ) {
      int pe = (mype+i)%npes;
      if ( patchMap->numPatchesOnNode(pe) ) {
        return pe;
      }
    }
  }
  NAMD_bug("findRecipEvirPe() failed!");
  return -999;  // should never happen
}


// Assigns gridPeMap and transPeMap to different sets of processors.
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
  int ncpus = CkNumPes();

  for ( int i=0; i<numGridPes; ++i ) {
    gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
  }
  std::sort(gridPeMap,gridPeMap+numGridPes);
  int firstTransPe = ncpus - numGridPes - numTransPes;
  if ( firstTransPe < 0 ) {
    firstTransPe = 0;
    // 0 should be first in list, skip if possible
    if ( ncpus > numTransPes ) firstTransPe = 1;
  }
  for ( int i=0; i<numTransPes; ++i ) {
    transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
  }
  std::sort(transPeMap,transPeMap+numTransPes);
}

#if USE_TOPOMAP
//Topology aware PME allocation
bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
                             int nbpes=0);
#endif

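// Compare integers by their bit-reversed binary values.  Sorting PEs and
// pencil indices in bit-reversed order scatters consecutive indices
// across the machine, spreading PME communication load.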
int compare_bit_reversed(int a, int b) {
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return (a & c) - (b & c);
}

inline bool less_than_bit_reversed(int a, int b) {
  int d = a ^ b;
  int c = 1;
  if ( d ) while ( ! (d & c) ) {
    c = c << 1;
  }
  return d && (b & c);
}

struct sortop_bit_reversed {
  inline bool operator() (int a, int b) const {
    return less_than_bit_reversed(a,b);
  }
};

struct ijpair {
  int i,j;
  ijpair() {;}
  ijpair(int I, int J) : i(I), j(J) {;}
};

struct ijpair_sortop_bit_reversed {
  inline bool operator() (const ijpair &a, const ijpair &b) const {
    return ( less_than_bit_reversed(a.i,b.i)
             || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
  }
};

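// ComputePmeMgr is the per-PE Charm++ group that drives PME: it collects
// charges from local patches, runs the grid/trans/untrans/ungrid message
// pipeline (or hands off to the pencil arrays), performs the FFTs, and
// returns reciprocal-space forces and energies to its client computes.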
class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
public:
  friend class ComputePme;
  friend class NodePmeMgr;
  ComputePmeMgr();
  ~ComputePmeMgr();

  void initialize(CkQdMsg*);
  void initialize_pencils(CkQdMsg*);
  void activate_pencils(CkQdMsg*);
  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
  void initialize_computes();

  void sendData(Lattice &, int sequence);
  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
  Lattice *sendDataHelper_lattice;
  int sendDataHelper_sequence;
  int sendDataHelper_sourcepe;
  int sendDataHelper_errors;
  void sendPencils(Lattice &, int sequence);
  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
  void recvGrid(PmeGridMsg *);
  void gridCalc1(void);
  void sendTransBarrier(void);
  void sendTransSubset(int first, int last);
  void sendTrans(void);
  void fwdSharedTrans(PmeTransMsg *);
  void recvSharedTrans(PmeSharedTransMsg *);
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void procTrans(PmeTransMsg *);
  void gridCalc2(void);
  #ifdef OPENATOM_VERSION
  void gridCalc2Moa(void);
  #endif // OPENATOM_VERSION
  void gridCalc2R(void);
  void fwdSharedUntrans(PmeUntransMsg *);
  void recvSharedUntrans(PmeSharedUntransMsg *);
  void sendUntrans(void);
  void sendUntransSubset(int first, int last);
  void recvUntrans(PmeUntransMsg *);
  void procUntrans(PmeUntransMsg *);
  void gridCalc3(void);
  void sendUngrid(void);
  void sendUngridSubset(int first, int last);
  void recvUngrid(PmeGridMsg *);
  void recvAck(PmeAckMsg *);
  void copyResults(PmeGridMsg *);
  void copyPencils(PmeGridMsg *);
  void ungridCalc(void);
  void recvRecipEvir(PmeEvirMsg *);
  void addRecipEvirClient(void);
  void submitReductions();

#if 0 && USE_PERSISTENT
  void setup_recvgrid_persistent();
#endif

  static CmiNodeLock fftw_plan_lock;
  CmiNodeLock pmemgr_lock;  // for accessing this object from other threads

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  float *a_data_host;
  float *a_data_dev;
  float *f_data_host;
  float *f_data_dev;
  int cuda_atoms_count;
  int cuda_atoms_alloc;
  static CmiNodeLock cuda_lock;
  void chargeGridSubmitted(Lattice &lattice, int sequence);
  cudaEvent_t end_charges;
  cudaEvent_t *end_forces;
  int forces_count;
  int forces_done_count;
  double charges_time;
  double forces_time;
  int check_charges_count;
  int check_forces_count;
  int master_pe;
  int this_pe;

  void cuda_submit_charges(Lattice &lattice, int sequence);
  struct cuda_submit_charges_args {
    ComputePmeMgr *mgr; Lattice *lattice; int sequence;
  };
  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
  static bool cuda_busy;

  int chargeGridSubmittedCount;
  void sendChargeGridReady();
#endif
  Lattice *saved_lattice;  // saved by chargeGridSubmitted
  int saved_sequence;      // saved by chargeGridSubmitted
  void pollChargeGridReady();
  void pollForcesReady();
  void recvChargeGridReady();
  void chargeGridReady(Lattice &lattice, int sequence);

  ResizeArray<ComputePme*> pmeComputes;

private:

#if 0 && USE_PERSISTENT
  PersistentHandle *recvGrid_handle;
#endif

  CProxy_ComputePmeMgr pmeProxy;
  CProxy_ComputePmeMgr pmeProxyDir;
  CProxy_NodePmeMgr pmeNodeProxy;
  NodePmeMgr *nodePmeMgr;
  ComputePmeMgr *masterPmeMgr;

  void addCompute(ComputePme *c) {
    if ( ! pmeComputes.size() ) initialize_computes();
    pmeComputes.add(c);
    c->setMgr(this);
  }

  ResizeArray<ComputePme*> heldComputes;
  PmeGrid myGrid;
  Lattice lattice;
  PmeKSpace *myKSpace;
  float *qgrid;
  float *kgrid;

#ifdef NAMD_FFTW
#ifdef NAMD_FFTW_3
  fftwf_plan *forward_plan_x, *backward_plan_x;
  fftwf_plan *forward_plan_yz, *backward_plan_yz;
  fftwf_complex *work;
#else
  fftw_plan forward_plan_x, backward_plan_x;
  rfftwnd_plan forward_plan_yz, backward_plan_yz;
  fftw_complex *work;
#endif
#else
  float *work;
#endif

  int qsize, fsize, bsize;
  int offload;
  BigReal alchLambda;   // set on each step in ComputePme::ungridForces()
  BigReal alchLambda2;  // set on each step in ComputePme::ungridForces()

  float **q_arr;
  // q_list and q_count not used for offload
  float **q_list;
  int q_count;
  char *f_arr;
  char *fz_arr;
  PmeReduction evir[PME_MAX_EVALS];
  SubmitReduction *reduction;

  int noWorkCount;
  int doWorkCount;
  int ungridForcesCount;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#define NUM_STREAMS 1
  cudaStream_t streams[NUM_STREAMS];
  int stream;

  float **q_arr_dev;
  float **v_arr_dev;
  float *q_data_host;
  float *q_data_dev;
  float *v_data_dev;
  int *ffz_host;
  int *ffz_dev;
  int q_data_size;
  int ffz_size;

  int f_data_mgr_alloc;
  float *f_data_mgr_host;
  float *f_data_mgr_dev;
  float **afn_host;
  float **afn_dev;

  float *bspline_coeffs_dev;
  float *bspline_dcoeffs_dev;
#endif
  int recipEvirCount;    // used in compute only
  int recipEvirClients;  // used in compute only
  int recipEvirPe;       // used in trans only

  LocalPmeInfo *localInfo;
  NodePmeInfo *gridNodeInfo;
  NodePmeInfo *transNodeInfo;
  int qgrid_size;
  int qgrid_start;
  int qgrid_len;
  int fgrid_start;
  int fgrid_len;

  int numSources;
  int numGridPes;
  int numTransPes;
  int numGridNodes;
  int numTransNodes;
  int numDestRecipPes;
  int myGridPe, myGridNode;
  int myTransPe, myTransNode;
  int *gridPeMap;
  int *transPeMap;
  int *recipPeDest;
  int *gridPeOrder;
  int *gridNodeOrder;
  int *transNodeOrder;
  int grid_count;
  int trans_count;
  int untrans_count;
  int ungrid_count;
  PmeGridMsg **gridmsg_reuse;
  PmeReduction recip_evir2[PME_MAX_EVALS];

  int compute_sequence;  // set from patch computes, used for priorities
  int grid_sequence;     // set from grid messages, used for priorities
  int useBarrier;
  int sendTransBarrier_received;

  int usePencils;
  int xBlocks, yBlocks, zBlocks;
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  char *pencilActive;
  ijpair *activePencils;
  int numPencilsActive;
  int strayChargeErrors;
};

ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
  return mgr->pmeComputes ;
}

CmiNodeLock ComputePmeMgr::fftw_plan_lock;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
CmiNodeLock ComputePmeMgr::cuda_lock;
std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
bool ComputePmeMgr::cuda_busy;
#endif

int isPmeProcessor(int p){
  SimParameters *simParams = Node::Object()->simParameters;
  if (simParams->usePMECUDA) {
    return 0;
  } else {
    return pencilPMEProcessors[p];
  }
}

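// NodePmeMgr is the node-level (nodegroup) counterpart of ComputePmeMgr:
// it records each rank's manager and the pencil objects on this node so
// that messages delivered anywhere on an SMP node can be forwarded to
// the correct rank without an extra network hop.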
class NodePmeMgr : public CBase_NodePmeMgr {
public:
  friend class ComputePmeMgr;
  friend class ComputePme;
  NodePmeMgr();
  ~NodePmeMgr();
  void initialize();
  void sendDataHelper(int);
  void sendPencilsHelper(int);
  void recvTrans(PmeTransMsg *);
  void recvUntrans(PmeUntransMsg *);
  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
  void recvXTrans(PmeTransMsg *);
  void recvYTrans(PmeTransMsg *);
  void recvYUntrans(PmeUntransMsg *);
  void recvZGrid(PmeGridMsg *);
  void recvZUntrans(PmeUntransMsg *);

  void recvUngrid(PmeGridMsg *);

  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
    xm=_xm; ym=_ym; zm=_zm;
  }
  CProxy_PmePencilMap xm;
  CProxy_PmePencilMap ym;
  CProxy_PmePencilMap zm;

private:
  CProxy_ComputePmeMgr mgrProxy;
  ComputePmeMgr *mgrObject;
  ComputePmeMgr **mgrObjects;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  ComputePmeMgr *masterPmeMgr;
  int master_pe;
#endif
  CProxy_PmeXPencil xPencil;
  CProxy_PmeYPencil yPencil;
  CProxy_PmeZPencil zPencil;
  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  cudaEvent_t end_charge_memset;
  cudaEvent_t end_all_pme_kernels;
  cudaEvent_t end_potential_memcpy;
#endif
};

NodePmeMgr::NodePmeMgr() {
  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
}

NodePmeMgr::~NodePmeMgr() {
  delete [] mgrObjects;
}

void NodePmeMgr::initialize() {
  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
  if ( CkMyRank() == 0 ) {
    mgrProxy = proxy;
    mgrObject = proxy.ckLocalBranch();
  }
}

void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
  mgrObject->fwdSharedTrans(msg);
}

void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
  mgrObject->fwdSharedUntrans(msg);
}

void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  masterPmeMgr->recvUngrid(msg);
#else
  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
#endif
}

void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  xPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  yPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}
void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
{
  CmiLock(ComputePmeMgr::fftw_plan_lock);
  zPencilObj.put(idx)=obj;
  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
}

ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
                                 pmeProxyDir(thisgroup) {

  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();

  pmeNodeProxy.ckLocalBranch()->initialize();

  if ( CmiMyRank() == 0 ) {
    fftw_plan_lock = CmiCreateLock();
  }
  pmemgr_lock = CmiCreateLock();

  myKSpace = 0;
  kgrid = 0;
  work = 0;
  grid_count = 0;
  trans_count = 0;
  untrans_count = 0;
  ungrid_count = 0;
  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
  useBarrier = 0;
  sendTransBarrier_received = 0;
  usePencils = 0;

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // offload has not been set so this happens on every run
  if ( CmiMyRank() == 0 ) {
    cuda_lock = CmiCreateLock();
  }

#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
  //if ( CkMyNode() == 0 ) {
  //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
  //}
#define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
#else
#define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
#endif

  stream = 0;
  for ( int i=0; i<NUM_STREAMS; ++i ) {
#if 1
    CUDA_STREAM_CREATE(&streams[i]);
    cuda_errcheck("cudaStreamCreate");
#else
    streams[i] = 0;  // XXXX Testing!!!
#endif
  }

  this_pe = CkMyPe();

  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
  end_forces = 0;
  check_charges_count = 0;
  check_forces_count = 0;
  chargeGridSubmittedCount = 0;

  cuda_atoms_count = 0;
  cuda_atoms_alloc = 0;

  f_data_mgr_alloc = 0;
  f_data_mgr_host = 0;
  f_data_mgr_dev = 0;
  afn_host = 0;
  afn_dev = 0;

#define CUDA_EVENT_ID_PME_CHARGES 80
#define CUDA_EVENT_ID_PME_FORCES 81
#define CUDA_EVENT_ID_PME_TICK 82
#define CUDA_EVENT_ID_PME_COPY 83
#define CUDA_EVENT_ID_PME_KERNEL 84
  if ( 0 == CkMyPe() ) {
    traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
    traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
    traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
    traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
    traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
  }
#endif
  recipEvirCount = 0;
  recipEvirClients = 0;
  recipEvirPe = -999;
}


void ComputePmeMgr::recvArrays(
    CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
  xPencil = x;  yPencil = y;  zPencil = z;

  if(CmiMyRank()==0)
  {
    pmeNodeProxy.ckLocalBranch()->xPencil=x;
    pmeNodeProxy.ckLocalBranch()->yPencil=y;
    pmeNodeProxy.ckLocalBranch()->zPencil=z;
  }
}

#if USE_TOPO_SFC
struct Coord
{
  int x, y, z;
  Coord(): x(0), y(0), z(0) {}
  Coord(int a, int b, int c): x(a), y(b), z(c) {}
};
extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);

void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
{
  SortableResizeArray<int> newprocs(procs.size());
  int num = 0;
  for (int i=0; i<result.size(); i++) {
    Coord &c = result[i];
    for (int j=0; j<procs.size(); j++) {
      int pe = procs[j];
      int x,y,z,t;
      tmgr.rankToCoordinates(pe, x, y, z, t);
      if (x==c.x && y==c.y && z==c.z)
        newprocs[num++] = pe;
    }
  }
  CmiAssert(newprocs.size() == procs.size());
  procs = newprocs;
}

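// Factor x into a*b with a as close to sqrt(x) as possible; returns the
// larger cofactor b, used to build the two-level grid passed to SFC_grid().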
int find_level_grid(int x)
{
  int a = sqrt(x);
  int b;
  for (; a>0; a--) {
    if (x%a == 0) break;
  }
  if (a==1) a = x;
  b = x/a;
  //return a>b?a:b;
  return b;
}
CmiNodeLock tmgr_lock;
#endif

void Pme_init()
{
#if USE_TOPO_SFC
  if (CkMyRank() == 0)
    tmgr_lock = CmiCreateLock();
#endif
}

void ComputePmeMgr::initialize(CkQdMsg *msg) {
  delete msg;

  localInfo = new LocalPmeInfo[CkNumPes()];
  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
  transNodeInfo = new NodePmeInfo[CkNumNodes()];
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];

  if (CkMyRank() == 0) {
    pencilPMEProcessors = new char [CkNumPes()];
    memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
  }

  SimParameters *simParams = Node::Object()->simParameters;
  PatchMap *patchMap = PatchMap::Object();

  offload = simParams->PMEOffload;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  if ( offload && ! deviceCUDA->one_device_per_node() ) {
    NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
  }
  if ( offload ) {
    int dev;
    cudaGetDevice(&dev);
    cuda_errcheck("in cudaGetDevice");
    if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cuda_errcheck("in cudaGetDeviceProperties");
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
  }
#endif

  alchLambda = -1.;  // illegal value to catch if not updated
  alchLambda2 = -1.;
  useBarrier = simParams->PMEBarrier;

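  // Choose between 1-D slab and 2-D pencil decomposition.  Slabs need fewer
  // messages, so pencils are chosen only when they expose substantially more
  // parallelism (more than 3x as many pencils as slabs, subject to the
  // PMEMinSlices and PMEMinPoints work limits), or when forced by PMEPencils.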
  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  else {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int dimx = simParams->PMEGridSizeX;
    int dimy = simParams->PMEGridSizeY;
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
                       * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    else usePencils = 0;
  }

  if ( usePencils ) {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( simParams->PMEPencils > 1 &&
         simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    } else {
      int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
                  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
      yBlocks = nb2 / nb;
    }

    if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
    if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
    if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;

    int dimx = simParams->PMEGridSizeX;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;

    int dimy = simParams->PMEGridSizeY;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;

    int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;

    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << xBlocks << " x " <<
        yBlocks << " x " << zBlocks <<
        " pencil grid for FFT and reciprocal sum.\n" << endi;
    }
  } else { // usePencils

    { // decide how many pes to use for reciprocal sum

      // rules based on work available
      int minslices = simParams->PMEMinSlices;
      int dimx = simParams->PMEGridSizeX;
      int nrpx = ( dimx + minslices - 1 ) / minslices;
      int dimy = simParams->PMEGridSizeY;
      int nrpy = ( dimy + minslices - 1 ) / minslices;

      // rules based on processors available
      int nrpp = CkNumPes();
      // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
      if ( nrpp < nrpx ) nrpx = nrpp;
      if ( nrpp < nrpy ) nrpy = nrpp;

      // user override
      int nrps = simParams->PMEProcessors;
      if ( nrps > CkNumPes() ) nrps = CkNumPes();
      if ( nrps > 0 ) nrpx = nrps;
      if ( nrps > 0 ) nrpy = nrps;

      // make sure there aren't any totally empty processors
      int bx = ( dimx + nrpx - 1 ) / nrpx;
      nrpx = ( dimx + bx - 1 ) / bx;
      int by = ( dimy + nrpy - 1 ) / nrpy;
      nrpy = ( dimy + by - 1 ) / by;
      if ( bx != ( dimx + nrpx - 1 ) / nrpx )
        NAMD_bug("Error in selecting number of PME processors.");
      if ( by != ( dimy + nrpy - 1 ) / nrpy )
        NAMD_bug("Error in selecting number of PME processors.");

      numGridPes = nrpx;
      numTransPes = nrpy;
    }
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
        " processors for FFT and reciprocal sum.\n" << endi;
    }

    int sum_npes = numTransPes + numGridPes;
    int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
    /* This code is being disabled permanently for slab PME on Blue Gene machines */
    PatchMap * pmap = PatchMap::Object();

    int patch_pes = pmap->numNodesWithPatches();
    TopoManager tmgr;
    if(tmgr.hasMultipleProcsPerNode())
      patch_pes *= 2;

    bool done = false;
    if(CkNumPes() > 2*sum_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, numTransPes);
      done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
    }
    else
      if(CkNumPes() > 2 *max_npes + patch_pes) {
        done = generateBGLORBPmePeList(transPeMap, max_npes);
        gridPeMap = transPeMap;
      }

    if (!done)
#endif
    {
      //generatePmePeList(transPeMap, max_npes);
      //gridPeMap = transPeMap;
      generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME GRID LOCATIONS:";
      int i;
      for ( i=0; i<numGridPes && i<10; ++i ) {
        iout << " " << gridPeMap[i];
      }
      if ( i < numGridPes ) iout << " ...";
      iout << "\n" << endi;
      iout << iINFO << "PME TRANS LOCATIONS:";
      for ( i=0; i<numTransPes && i<10; ++i ) {
        iout << " " << transPeMap[i];
      }
      if ( i < numTransPes ) iout << " ...";
      iout << "\n" << endi;
    }

    // sort based on nodes and physical nodes
    std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());

    myGridPe = -1;
    myGridNode = -1;
    int i = 0;
    int node = -1;
    int real_node = -1;
    for ( i=0; i<numGridPes; ++i ) {
      if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
      int real_node_i = CkNodeOf(gridPeMap[i]);
      if ( real_node_i == real_node ) {
        gridNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        gridNodeInfo[node].real_node = real_node;
        gridNodeInfo[node].pe_start = i;
        gridNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myGridNode = node;
    }
    numGridNodes = node + 1;
    myTransPe = -1;
    myTransNode = -1;
    node = -1;
    real_node = -1;
    for ( i=0; i<numTransPes; ++i ) {
      if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
      int real_node_i = CkNodeOf(transPeMap[i]);
      if ( real_node_i == real_node ) {
        transNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        transNodeInfo[node].real_node = real_node;
        transNodeInfo[node].pe_start = i;
        transNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myTransNode = node;
    }
    numTransNodes = node + 1;

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
           << numTransNodes << " TRANS NODES\n" << endi;
    }

    { // generate random orderings for grid and trans messages
      int i;
      for ( i = 0; i < numGridPes; ++i ) {
        gridPeOrder[i] = i;
      }
      Random rand(CkMyPe());
      if ( myGridPe < 0 ) {
        rand.reorder(gridPeOrder,numGridPes);
      } else {  // self last
        gridPeOrder[myGridPe] = numGridPes-1;
        gridPeOrder[numGridPes-1] = myGridPe;
        rand.reorder(gridPeOrder,numGridPes-1);
      }
      for ( i = 0; i < numGridNodes; ++i ) {
        gridNodeOrder[i] = i;
      }
      if ( myGridNode < 0 ) {
        rand.reorder(gridNodeOrder,numGridNodes);
      } else {  // self last
        gridNodeOrder[myGridNode] = numGridNodes-1;
        gridNodeOrder[numGridNodes-1] = myGridNode;
        rand.reorder(gridNodeOrder,numGridNodes-1);
      }
      for ( i = 0; i < numTransNodes; ++i ) {
        transNodeOrder[i] = i;
      }
      if ( myTransNode < 0 ) {
        rand.reorder(transNodeOrder,numTransNodes);
      } else {  // self last
        transNodeOrder[myTransNode] = numTransNodes-1;
        transNodeOrder[numTransNodes-1] = myTransNode;
        rand.reorder(transNodeOrder,numTransNodes-1);
      }
    }

  } // ! usePencils

  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block3 = myGrid.dim3 / 2;  // complex
  }

  if ( usePencils ) {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex


    int pe = 0;
    int x,y,z;

    SortableResizeArray<int> zprocs(xBlocks*yBlocks);
    SortableResizeArray<int> yprocs(xBlocks*zBlocks);
    SortableResizeArray<int> xprocs(yBlocks*zBlocks);

    // decide which pes to use by bit reversal and patch use
    int i;
    int ncpus = CkNumPes();
    SortableResizeArray<int> patches, nopatches, pmeprocs;
    PatchMap *pmap = PatchMap::Object();
    for ( int icpu=0; icpu<ncpus; ++icpu ) {
      int ri = WorkDistrib::peDiffuseOrdering[icpu];
      if ( ri ) {  // keep 0 for special case
        // pretend pe 1 has patches to avoid placing extra PME load on node
        if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
        else nopatches.add(ri);
      }
    }

#if USE_RANDOM_TOPO
    Random rand(CkMyPe());
    int *tmp = new int[patches.size()];
    int nn = patches.size();
    for (i=0;i<nn;i++) tmp[i] = patches[i];
    rand.reorder(tmp, nn);
    patches.resize(0);
    for (i=0;i<nn;i++) patches.add(tmp[i]);
    delete [] tmp;
    tmp = new int[nopatches.size()];
    nn = nopatches.size();
    for (i=0;i<nn;i++) tmp[i] = nopatches[i];
    rand.reorder(tmp, nn);
    nopatches.resize(0);
    for (i=0;i<nn;i++) nopatches.add(tmp[i]);
    delete [] tmp;
#endif

    // only use zero if it eliminates overloading or has patches
    int useZero = 0;
    int npens = xBlocks*yBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += xBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += yBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;

    // add nopatches then patches in reversed order
    for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
    if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
    for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
    if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);

    int npes = pmeprocs.size();
    for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
    zprocs.sort();
#endif
    for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
    yprocs.sort();
#endif
    for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    xprocs.sort();
#endif

#if USE_TOPO_SFC
    CmiLock(tmgr_lock);
    //{
    TopoManager tmgr;
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
    if(CkMyPe() == 0)
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);

    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    //}
    CmiUnlock(tmgr_lock);
#endif


    if(CkMyPe() == 0){
      iout << iINFO << "PME Z PENCIL LOCATIONS:";
      for ( i=0; i<zprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
        iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << zprocs[i];
#endif
      }
      if ( i < zprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          pencilPMEProcessors[zprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME Y PENCIL LOCATIONS:";
      for ( i=0; i<yprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
        iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << yprocs[i];
#endif
      }
      if ( i < yprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, z = 0; z < zBlocks; ++z )
        for (x = 0; x < xBlocks; ++x, ++pe ) {
          pencilPMEProcessors[yprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME X PENCIL LOCATIONS:";
      for ( i=0; i<xprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
        iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << xprocs[i];
#endif
      }
      if ( i < xprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          pencilPMEProcessors[xprocs[pe]] = 1;
        }
    }

    // creating the pencil arrays
    if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
      // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
      WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
      std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
      std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
#endif
#if 1
      CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
      CProxy_PmePencilMap ym;
      if ( simParams->PMEPencilsYLayout )
        ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin());  // new
      else
        ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin());  // old
      CProxy_PmePencilMap xm;
      if ( simParams->PMEPencilsXLayout )
        xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin());  // new
      else
        xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin());  // old
      pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
      CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
      CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
      CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
      zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
      yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
      xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
      zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
#else
      zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          zPencil(x,y,0).insert(zprocs[pe]);
        }
      zPencil.doneInserting();

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          yPencil(x,0,z).insert(yprocs[pe]);
        }
      yPencil.doneInserting();


      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          xPencil(0,y,z).insert(xprocs[pe]);
        }
      xPencil.doneInserting();
#endif

      pmeProxy.recvArrays(xPencil,yPencil,zPencil);
      PmePencilInitMsgData msgdata;
      msgdata.grid = myGrid;
      msgdata.xBlocks = xBlocks;
      msgdata.yBlocks = yBlocks;
      msgdata.zBlocks = zBlocks;
      msgdata.xPencil = xPencil;
      msgdata.yPencil = yPencil;
      msgdata.zPencil = zPencil;
      msgdata.pmeProxy = pmeProxyDir;
      msgdata.pmeNodeProxy = pmeNodeProxy;
      msgdata.xm = xm;
      msgdata.ym = ym;
      msgdata.zm = zm;
      xPencil.init(new PmePencilInitMsg(msgdata));
      yPencil.init(new PmePencilInitMsg(msgdata));
      zPencil.init(new PmePencilInitMsg(msgdata));
    }

    return;  // continue in initialize_pencils() at next startup stage
  }

  int pe;
  int nx = 0;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    localInfo[pe].x_start = nx;
    nx += myGrid.block1;
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  int ny = 0;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    localInfo[pe].y_start_after_transpose = ny;
    ny += myGrid.block2;
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    localInfo[pe].ny_after_transpose =
      ny - localInfo[pe].y_start_after_transpose;
  }

  { // decide how many pes this node exchanges charges with

    PatchMap *patchMap = PatchMap::Object();
    Lattice lattice = simParams->lattice;
    BigReal sysdima = lattice.a_r().unit() * lattice.a();
    BigReal cutoff = simParams->cutoff;
    BigReal patchdim = simParams->patchDimension;
    int numPatches = patchMap->numPatches();
    int numNodes = CkNumPes();
    int *source_flags = new int[numNodes];
    int node;
    for ( node=0; node<numNodes; ++node ) {
      source_flags[node] = 0;
      recipPeDest[node] = 0;
    }

    // // make sure that we don't get ahead of ourselves on this node
    // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
    //   source_flags[CkMyPe()] = 1;
    //   recipPeDest[myRecipPe] = 1;
    // }

    for ( int pid=0; pid < numPatches; ++pid ) {
      int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
      int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
      BigReal minx = patchMap->min_a(pid);
      BigReal maxx = patchMap->max_a(pid);
      BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
      // min1 (max1) is smallest (largest) grid line for this patch
      int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
      int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
      for ( int i=min1; i<=max1; ++i ) {
        int ix = i;
        while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
        while ( ix < 0 ) ix += myGrid.K1;
        // set source_flags[pnode] if this patch sends to our node
        if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
             ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
          source_flags[pnode] = 1;
        }
        // set dest_flags[] for node that our patch sends to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( offload ) {
          if ( pnode == CkNodeFirst(CkMyNode()) ) {
            recipPeDest[ix / myGrid.block1] = 1;
          }
        } else
#endif
        if ( pnode == CkMyPe() ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      }
    }

    int numSourcesSamePhysicalNode = 0;
    numSources = 0;
    numDestRecipPes = 0;
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) ++numSources;
      if ( recipPeDest[node] ) ++numDestRecipPes;
      if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    }

#if 0
    if ( numSources ) {
      CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
               CkMyPe(), numSourcesSamePhysicalNode, numSources);
      iout << iINFO << "PME " << CkMyPe() << " sources:";
      for ( node=0; node<numNodes; ++node ) {
        if ( source_flags[node] ) iout << " " << node;
      }
      iout << "\n" << endi;
    }
#endif

    delete [] source_flags;

    // CkPrintf("PME on node %d has %d sources and %d destinations\n",
    //          CkMyPe(), numSources, numDestRecipPes);

  } // decide how many pes this node exchanges charges with (end)

  ungrid_count = numDestRecipPes;

  sendTransBarrier_received = 0;

  if ( myGridPe < 0 && myTransPe < 0 ) return;
  // the following only for nodes doing reciprocal sum

  if ( myTransPe >= 0 ) {
    recipEvirPe = findRecipEvirPe();
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }

  if ( myTransPe >= 0 ) {
    int k2_start = localInfo[myTransPe].y_start_after_transpose;
    int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
    #ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    } else {
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    }
    #else  // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    #endif  // OPENATOM_VERSION
  }

  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;
  }
  qgrid_size = local_size;

  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }

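  // Create the FFT plans: real-to-complex 2-D (y,z) transforms on grid PEs
  // and 1-D transforms along x on trans PEs, one plan per grid.  FFTW plan
  // creation is not thread-safe, so it is serialized under fftw_plan_lock.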
  int n[3];  n[0] = myGrid.K1;  n[1] = myGrid.K2;  n[2] = myGrid.K3;
#ifdef NAMD_FFTW
  CmiLock(fftw_plan_lock);
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  }
  /* need one plan per grid */
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
                                                   localInfo[myGridPe].nx,
                                                   qgrid + qgrid_size * g,
                                                   NULL,
                                                   1,
                                                   myGrid.dim2 * myGrid.dim3,
                                                   (fftwf_complex *)
                                                   (qgrid + qgrid_size * g),
                                                   NULL,
                                                   1,
                                                   myGrid.dim2 * (myGrid.dim3/2),
                                                   fftwFlags);
    }
  }
  int zdim = myGrid.dim3;
  // guard the localInfo[] lookup so grid-only pes don't index with myTransPe == -1
  int xStride = ( myTransPe >= 0 ?
                  localInfo[myTransPe].ny_after_transpose * ( myGrid.dim3 / 2 ) : 0 );
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {

      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
                                              (fftwf_complex *)
                                              (kgrid+qgrid_size*g),
                                              NULL,
                                              xStride,
                                              1,
                                              (fftwf_complex *)
                                              (kgrid+qgrid_size*g),
                                              NULL,
                                              xStride,
                                              1,
                                              FFTW_FORWARD,fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
                                               (fftwf_complex *)
                                               (kgrid+qgrid_size*g),
                                               NULL,
                                               xStride,
                                               1,
                                               (fftwf_complex *)
                                               (kgrid+qgrid_size*g),
                                               NULL,
                                               xStride,
                                               1,
                                               FFTW_BACKWARD, fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
                                                    localInfo[myGridPe].nx,
                                                    (fftwf_complex *)
                                                    (qgrid + qgrid_size * g),
                                                    NULL,
                                                    1,
                                                    myGrid.dim2*(myGrid.dim3/2),
                                                    qgrid + qgrid_size * g,
                                                    NULL,
                                                    1,
                                                    myGrid.dim2 * myGrid.dim3,
                                                    fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;

#else
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif
  CmiUnlock(fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}


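// Second startup stage for pencil PME: flag every z pencil that this PE
// (or this node, when offloading) touches, based on patch bounds widened
// by the interpolation stencil, then register with those pencils.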
void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
  delete msg;
  if ( ! usePencils ) return;

  SimParameters *simParams = Node::Object()->simParameters;

  PatchMap *patchMap = PatchMap::Object();
  Lattice lattice = simParams->lattice;
  BigReal sysdima = lattice.a_r().unit() * lattice.a();
  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
  BigReal cutoff = simParams->cutoff;
  BigReal patchdim = simParams->patchDimension;
  int numPatches = patchMap->numPatches();

  pencilActive = new char[xBlocks*yBlocks];
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      pencilActive[i*yBlocks+j] = 0;
    }
  }

  for ( int pid=0; pid < numPatches; ++pid ) {
    int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
    if ( offload ) {
      if ( CkNodeOf(pnode) != CkMyNode() ) continue;
    } else
#endif
    if ( pnode != CkMyPe() ) continue;

    int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
    int shift2 = (myGrid.K2 + myGrid.order - 1)/2;

    BigReal minx = patchMap->min_a(pid);
    BigReal maxx = patchMap->max_a(pid);
    BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
    // min1 (max1) is smallest (largest) grid line for this patch
    int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
    int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;

    BigReal miny = patchMap->min_b(pid);
    BigReal maxy = patchMap->max_b(pid);
    BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
    // min2 (max2) is smallest (largest) grid line for this patch
    int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
    int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;

    for ( int i=min1; i<=max1; ++i ) {
      int ix = i;
      while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
      while ( ix < 0 ) ix += myGrid.K1;
      for ( int j=min2; j<=max2; ++j ) {
        int jy = j;
        while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
        while ( jy < 0 ) jy += myGrid.K2;
        pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
      }
    }
  }

  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        ++numPencilsActive;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
#endif
        zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
      }
    }
  }
  activePencils = new ijpair[numPencilsActive];
  numPencilsActive = 0;
  for ( int i=0; i<xBlocks; ++i ) {
    for ( int j=0; j<yBlocks; ++j ) {
      if ( pencilActive[i*yBlocks+j] ) {
        activePencils[numPencilsActive++] = ijpair(i,j);
      }
    }
  }
  if ( simParams->PMESendOrder ) {
    std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
  } else {
    Random rand(CkMyPe());
    rand.reorder(activePencils,numPencilsActive);
  }
  //if ( numPencilsActive ) {
  //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
  //}

  ungrid_count = numPencilsActive;
}


void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
  if ( ! usePencils ) return;
  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
}


ComputePmeMgr::~ComputePmeMgr() {

  if ( CmiMyRank() == 0 ) {
    CmiDestroyLock(fftw_plan_lock);
  }
  CmiDestroyLock(pmemgr_lock);

  delete myKSpace;
  delete [] localInfo;
  delete [] gridNodeInfo;
  delete [] transNodeInfo;
  delete [] gridPeMap;
  delete [] transPeMap;
  delete [] recipPeDest;
  delete [] gridPeOrder;
  delete [] gridNodeOrder;
  delete [] transNodeOrder;
  delete [] qgrid;
  if ( kgrid != qgrid ) delete [] kgrid;
  delete [] work;
  delete [] gridmsg_reuse;

  if ( ! offload ) {
    for (int i=0; i<q_count; ++i) {
      delete [] q_list[i];
    }
    delete [] q_list;
    delete [] fz_arr;
  }
  delete [] f_arr;
  delete [] q_arr;
}

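// Accumulate one source's charge-grid contribution.  The message carries
// only the grid points named in zlist for each flagged (x,y) column, so
// the sparse data is expanded into the local qgrid here.  When the last
// expected message arrives, the forward FFT stage is triggered.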
void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
  if ( grid_count == 0 ) {
    NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
  }
  if ( grid_count == numSources ) {
    lattice = msg->lattice;
    grid_sequence = msg->sequence;
  }

  int zdim = myGrid.dim3;
  int zlistlen = msg->zlistlen;
  int *zlist = msg->zlist;
  float *qmsg = msg->qgrid;
  for ( int g=0; g<numGrids; ++g ) {
    char *f = msg->fgrid + fgrid_len * g;
    float *q = qgrid + qgrid_size * g;
    for ( int i=0; i<fgrid_len; ++i ) {
      if ( f[i] ) {
        for ( int k=0; k<zlistlen; ++k ) {
          q[zlist[k]] += *(qmsg++);
        }
      }
      q += zdim;
    }
  }

  gridmsg_reuse[numSources-grid_count] = msg;
  --grid_count;

  if ( grid_count == 0 ) {
    pmeProxyDir[CkMyPe()].gridCalc1();
    if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
  }
}
#ifdef MANUAL_DEBUG_FFTW3

/* utility functions for manual debugging */
void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int pe)
{

  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d.out",999);
  sprintf(filename,fmt, pe);
  FILE *loutfile = fopen(filename, "w");
#ifdef PAIRCALC_TEST_DUMP
  fprintf(loutfile,"%d\n",ydim);
#endif
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);

}

void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim, int x, int y, int z)
{
  char fmt[1000];
  char filename[1000];
  strncpy(fmt,infilename,999);
  strncat(fmt,"_%d_%d_%d.out",999);
  sprintf(filename,fmt, x,y,z);
  FILE *loutfile = fopen(filename, "w");
  CkAssert(loutfile!=NULL);
  CkPrintf("opened %s for dump\n",filename);
  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
  for(int i=0;i<xdim;i++)
    for(int j=0;j<ydim;j++)
      for(int k=0;k<zdim;k++)
        fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
  fclose(loutfile);
}

#endif
1933 
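// gridCalc1: first stage of the forward 3D FFT, a real-to-complex
// transform over the y-z planes of this PE's x slab for each grid.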
1934 void ComputePmeMgr::gridCalc1(void) {
1935  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1936 
1937 #ifdef NAMD_FFTW
1938  for ( int g=0; g<numGrids; ++g ) {
1939 #ifdef NAMD_FFTW_3
1940  fftwf_execute(forward_plan_yz[g]);
1941 #else
1942  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1943  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1944 #endif
1945 
1946  }
1947 #endif
1948 
1949  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1950 }
1951 
1952 void ComputePmeMgr::sendTransBarrier(void) {
1953  sendTransBarrier_received += 1;
1954  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1955  if ( sendTransBarrier_received < numGridPes ) return;
1956  sendTransBarrier_received = 0;
1957  for ( int i=0; i<numGridPes; ++i ) {
1958  pmeProxyDir[gridPeMap[i]].sendTrans();
1959  }
1960 }
1961 
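// Static trampoline so CkLoop can spread sendTransSubset over the PEs
// of an SMP node; the result and paraNum arguments are unused.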
1962 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
1963  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
1964  mgr->sendTransSubset(first, last);
1965 }
1966 
1967 void ComputePmeMgr::sendTrans(void) {
1968 
1969  untrans_count = numTransPes;
1970 
1971 #if CMK_SMP && USE_CKLOOP
1972  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1973  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1974  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1975  } else
1976 #endif
1977  {
1978  sendTransSubset(0, numTransNodes-1);
1979  }
1980 
1981 }
1982 
1983 void ComputePmeMgr::sendTransSubset(int first, int last) {
1984  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1985 
1986  // send data for transpose
1987  int zdim = myGrid.dim3;
1988  int nx = localInfo[myGridPe].nx;
1989  int x_start = localInfo[myGridPe].x_start;
1990  int slicelen = myGrid.K2 * zdim;
1991 
1992  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1993 
1994 #if CMK_BLUEGENEL
1995  CmiNetworkProgressAfter (0);
1996 #endif
1997 
1998  for (int j=first; j<=last; j++) {
1999  int node = transNodeOrder[j]; // different order on each node
2000  int pe = transNodeInfo[node].pe_start;
2001  int npe = transNodeInfo[node].npe;
2002  int totlen = 0;
2003  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
2004  LocalPmeInfo &li = localInfo[pe];
2005  int cpylen = li.ny_after_transpose * zdim;
2006  totlen += cpylen;
2007  }
2008  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
2009  PRIORITY_SIZE) PmeTransMsg;
2010  newmsg->sourceNode = myGridPe;
2011  newmsg->lattice = lattice;
2012  newmsg->x_start = x_start;
2013  newmsg->nx = nx;
2014  for ( int g=0; g<numGrids; ++g ) {
2015  float *qmsg = newmsg->qgrid + nx * totlen * g;
2016  pe = transNodeInfo[node].pe_start;
2017  for (int i=0; i<npe; ++i, ++pe) {
2018  LocalPmeInfo &li = localInfo[pe];
2019  int cpylen = li.ny_after_transpose * zdim;
2020  if ( node == myTransNode ) {
2021  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2022  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2023  }
2024  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2025  for ( int x = 0; x < nx; ++x ) {
2026  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2027  q += slicelen;
2028  qmsg += cpylen;
2029  }
2030  }
2031  }
2032  newmsg->sequence = grid_sequence;
2033  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2034  if ( node == myTransNode ) newmsg->nx = 0;
2035  if ( npe > 1 ) {
2036  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2037  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2038  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2039  }
2040 }
2041 
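// fwdSharedTrans: within an SMP node a single PmeTransMsg is shared by
// all local transpose PEs; a heap-allocated reference count guarded by
// a node lock lets the last consumer free the message.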
2042 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
2043  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2044  int pe = transNodeInfo[myTransNode].pe_start;
2045  int npe = transNodeInfo[myTransNode].npe;
2046  CmiNodeLock lock = CmiCreateLock();
2047  int *count = new int; *count = npe;
2048  for (int i=0; i<npe; ++i, ++pe) {
2049  PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
2050  SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
2051  shmsg->msg = msg;
2052  shmsg->count = count;
2053  shmsg->lock = lock;
2054  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2055  }
2056 }
2057 
2058 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
2059  procTrans(msg->msg);
2060  CmiLock(msg->lock);
2061  int count = --(*msg->count);
2062  CmiUnlock(msg->lock);
2063  if ( count == 0 ) {
2064  CmiDestroyLock(msg->lock);
2065  delete msg->count;
2066  delete msg->msg;
2067  }
2068  delete msg;
2069 }
2070 
2071 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
2072  procTrans(msg);
2073  delete msg;
2074 }
2075 
2076 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
2077  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2078  if ( trans_count == numGridPes ) {
2079  lattice = msg->lattice;
2080  grid_sequence = msg->sequence;
2081  }
2082 
2083  if ( msg->nx ) {
2084  int zdim = myGrid.dim3;
2085  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2086  int first_pe = nodeInfo.pe_start;
2087  int last_pe = first_pe+nodeInfo.npe-1;
2088  int y_skip = localInfo[myTransPe].y_start_after_transpose
2089  - localInfo[first_pe].y_start_after_transpose;
2090  int ny_msg = localInfo[last_pe].y_start_after_transpose
2091  + localInfo[last_pe].ny_after_transpose
2092  - localInfo[first_pe].y_start_after_transpose;
2093  int ny = localInfo[myTransPe].ny_after_transpose;
2094  int x_start = msg->x_start;
2095  int nx = msg->nx;
2096  for ( int g=0; g<numGrids; ++g ) {
2097  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2098  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2099  nx*ny*zdim*sizeof(float));
2100  }
2101  }
2102 
2103  --trans_count;
2104 
2105  if ( trans_count == 0 ) {
2106  pmeProxyDir[CkMyPe()].gridCalc2();
2107  }
2108 }
2109 
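// gridCalc2: after the transpose, finish the forward FFT along x, then
// hand off to the reciprocal-space evaluation (gridCalc2R, or
// gridCalc2Moa when OpenAtom drives the calculation).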
2110 void ComputePmeMgr::gridCalc2(void) {
2111  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2112 
2113 #if CMK_BLUEGENEL
2114  CmiNetworkProgressAfter (0);
2115 #endif
2116 
2117  int zdim = myGrid.dim3;
2118  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2119  int ny = localInfo[myTransPe].ny_after_transpose;
2120 
2121  for ( int g=0; g<numGrids; ++g ) {
2122  // finish forward FFT (x dimension)
2123 #ifdef NAMD_FFTW
2124 #ifdef NAMD_FFTW_3
2125  fftwf_execute(forward_plan_x[g]);
2126 #else
2127  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2128  ny * zdim / 2, 1, work, 1, 0);
2129 #endif
2130 #endif
2131  }
2132 
2133 #ifdef OPENATOM_VERSION
2134  if ( ! simParams -> openatomOn ) {
2135 #endif // OPENATOM_VERSION
2136  gridCalc2R();
2137 #ifdef OPENATOM_VERSION
2138  } else {
2139  gridCalc2Moa();
2140  }
2141 #endif // OPENATOM_VERSION
2142 }
2143 
2144 #ifdef OPENATOM_VERSION
2145 void ComputePmeMgr::gridCalc2Moa(void) {
2146 
2147  int zdim = myGrid.dim3;
2148  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2149  int ny = localInfo[myTransPe].ny_after_transpose;
2150 
2152 
2153  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
2154 
2155  for ( int g=0; g<numGrids; ++g ) {
2156  #ifdef OPENATOM_VERSION_DEBUG
2157  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
2158  for ( int i=0; i<=(ny * zdim / 2); ++i)
2159  {
2160  CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
2161  }
2162  #endif // OPENATOM_VERSION_DEBUG
2163 // mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
2164  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
2165  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
2166  }
2167 }
2168 #endif // OPENATOM_VERSION
2169 
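// gridCalc2R: reciprocal-space core of PME; for each grid compute the
// Ewald (or LJ-PME dispersion) energy and virial in k-space, then start
// the backward FFT along x.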
2170 void ComputePmeMgr::gridCalc2R(void) {
2171 
2172  int useCkLoop = 0;
2173 #if CMK_SMP && USE_CKLOOP
2174  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2175  && CkNumPes() >= 2 * numTransPes ) {
2176  useCkLoop = 1;
2177  }
2178 #endif
2179 
2180  int zdim = myGrid.dim3;
2181  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2182  int ny = localInfo[myTransPe].ny_after_transpose;
2183 
2184  for ( int g=0; g<numGrids; ++g ) {
2185  // reciprocal space portion of PME
2186  if ( LJPMEOn && 1==g ) {
2187  const BigReal LJewaldcof = ComputeNonbondedUtil::LJewaldcof;
2188  recip_evir2[g][0] = myKSpace->compute_energy_LJPME(kgrid+qgrid_size*g,
2189  lattice, LJewaldcof, &(recip_evir2[g][1]), useCkLoop);
2190  // CkPrintf("LJ Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2191  } else {
2192  const BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
2193  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2194  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2195  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2196  }
2197 
2198  // start backward FFT (x dimension)
2199 
2200 #ifdef NAMD_FFTW
2201 #ifdef NAMD_FFTW_3
2202  fftwf_execute(backward_plan_x[g]);
2203 #else
2204  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2205  ny * zdim / 2, 1, work, 1, 0);
2206 #endif
2207 #endif
2208  }
2209 
2210  pmeProxyDir[CkMyPe()].sendUntrans();
2211 }
2212 
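// The reverse path mirrors the forward one: sendUntrans first ships the
// energy/virial to the collector PE, then scatters the transposed data
// back toward the grid PEs, CkLoop-parallelized where enabled.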
2213 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
2214  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2215  mgr->sendUntransSubset(first, last);
2216 }
2217 
2218 void ComputePmeMgr::sendUntrans(void) {
2219 
2220  trans_count = numGridPes;
2221 
2222  { // send energy and virial
2223  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2224  for ( int g=0; g<numGrids; ++g ) {
2225  newmsg->evir[g] = recip_evir2[g];
2226  }
2227  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2228  CmiEnableUrgentSend(1);
2229  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2230  CmiEnableUrgentSend(0);
2231  }
2232 
2233 #if CMK_SMP && USE_CKLOOP
2234  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2235  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2236  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2237  } else
2238 #endif
2239  {
2240  sendUntransSubset(0, numGridNodes-1);
2241  }
2242 
2243 }
2244 
2245 void ComputePmeMgr::sendUntransSubset(int first, int last) {
2246 
2247  int zdim = myGrid.dim3;
2248  int y_start = localInfo[myTransPe].y_start_after_transpose;
2249  int ny = localInfo[myTransPe].ny_after_transpose;
2250  int slicelen = myGrid.K2 * zdim;
2251 
2252  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2253 
2254 #if CMK_BLUEGENEL
2255  CmiNetworkProgressAfter (0);
2256 #endif
2257 
2258  // send data for reverse transpose
2259  for (int j=first; j<=last; j++) {
2260  int node = gridNodeOrder[j]; // different order on each node
2261  int pe = gridNodeInfo[node].pe_start;
2262  int npe = gridNodeInfo[node].npe;
2263  int totlen = 0;
2264  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2265  LocalPmeInfo &li = localInfo[pe];
2266  int cpylen = li.nx * zdim;
2267  totlen += cpylen;
2268  }
2269  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2270  newmsg->sourceNode = myTransPe;
2271  newmsg->y_start = y_start;
2272  newmsg->ny = ny;
2273  for ( int g=0; g<numGrids; ++g ) {
2274  float *qmsg = newmsg->qgrid + ny * totlen * g;
2275  pe = gridNodeInfo[node].pe_start;
2276  for (int i=0; i<npe; ++i, ++pe) {
2277  LocalPmeInfo &li = localInfo[pe];
2278  if ( node == myGridNode ) {
2279  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2280  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2281  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2282  int cpylen = ny * zdim;
2283  for ( int x = 0; x < li.nx; ++x ) {
2284  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2285  q += cpylen;
2286  qmsg += slicelen;
2287  }
2288  } else {
2289  CmiMemcpy((void*)qmsg,
2290  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2291  li.nx*ny*zdim*sizeof(float));
2292  qmsg += li.nx*ny*zdim;
2293  }
2294  }
2295  }
2296  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2297  if ( node == myGridNode ) newmsg->ny = 0;
2298  if ( npe > 1 ) {
2299  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2300  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2301  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2302  }
2303 }
2304 
2305 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
2306  int pe = gridNodeInfo[myGridNode].pe_start;
2307  int npe = gridNodeInfo[myGridNode].npe;
2308  CmiNodeLock lock = CmiCreateLock();
2309  int *count = new int; *count = npe;
2310  for (int i=0; i<npe; ++i, ++pe) {
2311  PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
2312  shmsg->msg = msg;
2313  shmsg->count = count;
2314  shmsg->lock = lock;
2315  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2316  }
2317 }
2318 
2319 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
2320  procUntrans(msg->msg);
2321  CmiLock(msg->lock);
2322  int count = --(*msg->count);
2323  CmiUnlock(msg->lock);
2324  if ( count == 0 ) {
2325  CmiDestroyLock(msg->lock);
2326  delete msg->count;
2327  delete msg->msg;
2328  }
2329  delete msg;
2330 }
2331 
2332 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
2333  procUntrans(msg);
2334  delete msg;
2335 }
2336 
2337 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
2338  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2339 
2340 #if CMK_BLUEGENEL
2341  CmiNetworkProgressAfter (0);
2342 #endif
2343 
2344  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2345  int first_pe = nodeInfo.pe_start;
2346  int g;
2347 
2348  if ( msg->ny ) {
2349  int zdim = myGrid.dim3;
2350  int last_pe = first_pe+nodeInfo.npe-1;
2351  int x_skip = localInfo[myGridPe].x_start
2352  - localInfo[first_pe].x_start;
2353  int nx_msg = localInfo[last_pe].x_start
2354  + localInfo[last_pe].nx
2355  - localInfo[first_pe].x_start;
2356  int nx = localInfo[myGridPe].nx;
2357  int y_start = msg->y_start;
2358  int ny = msg->ny;
2359  int slicelen = myGrid.K2 * zdim;
2360  int cpylen = ny * zdim;
2361  for ( g=0; g<numGrids; ++g ) {
2362  float *q = qgrid + qgrid_size * g + y_start * zdim;
2363  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2364  for ( int x = 0; x < nx; ++x ) {
2365  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2366  q += slicelen;
2367  qmsg += cpylen;
2368  }
2369  }
2370  }
2371 
2372  --untrans_count;
2373 
2374  if ( untrans_count == 0 ) {
2375  pmeProxyDir[CkMyPe()].gridCalc3();
2376  }
2377 }
2378 
2379 void ComputePmeMgr::gridCalc3(void) {
2380  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2381 
2382  // finish backward FFT
2383 #ifdef NAMD_FFTW
2384  for ( int g=0; g<numGrids; ++g ) {
2385 #ifdef NAMD_FFTW_3
2386  fftwf_execute(backward_plan_yz[g]);
2387 #else
2388  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2389  (fftw_complex *) (qgrid + qgrid_size * g),
2390  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2391 #endif
2392  }
2393 
2394 #endif
2395 
2396  pmeProxyDir[CkMyPe()].sendUngrid();
2397 }
2398 
2399 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
2400  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2401  mgr->sendUngridSubset(first, last);
2402 }
2403 
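// sendUngrid: return the interpolated potential to the source PEs,
// reusing the PmeGridMsg objects (and their compression lists) saved by
// recvGrid, then zero qgrid for the next step.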
2404 void ComputePmeMgr::sendUngrid(void) {
2405 
2406 #if CMK_SMP && USE_CKLOOP
2407  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2408  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2409  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2410  } else
2411 #endif
2412  {
2413  sendUngridSubset(0, numSources-1);
2414  }
2415 
2416  grid_count = numSources;
2417  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2418 }
2419 
2420 void ComputePmeMgr::sendUngridSubset(int first, int last) {
2421 
2422 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2423  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2424 #else
2425  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2426 #endif
2427 
2428  for ( int j=first; j<=last; ++j ) {
2429  // int msglen = qgrid_len;
2430  PmeGridMsg *newmsg = gridmsg_reuse[j];
2431  int pe = newmsg->sourceNode;
2432  int zdim = myGrid.dim3;
2433  int flen = newmsg->len;
2434  int fstart = newmsg->start;
2435  int zlistlen = newmsg->zlistlen;
2436  int *zlist = newmsg->zlist;
2437  float *qmsg = newmsg->qgrid;
2438  for ( int g=0; g<numGrids; ++g ) {
2439  char *f = newmsg->fgrid + fgrid_len * g;
2440  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2441  for ( int i=0; i<flen; ++i ) {
2442  if ( f[i] ) {
2443  for ( int k=0; k<zlistlen; ++k ) {
2444  *(qmsg++) = q[zlist[k]];
2445  }
2446  }
2447  q += zdim;
2448  }
2449  }
2450  newmsg->sourceNode = myGridPe;
2451 
2452  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2453  CmiEnableUrgentSend(1);
2454 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2455  if ( offload ) {
2456  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2457  } else
2458 #endif
2459  pmeProxyDir[pe].recvUngrid(newmsg);
2460  CmiEnableUrgentSend(0);
2461  }
2462 }
2463 
2464 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
2465  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2466 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2467  if ( ! offload ) // would need lock
2468 #endif
2469  if ( ungrid_count == 0 ) {
2470  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2471  }
2472 
2473  if ( usePencils ) copyPencils(msg);
2474  else copyResults(msg);
2475  delete msg;
2476  recvAck(0);
2477 }
2478 
2479 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
2480  if ( msg ) delete msg;
2481 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2482  if ( offload ) {
2483  CmiLock(cuda_lock);
2484  if ( ungrid_count == 0 ) {
2485  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2486  }
2487  int uc = --ungrid_count;
2488  CmiUnlock(cuda_lock);
2489 
2490  if ( uc == 0 ) {
2491  pmeProxyDir[master_pe].ungridCalc();
2492  }
2493  return;
2494  }
2495 #endif
2496  --ungrid_count;
2497 
2498  if ( ungrid_count == 0 ) {
2499  pmeProxyDir[CkMyPe()].ungridCalc();
2500  }
2501 }
2502 
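// GPU completion is detected by polling rather than blocking: a callback
// re-arms itself through the Charm++ Ccd timer, e.g.
//   CUDA_POLL(cuda_check_pme_forces, this);
//   // expands to CcdCallFnAfter(cuda_check_pme_forces, this, 0.1)
// and the poll gives up with cudaDie after count_limit attempts.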
2503 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2504 #define count_limit 1000000
2505 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
2506 #define EVENT_STRIDE 10
2507 
2508 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
2509 
2510 //void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
2511 
2512 void cuda_check_pme_forces(void *arg, double walltime) {
2513  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
2514 
2515  while ( 1 ) { // process multiple events per call
2516  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
2517  if ( err == cudaSuccess ) {
2518  argp->check_forces_count = 0;
2519  for ( int i=0; i<EVENT_STRIDE; ++i ) {
2520  WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
2521  if ( ++(argp->forces_done_count) == argp->forces_count ) break;
2522  }
2523  if ( argp->forces_done_count == argp->forces_count ) { // last event
2524  traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
2525  argp->forces_time = walltime - argp->forces_time;
2526  //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
2527  return;
2528  } else { // more events
2529  continue; // check next event
2530  }
2531  } else if ( err != cudaErrorNotReady ) {
2532  char errmsg[256];
2533  sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
2534  argp->forces_done_count/EVENT_STRIDE,
2535  argp->check_forces_count, walltime - argp->forces_time,
2536  argp->saved_sequence);
2537  cudaDie(errmsg,err);
2538  } else if ( ++(argp->check_forces_count) >= count_limit ) {
2539  char errmsg[256];
2540  sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
2541  argp->forces_done_count/EVENT_STRIDE,
2542  argp->check_forces_count, walltime - argp->forces_time,
2543  argp->saved_sequence);
2544  cudaDie(errmsg,err);
2545  } else {
2546  break; // call again
2547  }
2548  } // while ( 1 )
2549  CcdCallBacksReset(0,walltime); // fix Charm++
2550  CUDA_POLL(cuda_check_pme_forces, arg);
2551 }
2552 #endif // NAMD_CUDA
2553 
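// ungridCalc: extract per-atom forces from the potential grid. In the
// offload path the master PE uploads the returned potential once, force
// kernels are issued in batches of EVENT_STRIDE computes, and completion
// is tracked with CUDA events polled by cuda_check_pme_forces.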
2554 void ComputePmeMgr::ungridCalc(void) {
2555  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2556 
2557  ungridForcesCount = pmeComputes.size();
2558 
2559 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2560  if ( offload ) {
2561  //CmiLock(cuda_lock);
2562  cudaSetDevice(deviceCUDA->getDeviceID());
2563 
2564  if ( this == masterPmeMgr ) {
2565  double before = CmiWallTimer();
2566  // XXX prevents something from breaking???
2567  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2568  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2569  // try to make the unspecified launch failures go away
2570  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2571  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2572  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2573 
2574  const int myrank = CkMyRank();
2575  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2576  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2577  nodePmeMgr->mgrObjects[i]->ungridCalc();
2578  }
2579  }
2580  if ( ! pmeComputes.size() ) return;
2581  }
2582 
2583  if ( ! end_forces ) {
2584  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2585  end_forces = new cudaEvent_t[n];
2586  for ( int i=0; i<n; ++i ) {
2587  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2588  }
2589  }
2590 
2591  const int pcsz = pmeComputes.size();
2592  if ( ! afn_host ) {
2593  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2594  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2595  cuda_errcheck("malloc params for pme");
2596  }
2597  int totn = 0;
2598  for ( int i=0; i<pcsz; ++i ) {
2599  int n = pmeComputes[i]->numGridAtoms[0];
2600  totn += n;
2601  }
2602  if ( totn > f_data_mgr_alloc ) {
2603  if ( f_data_mgr_alloc ) {
2604  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2605  cudaFree(f_data_mgr_dev);
2606  cudaFreeHost(f_data_mgr_host);
2607  }
2608  f_data_mgr_alloc = 1.2 * (totn + 100);
2609  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2610  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2611  cuda_errcheck("malloc forces for pme");
2612  }
2613  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2614  float *f_dev = f_data_mgr_dev;
2615  float *f_host = f_data_mgr_host;
2616  for ( int i=0; i<pcsz; ++i ) {
2617  int n = pmeComputes[i]->numGridAtoms[0];
2618  pmeComputes[i]->f_data_dev = f_dev;
2619  pmeComputes[i]->f_data_host = f_host;
2620  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2621  afn_host[3*i+1] = f_dev;
2622  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2623  f_dev += 3*n;
2624  f_host += 3*n;
2625  }
2626  //CmiLock(cuda_lock);
2627  double before = CmiWallTimer();
2628  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2629  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2630  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2631  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2632  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2633  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2634 
2635  for ( int i=0; i<pcsz; ++i ) {
2636  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2637  if ( i%EVENT_STRIDE == 0 ) {
2638  int dimy = pcsz - i;
2639  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2640  int maxn = 0;
2641  int subtotn = 0;
2642  for ( int j=0; j<dimy; ++j ) {
2643  int n = pmeComputes[i+j]->numGridAtoms[0];
2644  subtotn += n;
2645  if ( n > maxn ) maxn = n;
2646  }
2647  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2648  before = CmiWallTimer();
2649  cuda_pme_forces(
2650  bspline_coeffs_dev,
2651  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2652  pmeComputes[i]->a_data_dev,
2653  pmeComputes[i]->f_data_dev,
2654  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2655  streams[stream]);
2656  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2657  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2658  before = CmiWallTimer();
2659  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2660  cudaMemcpyDeviceToHost, streams[stream]);
2661 #if 0
2662  cudaDeviceSynchronize();
2663  fprintf(stderr, "i = %d\n", i);
2664  for(int k=0; k < subtotn*3; k++)
2665  {
2666  fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
2667  pmeComputes[i]->f_data_host[k]);
2668  }
2669 #endif
2670  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2671  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2672  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2673  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2674  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2675  }
2676  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2677  }
2678  //CmiUnlock(cuda_lock);
2679  } else
2680 #endif // NAMD_CUDA
2681  {
2682  for ( int i=0; i<pmeComputes.size(); ++i ) {
2683  WorkDistrib::messageEnqueueWork(pmeComputes[i]);
2684  // pmeComputes[i]->ungridForces();
2685  }
2686  }
2687  // submitReductions(); // must follow all ungridForces()
2688 
2689 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2690  if ( offload ) {
2691  forces_time = CmiWallTimer();
2692  forces_count = ungridForcesCount;
2693  forces_done_count = 0;
2694  pmeProxy[this_pe].pollForcesReady();
2695  }
2696 #endif
2697 
2698  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2699 }
2700 
2701 void ComputePmeMgr::pollForcesReady() {
2702 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2703  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2704  CUDA_POLL(cuda_check_pme_forces, this);
2705 #else
2706  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2707 #endif
2708 }
2709 
2710 void ComputePme::atomUpdate() { atomsChanged = 1; }
2711 
2712 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
2713 {
2714  DebugM(4,"ComputePme created.\n");
2715  basePriority = PME_PRIORITY;
2716  setNumPatches(1);
2717 
2718  CProxy_ComputePmeMgr::ckLocalBranch(
2719  CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
2720 
2721  SimParameters *simParams = Node::Object()->simParameters;
2722 
2723  qmForcesOn = simParams->qmForcesOn;
2724  offload = simParams->PMEOffload;
2725 
2726  numGridsMax = numGrids;
2727 
2728  myGrid.K1 = simParams->PMEGridSizeX;
2729  myGrid.K2 = simParams->PMEGridSizeY;
2730  myGrid.K3 = simParams->PMEGridSizeZ;
2731  myGrid.order = simParams->PMEInterpOrder;
2732  myGrid.dim2 = myGrid.K2;
2733  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
2734 
2735 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2736  cuda_atoms_offset = 0;
2737  f_data_host = 0;
2738  f_data_dev = 0;
2739  if ( ! offload )
2740 #endif
2741  {
2742  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
2743  }
2744 
2745  atomsChanged = 0;
2746 
2747  qmLoclIndx = 0;
2748  qmLocalCharges = 0;
2749 }
2750 
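// initialize: bind this compute to its patch and register the position
// pickup and force deposit boxes; with offload, also add this patch's
// atoms to the node's CUDA atom count.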
2751 void ComputePme::initialize() {
2752  if (!(patch = PatchMap::Object()->patch(patchID))) {
2753  NAMD_bug("ComputePme used with unknown patch.");
2754  }
2755  positionBox = patch->registerPositionPickup(this);
2756  avgPositionBox = patch->registerAvgPositionPickup(this);
2757  forceBox = patch->registerForceDeposit(this);
2758 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2759  if ( offload ) {
2760  myMgr->cuda_atoms_count += patch->getNumAtoms();
2761  }
2762 #endif
2763 }
2764 
2765 void ComputePmeMgr::initialize_computes() {
2766 
2767  noWorkCount = 0;
2768  doWorkCount = 0;
2769  ungridForcesCount = 0;
2770 
2771  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
2772 
2773  SimParameters *simParams = Node::Object()->simParameters;
2774 
2775  strayChargeErrors = 0;
2776 
2777 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2778  PatchMap *patchMap = PatchMap::Object();
2779  int pe = master_pe = CkNodeFirst(CkMyNode());
2780  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2781  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2782  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2783  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2784  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2785  if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
2786  && pe != deviceCUDA->getMasterPe() ) {
2787  master_pe = pe;
2788  }
2789  }
2790  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2791  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2792  }
2793 
2794  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2795  bool cudaFirst = 1;
2796  if ( offload ) {
2797  CmiLock(cuda_lock);
2798  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2799  }
2800 
2801  if ( cudaFirst ) {
2802  nodePmeMgr->master_pe = master_pe;
2803  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2804  }
2805 #endif
2806 
2807  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2808  fsize = myGrid.K1 * myGrid.dim2;
2809  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2810 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2811  if ( ! offload )
2812 #endif
2813  {
2814  q_arr = new float*[fsize*numGrids];
2815  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2816  q_list = new float*[fsize*numGrids];
2817  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2818  q_count = 0;
2819  }
2820 
2821 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2822  if ( cudaFirst || ! offload ) {
2823 #endif
2824  f_arr = new char[fsize*numGrids];
2825  // memset to non-zero value has race condition on BlueGene/Q
2826  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2827  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2828 
2829  for ( int g=0; g<numGrids; ++g ) {
2830  char *f = f_arr + g*fsize;
2831  if ( usePencils ) {
2832  int K1 = myGrid.K1;
2833  int K2 = myGrid.K2;
2834  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2835  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2836  int dim2 = myGrid.dim2;
2837  for (int ap=0; ap<numPencilsActive; ++ap) {
2838  int ib = activePencils[ap].i;
2839  int jb = activePencils[ap].j;
2840  int ibegin = ib*block1;
2841  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2842  int jbegin = jb*block2;
2843  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2844  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2845  for ( int i=ibegin; i<iend; ++i ) {
2846  for ( int j=jbegin; j<jend; ++j ) {
2847  f[i*dim2+j] = 0;
2848  }
2849  }
2850  }
2851  } else {
2852  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2853  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2854  for (int pe=0; pe<numGridPes; pe++) {
2855  if ( ! recipPeDest[pe] ) continue;
2856  int start = pe * bsize;
2857  int len = bsize;
2858  if ( start >= qsize ) { start = 0; len = 0; }
2859  if ( start + len > qsize ) { len = qsize - start; }
2860  int zdim = myGrid.dim3;
2861  int fstart = start / zdim;
2862  int flen = len / zdim;
2863  memset(f + fstart, 0, flen*sizeof(char));
2864  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2865  }
2866  }
2867  }
2868 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2869  }
2870  if ( offload ) {
2871  cudaSetDevice(deviceCUDA->getDeviceID());
2872  if ( cudaFirst ) {
2873 
2874  int f_alloc_count = 0;
2875  for ( int n=fsize, i=0; i<n; ++i ) {
2876  if ( f_arr[i] == 0 ) {
2877  ++f_alloc_count;
2878  }
2879  }
2880  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2881 
2882  q_arr = new float*[fsize*numGrids];
2883  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2884 
2885  float **q_arr_dev_host = new float*[fsize];
2886  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2887 
2888  float **v_arr_dev_host = new float*[fsize];
2889  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2890 
2891  int q_stride = myGrid.K3+myGrid.order-1;
2892  q_data_size = f_alloc_count * q_stride * sizeof(float);
2893  ffz_size = (fsize + q_stride) * sizeof(int);
2894 
2895  // tack ffz onto end of q_data to allow merged transfer
2896  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2897  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2898  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2899  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2900  cudaMalloc((void**) &v_data_dev, q_data_size);
2901  cuda_errcheck("malloc grid data for pme");
2902  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2903  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2904  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2905  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2906  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2907 
2908  f_alloc_count = 0;
2909  for ( int n=fsize, i=0; i<n; ++i ) {
2910  if ( f_arr[i] == 0 ) {
2911  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2912  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2913  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2914  ++f_alloc_count;
2915  } else {
2916  q_arr[i] = 0;
2917  q_arr_dev_host[i] = 0;
2918  v_arr_dev_host[i] = 0;
2919  }
2920  }
2921 
2922  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2923  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2924  delete [] q_arr_dev_host;
2925  delete [] v_arr_dev_host;
2926  delete [] f_arr;
2927  f_arr = new char[fsize + q_stride];
2928  fz_arr = f_arr + fsize;
2929  memset(f_arr, 0, fsize + q_stride);
2930  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2931 
2932  cuda_errcheck("initialize grid data for pme");
2933 
2934  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2935  cuda_errcheck("initialize bspline coefficients for pme");
2936 
2937 #define XCOPY(X) masterPmeMgr->X = X;
2938  XCOPY(bspline_coeffs_dev)
2939  XCOPY(bspline_dcoeffs_dev)
2940  XCOPY(q_arr)
2941  XCOPY(q_arr_dev)
2942  XCOPY(v_arr_dev)
2943  XCOPY(q_data_size)
2944  XCOPY(q_data_host)
2945  XCOPY(q_data_dev)
2946  XCOPY(v_data_dev)
2947  XCOPY(ffz_size)
2948  XCOPY(ffz_host)
2949  XCOPY(ffz_dev)
2950  XCOPY(f_arr)
2951  XCOPY(fz_arr)
2952 #undef XCOPY
2953  //CkPrintf("pe %d init first\n", CkMyPe());
2954  } else { // cudaFirst
2955  //CkPrintf("pe %d init later\n", CkMyPe());
2956 #define XCOPY(X) X = masterPmeMgr->X;
2957  XCOPY(bspline_coeffs_dev)
2958  XCOPY(bspline_dcoeffs_dev)
2959  XCOPY(q_arr)
2960  XCOPY(q_arr_dev)
2961  XCOPY(v_arr_dev)
2962  XCOPY(q_data_size)
2963  XCOPY(q_data_host)
2964  XCOPY(q_data_dev)
2965  XCOPY(v_data_dev)
2966  XCOPY(ffz_size)
2967  XCOPY(ffz_host)
2968  XCOPY(ffz_dev)
2969  XCOPY(f_arr)
2970  XCOPY(fz_arr)
2971 #undef XCOPY
2972  } // cudaFirst
2973  CmiUnlock(cuda_lock);
2974  } else // offload
2975 #endif // NAMD_CUDA
2976  {
2977  fz_arr = new char[myGrid.K3+myGrid.order-1];
2978  }
2979 
2980 #if 0 && USE_PERSISTENT
2981  recvGrid_handle = NULL;
2982 #endif
2983 }
2984 
2985 ComputePme::~ComputePme()
2986 {
2987 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2988  if ( ! offload )
2989 #endif
2990  {
2991  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
2992  }
2993 }
2994 
2995 #if 0 && USE_PERSISTENT
2996 void ComputePmeMgr::setup_recvgrid_persistent()
2997 {
2998  int K1 = myGrid.K1;
2999  int K2 = myGrid.K2;
3000  int dim2 = myGrid.dim2;
3001  int dim3 = myGrid.dim3;
3002  int block1 = myGrid.block1;
3003  int block2 = myGrid.block2;
3004 
3005  CkArray *zPencil_local = zPencil.ckLocalBranch();
3006  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
3007  for (int ap=0; ap<numPencilsActive; ++ap) {
3008  int ib = activePencils[ap].i;
3009  int jb = activePencils[ap].j;
3010  int ibegin = ib*block1;
3011  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3012  int jbegin = jb*block2;
3013  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3014  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3015  // f is changing
3016  int fcount = 0;
3017  for ( int g=0; g<numGrids; ++g ) {
3018  char *f = f_arr + g*fsize;
3019  for ( int i=ibegin; i<iend; ++i ) {
3020  for ( int j=jbegin; j<jend; ++j ) {
3021  fcount += f[i*dim2+j];
3022  }
3023  }
3024  }
3025  int zlistlen = 0;
3026  for ( int i=0; i<myGrid.K3; ++i ) {
3027  if ( fz_arr[i] ) ++zlistlen;
3028  }
3029  int hd = ( fcount? 1 : 0 ); // has data?
3030  int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
3031  int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
3032  int compress_size = sizeof(float)*hd*fcount*zlistlen;
3033  int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
3034  recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
3035  }
3036 }
3037 #endif
3038 
3039 int ComputePme::noWork() {
3040 
3041  if ( patch->flags.doFullElectrostatics ) {
3042  // In QM/MM simulations, atom charges from QM regions need special treatment.
3043  if ( qmForcesOn ) {
3044  return 1;
3045  }
3046  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0; // work to do, enqueue as usual
3047  myMgr->heldComputes.add(this);
3048  return 1; // don't enqueue yet
3049  }
3050 
3051  positionBox->skip();
3052  forceBox->skip();
3053 
3054  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
3055  myMgr->noWorkCount = 0;
3056  myMgr->reduction->submit();
3057  }
3058 
3059  atomsChanged = 0;
3060 
3061  return 1; // no work for this step
3062 }
3063 
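// recipEvirClients counts the local objects that expect a copy of the
// reciprocal-space energy and virial each step.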
3064 void ComputePmeMgr::addRecipEvirClient(void) {
3065  ++recipEvirClients;
3066 }
3067 
3068 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
3069  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3070  for ( int g=0; g<numGrids; ++g ) {
3071  evir[g] += msg->evir[g];
3072  }
3073  delete msg;
3074  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3075  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3076 }
3077 
3078 void ComputePme::doQMWork() {
3079 
3080 // iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
3081 
3082 
3083  int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
3084  const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
3085  const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
3086  const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
3087 
3088  const CompAtomExt *xExt = patch->getCompAtomExtInfo();
3089 
3090  // Determine number of qm atoms in this patch for the current step.
3091  numLocalQMAtoms = 0;
3092  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3093  if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
3094  numLocalQMAtoms++;
3095  }
3096  }
3097 
3098  // We prepare a charge vector with QM charges for use in the PME calculation.
3099 
3100  // Clears data from last step, if there is any.
3101  if (qmLoclIndx != 0)
3102  delete [] qmLoclIndx;
3103  if (qmLocalCharges != 0)
3104  delete [] qmLocalCharges;
3105 
3106  qmLoclIndx = new int[numLocalQMAtoms] ;
3107  qmLocalCharges = new Real[numLocalQMAtoms] ;
3108 
3109  // I am assuming there will be (in general) more QM atoms among all QM groups
3110  // than MM atoms in a patch.
3111  int procAtms = 0;
3112 
3113  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3114 
3115  for (int i=0; i<numQMAtms; i++) {
3116 
3117  if (qmAtmIndx[i] == xExt[paIter].id) {
3118 
3119  qmLoclIndx[procAtms] = paIter ;
3120  qmLocalCharges[procAtms] = qmAtmChrg[i];
3121 
3122  procAtms++;
3123  break;
3124  }
3125 
3126  }
3127 
3128  if (procAtms == numLocalQMAtoms)
3129  break;
3130  }
3131 
3132  doWork();
3133  return ;
3134 }
3135 
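// doWork runs in two phases distinguished by basePriority: the gridding
// phase below spreads charges onto the grid (or stages them for the
// GPU), while the early-return branch finishes a step by taking forces
// off the grid with ungridForces().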
3136 void ComputePme::doWork()
3137 {
3138  DebugM(4,"Entering ComputePme::doWork().\n");
3139 
3140  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
3141 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3142  basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
3143 #else
3144  basePriority = PME_PRIORITY;
3145 #endif
3146  ungridForces();
3147  // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3148  if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
3149  return;
3150  }
3151  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
3152  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3153 
3154 #ifdef TRACE_COMPUTE_OBJECTS
3155  double traceObjStartTime = CmiWallTimer();
3156 #endif
3157 
3158 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3159  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
3160 #endif
3161 
3162  // allocate storage
3163  numLocalAtoms = patch->getNumAtoms();
3164 
3165  Lattice &lattice = patch->flags.lattice;
3166 
3167  // For more than one grid, allocate base grid plus numGrids extra storage.
3168  // Each storage segment can hold the max atoms possible, numLocalAtoms.
3169  // Copy coordinate data from position box into base grid plus create
3170  // auxiliary array with partition number. For alchemy, copy coordinate data
3171  // from base grid into each extra buffer depending on partition values.
3172  // Storage is all PmeParticle: x,y,z,q double precision.
3173  int extraGrids = 0;
3174  if ( ! LJPMEOn && (numGrids > 1 || selfOn) ) {
3175  extraGrids = 1;
3176  }
3177 
3178  localData_alloc.resize(numLocalAtoms*(numGrids+extraGrids));
3179  localData = localData_alloc.begin();
3180  localPartition_alloc.resize(numLocalAtoms);
3181  localPartition = localPartition_alloc.begin();
3182 
3183  // We have local buffers: base, 0, 1, ..., numGrids-1 (for numGrids > 1).
3184  // localGridData points to the "0, 1, ..., numGrids-1" buffers.
3185  int g;
3186  for ( g=0; g<numGrids; ++g ) {
3187  localGridData[g] = localData + numLocalAtoms*(g+extraGrids);
3188  }
3189 
3190  // get positions and charges
3191  PmeParticle * data_ptr = localData;
3192  unsigned char * part_ptr = localPartition;
3193  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
3194  * ComputeNonbondedUtil::dielectric_1 );
3195  {
3196  CompAtom *x = positionBox->open();
3197  // CompAtomExt *xExt = patch->getCompAtomExtInfo();
3198  if ( patch->flags.doMolly ) {
3199  positionBox->close(&x);
3200  x = avgPositionBox->open();
3201  }
3202  int numAtoms = patch->getNumAtoms();
3203 
3204  for(int i=0; i<numAtoms; ++i)
3205  {
3206  data_ptr->x = x[i].position.x;
3207  data_ptr->y = x[i].position.y;
3208  data_ptr->z = x[i].position.z;
3209  data_ptr->cg = coulomb_sqrt * x[i].charge;
3210  ++data_ptr;
3211  *part_ptr = x[i].partition;
3212  ++part_ptr;
3213  }
3214 
3215  // QM loop to overwrite charges of QM atoms.
3216  // They are zero for NAMD, but are updated in ComputeQM.
3217  if ( qmForcesOn ) {
3218 
3219  for(int i=0; i<numLocalQMAtoms; ++i)
3220  {
3221  localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
3222  }
3223 
3224  }
3225 
3226  if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
3227  else { positionBox->close(&x); }
3228  }
3229 
3230  // copy to other grids if needed
3231  if ( (alchOn && (!alchDecouple)) || lesOn ) {
3232  for ( g=0; g<numGrids; ++g ) {
3233  PmeParticle *lgd = localGridData[g];
3234  if (g < 2) {
3235  int nga = 0;
3236  for(int i=0; i<numLocalAtoms; ++i) {
3237  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
3238  // for FEP/TI: grid 0 gets non-alch + partition 1 + partition 3;
3239  // grid 1 gets non-alch + partition 2 + partition 4;
3240  lgd[nga++] = localData[i];
3241  }
3242  }
3243  numGridAtoms[g] = nga;
3244  } else {
3245  int nga = 0;
3246  for(int i=0; i<numLocalAtoms; ++i) {
3247  if ( localPartition[i] == 0 ) {
3248  // grid 2 (only if called for with numGrids=3) gets only non-alch
3249  lgd[nga++] = localData[i];
3250  }
3251  }
3252  numGridAtoms[g] = nga;
3253  }
3254  }
3255  } else if ( alchOn && alchDecouple) {
3256  // alchemical decoupling: four grids
3257  // g=0: partition 0 and partition 1
3258  // g=1: partition 0 and partition 2
3259  // g=2: only partition 1 atoms
3260  // g=3: only partition 2 atoms
3261  // plus one grid g=4, only partition 0, if numGrids=5
3262  for ( g=0; g<2; ++g ) { // same as before for first 2
3263  PmeParticle *lgd = localGridData[g];
3264  int nga = 0;
3265  for(int i=0; i<numLocalAtoms; ++i) {
3266  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
3267  lgd[nga++] = localData[i];
3268  }
3269  }
3270  numGridAtoms[g] = nga;
3271  }
3272  for (g=2 ; g<4 ; ++g ) { // only alchemical atoms for these 2
3273  PmeParticle *lgd = localGridData[g];
3274  int nga = 0;
3275  for(int i=0; i<numLocalAtoms; ++i) {
3276  if ( localPartition[i] == (g-1) ) {
3277  lgd[nga++] = localData[i];
3278  }
3279  }
3280  numGridAtoms[g] = nga;
3281  }
3282  for (g=4 ; g<numGrids ; ++g ) { // only non-alchemical atoms
3283  // numGrids=5 only if alchElecLambdaStart > 0
3284  PmeParticle *lgd = localGridData[g];
3285  int nga = 0;
3286  for(int i=0; i<numLocalAtoms; ++i) {
3287  if ( localPartition[i] == 0 ) {
3288  lgd[nga++] = localData[i];
3289  }
3290  }
3291  numGridAtoms[g] = nga;
3292  }
3293  } else if ( selfOn ) {
3294  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
3295  g = 0;
3296  PmeParticle *lgd = localGridData[g];
3297  int nga = 0;
3298  for(int i=0; i<numLocalAtoms; ++i) {
3299  if ( localPartition[i] == 1 ) {
3300  lgd[nga++] = localData[i];
3301  }
3302  }
3303  numGridAtoms[g] = nga;
3304  } else if ( pairOn ) {
3305  if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
3306  g = 0;
3307  PmeParticle *lgd = localGridData[g];
3308  int nga = 0;
3309  for(int i=0; i<numLocalAtoms; ++i) {
3310  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
3311  lgd[nga++] = localData[i];
3312  }
3313  }
3314  numGridAtoms[g] = nga;
3315  for ( g=1; g<3; ++g ) {
3316  PmeParticle *lgd = localGridData[g];
3317  int nga = 0;
3318  for(int i=0; i<numLocalAtoms; ++i) {
3319  if ( localPartition[i] == g ) {
3320  lgd[nga++] = localData[i];
3321  }
3322  }
3323  numGridAtoms[g] = nga;
3324  }
3325  } else if ( LJPMEOn ) {
3326  const CompAtomExt *xExt = patch->getCompAtomExtInfo(); // for dispersion coef
3327  if ( numGrids != 2 ) NAMD_bug("ComputePme::doWork assertion for LJ-PME failed");
3328  // Reset localGridData pointers and set atom counts
3329  // localGridData[1] = localGridData[0];
3330  // localGridData[0] = localData;
3331  numGridAtoms[0] = numLocalAtoms;
3332  numGridAtoms[1] = numLocalAtoms;
3333  PmeParticle *lgd = localGridData[1];
3334  for (int i=0; i < numLocalAtoms; ++i) {
3335  lgd[i].x = localData[i].x;
3336  lgd[i].y = localData[i].y;
3337  lgd[i].z = localData[i].z;
3338  lgd[i].cg = xExt[i].dispcoef; // no scaling needed for dispersion
3339  }
3340  } else {
3341  // This else handles the numGrids==1 case.
3342  // In this case, localGridData[0] and numGridAtoms[0] aren't set to
3343  // usable values, so we reset them to point to the base buffer.
3344  // Expect the calculation to be done on localGridData[0..numGrids],
3345  // each buffer containing numGridAtoms[0..numGrids].
3346  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
3347  // localGridData[0] = localData;
3348  numGridAtoms[0] = numLocalAtoms;
3349  }
3350 
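// The first compute to run each step on this PE initializes the shared
// state: the per-compute work count, the z-line charge buffers (CPU
// path), the energy/virial accumulators, and the sequence number.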
3351  if ( ! myMgr->doWorkCount ) {
3352  myMgr->doWorkCount = myMgr->pmeComputes.size();
3353 
3354 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3355  if ( ! offload )
3356 #endif // NAMD_CUDA
3357  {
3358  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
3359 
3360  for (int i=0; i<myMgr->q_count; ++i) {
3361  memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
3362  }
3363  }
3364 
3365  for ( g=0; g<numGrids; ++g ) {
3366  myMgr->evir[g] = 0;
3367  }
3368 
3369  myMgr->strayChargeErrors = 0;
3370 
3371  myMgr->compute_sequence = sequence();
3372  }
3373 
3374  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
3375 
3376  int strayChargeErrors = 0;
3377 
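// Ewald self-energy correction: each charge interacts with its own
// Gaussian screening cloud, contributing -ewaldcof/sqrt(pi) * q_i^2 per
// atom; the LJ-PME dispersion grid instead gets +alpha^6/12 * c_i^2.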
3378  // XXX need self energy for LJ-PME
3379  // calculate self energy
3380  const BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
3381  for ( g=0; g<numGrids; ++g ) {
3382  BigReal selfEnergy = 0;
3383  data_ptr = localGridData[g];
3384  for (int i=0; i<numGridAtoms[g]; ++i) {
3385  selfEnergy += data_ptr->cg * data_ptr->cg;
3386  ++data_ptr;
3387  }
3388  if ( LJPMEOn && 1==g ) {
3389  const BigReal LJewaldcof = ComputeNonbondedUtil::LJewaldcof;
3390  double alpha6 = LJewaldcof * LJewaldcof * LJewaldcof;
3391  alpha6 = alpha6 * alpha6;
3392  selfEnergy *= (1./12.) * alpha6;
3393  } else {
3394  selfEnergy *= -1. * ewaldcof / SQRT_PI;
3395  }
3396  myMgr->evir[g][0] += selfEnergy;
3397 
3398  float **q = myMgr->q_arr + g*myMgr->fsize;
3399  char *f = myMgr->f_arr + g*myMgr->fsize;
3400 
3401  scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
3402 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3403  if ( offload ) {
3404  if ( myMgr->cuda_atoms_alloc == 0 ) { // first call
3405  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3406  cuda_errcheck("before malloc atom data for pme");
3407  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3408  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3409  cuda_errcheck("malloc atom data for pme");
3410  myMgr->cuda_atoms_count = 0;
3411  }
3412  cuda_atoms_offset = myMgr->cuda_atoms_count;
3413  int n = numGridAtoms[g];
3414  myMgr->cuda_atoms_count += n;
3415  if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
3416  CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
3417  CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
3418  cuda_errcheck("before malloc expanded atom data for pme");
3419  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3420  const float *a_data_host_old = myMgr->a_data_host;
3421  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3422  cuda_errcheck("malloc expanded host atom data for pme");
3423  memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
3424  cudaFreeHost((void*) a_data_host_old);
3425  cuda_errcheck("free expanded host atom data for pme");
3426  cudaFree(myMgr->a_data_dev);
3427  cuda_errcheck("free expanded dev atom data for pme");
3428  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3429  cuda_errcheck("malloc expanded dev atom data for pme");
3430  }
3431  float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
3432  data_ptr = localGridData[g];
3433  double order_1 = myGrid.order - 1;
3434  double K1 = myGrid.K1;
3435  double K2 = myGrid.K2;
3436  double K3 = myGrid.K3;
3437  int found_negative = 0;
3438  for ( int i=0; i<n; ++i ) {
3439  if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
3440  found_negative = 1;
3441  // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
3442  }
3443  double x_int = (int) data_ptr[i].x;
3444  double y_int = (int) data_ptr[i].y;
3445  double z_int = (int) data_ptr[i].z;
3446  a_data_host[7*i ] = data_ptr[i].x - x_int; // subtract in double precision
3447  a_data_host[7*i+1] = data_ptr[i].y - y_int;
3448  a_data_host[7*i+2] = data_ptr[i].z - z_int;
3449  a_data_host[7*i+3] = data_ptr[i].cg;
3450  x_int -= order_1; if ( x_int < 0 ) x_int += K1;
3451  y_int -= order_1; if ( y_int < 0 ) y_int += K2;
3452  z_int -= order_1; if ( z_int < 0 ) z_int += K3;
3453  a_data_host[7*i+4] = x_int;
3454  a_data_host[7*i+5] = y_int;
3455  a_data_host[7*i+6] = z_int;
3456  }
3457  if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
3458  } else
3459 #endif // NAMD_CUDA
3460  {
3461  myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
3462  myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
3463  }
3464  }
3465  myMgr->strayChargeErrors += strayChargeErrors;
3466 
3467 #ifdef TRACE_COMPUTE_OBJECTS
3468  traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
3469 #endif
3470 
3471  if ( --(myMgr->doWorkCount) == 0 ) {
3472 // cudaDeviceSynchronize(); // XXXX
3473 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3474  if ( offload ) {
3475  cuda_submit_charges_args args;
3476  args.mgr = myMgr;
3477  args.lattice = &lattice;
3478  args.sequence = sequence();
3479  CmiLock(ComputePmeMgr::cuda_lock);
3480  if ( ComputePmeMgr::cuda_busy ) {
3481  ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
3482  } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
3483  // avoid adding work to nonbonded data preparation pe
3484  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3485  } else {
3486  ComputePmeMgr::cuda_busy = true;
3487  while ( 1 ) {
3488  CmiUnlock(ComputePmeMgr::cuda_lock);
3489  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3490  CmiLock(ComputePmeMgr::cuda_lock);
3491  if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
3492  args = ComputePmeMgr::cuda_submit_charges_deque.front();
3493  ComputePmeMgr::cuda_submit_charges_deque.pop_front();
3494  } else {
3495  ComputePmeMgr::cuda_busy = false;
3496  break;
3497  }
3498  }
3499  }
3500  CmiUnlock(ComputePmeMgr::cuda_lock);
3501  } else
3502 #endif // NAMD_CUDA
3503  {
3504  myMgr->chargeGridReady(lattice,sequence());
3505  }
3506  }
3507  atomsChanged = 0;
3508 }
3509 
3510 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3511 
3512 void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
3513 
3514  int n = cuda_atoms_count;
3515  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3516  cuda_atoms_count = 0;
3517 
3518  const double before = CmiWallTimer();
3519  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3520  cudaMemcpyHostToDevice, streams[stream]);
3521  const double after = CmiWallTimer();
3522 
3523  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3524 
3525  cuda_pme_charges(
3526  bspline_coeffs_dev,
3527  q_arr_dev, ffz_dev, ffz_dev + fsize,
3528  a_data_dev, n,
3529  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3530  streams[stream]);
3531  const double after2 = CmiWallTimer();
3532 
3533  chargeGridSubmitted(lattice,sequence); // must be inside lock
3534 
3535  masterPmeMgr->charges_time = before;
3536  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3537  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3538 }
3539 
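// cuda_check_pme_charges: Ccd-driven poll for the charge-spreading
// kernel and grid download; on success the grid is handed to the FFT
// stage via sendChargeGridReady().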
3540 void cuda_check_pme_charges(void *arg, double walltime) {
3541  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
3542 
3543  cudaError_t err = cudaEventQuery(argp->end_charges);
3544  if ( err == cudaSuccess ) {
3545  traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
3546  argp->charges_time = walltime - argp->charges_time;
3547  argp->sendChargeGridReady();
3548  argp->check_charges_count = 0;
3549  } else if ( err != cudaErrorNotReady ) {
3550  char errmsg[256];
3551  sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
3552  argp->check_charges_count, walltime - argp->charges_time,
3553  argp->saved_sequence);
3554  cudaDie(errmsg,err);
3555  } else if ( ++(argp->check_charges_count) >= count_limit ) {
3556  char errmsg[256];
3557  sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
3558  argp->check_charges_count, walltime - argp->charges_time,
3559  argp->saved_sequence);
3560  cudaDie(errmsg,err);
3561  } else {
3562  CcdCallBacksReset(0,walltime); // fix Charm++
3563  CUDA_POLL(cuda_check_pme_charges, arg);
3564  }
3565 }
3566 
3567 void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
3568  saved_lattice = &lattice;
3569  saved_sequence = sequence;
3570 
3571  // cudaDeviceSynchronize(); // XXXX TESTING
3572  //int q_stride = myGrid.K3+myGrid.order-1;
3573  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3574  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3575  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3576  // }
3577  //}
3578  //CmiLock(cuda_lock);
3579 
3580  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3581  double before = CmiWallTimer();
3582  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3583  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3584  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3585  cudaMemcpyDeviceToHost, streams[stream]);
3586  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3587  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3588  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3589  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3590  //CmiUnlock(cuda_lock);
3591  // cudaDeviceSynchronize(); // XXXX TESTING
3592  // cuda_errcheck("after memcpy grid to host");
3593 
3595  pmeProxy[master_pe].pollChargeGridReady();
3596  }
3597 }
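// [Example] chargeGridSubmitted() above orders work across streams with
// events: an event recorded on the legacy default stream observes all
// previously launched kernels, and cudaStreamWaitEvent() makes the copy
// stream wait on it without blocking the host. A minimal sketch of that
// trick (drain_then_copy is a hypothetical name):

#include <cuda_runtime.h>

void drain_then_copy(float *host, const float *dev, size_t bytes,
                     cudaStream_t copy_stream, cudaEvent_t all_done) {
  // with legacy default-stream semantics this event follows all prior
  // device work, as end_all_pme_kernels does above
  cudaEventRecord(all_done, 0);
  cudaStreamWaitEvent(copy_stream, all_done, 0); // device-side wait only
  cudaMemcpyAsync(host, dev, bytes, cudaMemcpyDeviceToHost, copy_stream);
}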
3598 
3599 void ComputePmeMgr::sendChargeGridReady() {
3600  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3601  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3602  int cs = mgr->pmeComputes.size();
3603  if ( cs ) {
3604  mgr->ungridForcesCount = cs;
3605  mgr->recipEvirCount = mgr->recipEvirClients;
3606  masterPmeMgr->chargeGridSubmittedCount++;
3607  }
3608  }
3609  pmeProxy[master_pe].recvChargeGridReady();
3610 }
3611 #endif // NAMD_CUDA || NAMD_HIP
3612 
3613 void ComputePmeMgr::pollChargeGridReady() {
3614 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3615  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3616  CUDA_POLL(cuda_check_pme_charges,this);
3617 #else
3618  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3619 #endif
3620 }
3621 
3622 void ComputePmeMgr::recvChargeGridReady() {
3623  chargeGridReady(*saved_lattice,saved_sequence);
3624 }
3625 
3626 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
3627 
3628 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3629  if ( offload ) {
3630  int errcount = 0;
3631  int q_stride = myGrid.K3+myGrid.order-1;
3632  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3633  f_arr[j] = ffz_host[j];
3634  if ( ffz_host[j] & ~1 ) ++errcount;
3635  }
3636  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3637  }
3638 #endif
3639  recipEvirCount = recipEvirClients;
3640  ungridForcesCount = pmeComputes.size();
3641 
3642  for (int j=0; j<myGrid.order-1; ++j) {
3643  fz_arr[j] |= fz_arr[myGrid.K3+j];
3644  }
3645 
3646  if ( usePencils ) {
3647  sendPencils(lattice,sequence);
3648  } else {
3649  sendData(lattice,sequence);
3650  }
3651 }
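// [Example] The fz_arr fold above exists because order-p B-spline
// spreading writes into p-1 "ghost" z planes past the K3 boundary, which
// are periodic images of planes 0..p-2. A self-contained sketch of the
// same wraparound fold applied to charge data (fold_ghost_planes is a
// hypothetical helper):

#include <vector>

void fold_ghost_planes(std::vector<float> &zline, int K3, int order) {
  // zline holds K3 + order - 1 samples; add the ghost tail onto the head
  for ( int j = 0; j < order - 1; ++j )
    zline[j] += zline[K3 + j];
}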
3652 
3653 
3654 void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {
3655 
3656  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3657 
3658 #if 0 && USE_PERSISTENT
3659  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3660 #endif
3661  int K1 = myGrid.K1;
3662  int K2 = myGrid.K2;
3663  int dim2 = myGrid.dim2;
3664  int dim3 = myGrid.dim3;
3665  int block1 = myGrid.block1;
3666  int block2 = myGrid.block2;
3667 
3668  // int savedMessages = 0;
3669  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3670 
3671  for (int ap=first; ap<=last; ++ap) {
3672  int ib = activePencils[ap].i;
3673  int jb = activePencils[ap].j;
3674  int ibegin = ib*block1;
3675  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3676  int jbegin = jb*block2;
3677  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3678  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3679 
3680  int fcount = 0;
3681  for ( int g=0; g<numGrids; ++g ) {
3682  char *f = f_arr + g*fsize;
3683 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3684  if ( offload ) {
3685  int errcount = 0;
3686  for ( int i=ibegin; i<iend; ++i ) {
3687  for ( int j=jbegin; j<jend; ++j ) {
3688  int k = i*dim2+j;
3689  f[k] = ffz_host[k];
3690  fcount += f[k];
3691  if ( ffz_host[k] & ~1 ) ++errcount;
3692  }
3693  }
3694  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3695  } else
3696 #endif
3697  for ( int i=ibegin; i<iend; ++i ) {
3698  for ( int j=jbegin; j<jend; ++j ) {
3699  fcount += f[i*dim2+j];
3700  }
3701  }
3702  }
3703 
3704 #ifdef NETWORK_PROGRESS
3705  CmiNetworkProgress();
3706 #endif
3707 
3708  if ( ! pencilActive[ib*yBlocks+jb] )
3709  NAMD_bug("PME activePencils list inconsistent");
3710 
3711  int zlistlen = 0;
3712  for ( int i=0; i<myGrid.K3; ++i ) {
3713  if ( fz_arr[i] ) ++zlistlen;
3714  }
3715 
3716  int hd = ( fcount? 1 : 0 ); // has data?
3717  // if ( ! hd ) ++savedMessages;
3718 
3719 
3720  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3721  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3722  msg->sourceNode = sourcepe;
3723  msg->hasData = hd;
3724  msg->lattice = lattice;
3725  if ( hd ) {
3726 #if 0
3727  msg->start = fstart;
3728  msg->len = flen;
3729 #else
3730  msg->start = -1; // obsolete?
3731  msg->len = -1; // obsolete?
3732 #endif
3733  msg->zlistlen = zlistlen;
3734  int *zlist = msg->zlist;
3735  zlistlen = 0;
3736  for ( int i=0; i<myGrid.K3; ++i ) {
3737  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3738  }
3739  char *fmsg = msg->fgrid;
3740  float *qmsg = msg->qgrid;
3741  for ( int g=0; g<numGrids; ++g ) {
3742  char *f = f_arr + g*fsize;
3743  float **q = q_arr + g*fsize;
3744  for ( int i=ibegin; i<iend; ++i ) {
3745  for ( int j=jbegin; j<jend; ++j ) {
3746  *(fmsg++) = f[i*dim2+j];
3747  if( f[i*dim2+j] ) {
3748  for (int h=0; h<myGrid.order-1; ++h) {
3749  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3750  }
3751  for ( int k=0; k<zlistlen; ++k ) {
3752  *(qmsg++) = q[i*dim2+j][zlist[k]];
3753  }
3754  }
3755  }
3756  }
3757  }
3758  }
3759 
3760  msg->sequence = compute_sequence;
3761  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3762  CmiEnableUrgentSend(1);
3763 #if USE_NODE_PAR_RECEIVE
3764  msg->destElem=CkArrayIndex3D(ib,jb,0);
3765  CProxy_PmePencilMap lzm = npMgr->zm;
3766  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3767  int destnode = CmiNodeOf(destproc);
3768 
3769 #if 0
3770  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3771 #endif
3772  pmeNodeProxy[destnode].recvZGrid(msg);
3773 #if 0
3774  CmiUsePersistentHandle(NULL, 0);
3775 #endif
3776 #else
3777 #if 0
3778  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3779 #endif
3780  zPencil(ib,jb,0).recvGrid(msg);
3781 #if 0
3782  CmiUsePersistentHandle(NULL, 0);
3783 #endif
3784 #endif
3785  CmiEnableUrgentSend(0);
3786  }
3787 
3788 
3789  // if ( savedMessages ) {
3790  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3791  // }
3792 
3793 }
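// [Example] The PmeGridMsg built above is sparse in z: one shared zlist
// of occupied planes, one flag byte per (i,j) column, and charge values
// only for flagged columns at the listed planes. A minimal sketch of
// that packing; SparseGridMsg/pack_column are illustrative stand-ins,
// not the PmeGridMsg layout itself.

#include <vector>

struct SparseGridMsg {
  std::vector<int> zlist;    // occupied z planes, shared by all columns
  std::vector<char> fgrid;   // per-column has-data flags
  std::vector<float> qgrid;  // packed charges for flagged columns only
};

void pack_column(SparseGridMsg &m, const float *q, bool has_data) {
  m.fgrid.push_back(has_data ? 1 : 0);
  if ( has_data )
    for ( int z : m.zlist ) m.qgrid.push_back(q[z]); // listed planes only
}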
3794 
3795 
3796 void ComputePmeMgr::sendPencilsHelper(int iter) {
3797  nodePmeMgr->sendPencilsHelper(iter);
3798 }
3799 
3800 void NodePmeMgr::sendPencilsHelper(int iter) {
3801 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3802  ComputePmeMgr *obj = masterPmeMgr;
3803  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
3804 #else
3805  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
3806 #endif
3807 }
3808 
3809 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
3810 
3811  sendDataHelper_lattice = &lattice;
3812  sendDataHelper_sequence = sequence;
3813  sendDataHelper_sourcepe = CkMyPe();
3814 
3815 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3816  if ( offload ) {
3817  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3818 #if CMK_MULTICORE
3819  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3820  int ib = activePencils[ap].i;
3821  int jb = activePencils[ap].j;
3822  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3823  pmeProxy[destproc].sendPencilsHelper(ap);
3824 #else
3825  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3826 #endif
3827  }
3828  } else
3829 #endif
3830  {
3831  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3832  }
3833 
3834  if ( strayChargeErrors ) {
3835  strayChargeErrors = 0;
3836  iout << iERROR << "Stray PME grid charges detected: "
3837  << CkMyPe() << " sending to (x,y)";
3838  int K1 = myGrid.K1;
3839  int K2 = myGrid.K2;
3840  int dim2 = myGrid.dim2;
3841  int block1 = myGrid.block1;
3842  int block2 = myGrid.block2;
3843  for (int ib=0; ib<xBlocks; ++ib) {
3844  for (int jb=0; jb<yBlocks; ++jb) {
3845  int ibegin = ib*block1;
3846  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3847  int jbegin = jb*block2;
3848  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3849  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3850 
3851  for ( int g=0; g<numGrids; ++g ) {
3852  char *f = f_arr + g*fsize;
3853  if ( ! pencilActive[ib*yBlocks+jb] ) {
3854  for ( int i=ibegin; i<iend; ++i ) {
3855  for ( int j=jbegin; j<jend; ++j ) {
3856  if ( f[i*dim2+j] == 3 ) {
3857  f[i*dim2+j] = 2;
3858  iout << " (" << i << "," << j << ")";
3859  }
3860  }
3861  }
3862  }
3863  }
3864  }
3865  }
3866  iout << "\n" << endi;
3867  }
3868 
3869 }
3870 
3871 
3872 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
3873 
3874  int K1 = myGrid.K1;
3875  int K2 = myGrid.K2;
3876  int dim2 = myGrid.dim2;
3877  int dim3 = myGrid.dim3;
3878  int block1 = myGrid.block1;
3879  int block2 = myGrid.block2;
3880 
3881  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3882  int ib = msg->sourceNode / yBlocks;
3883  int jb = msg->sourceNode % yBlocks;
3884 
3885  int ibegin = ib*block1;
3886  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3887  int jbegin = jb*block2;
3888  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3889 
3890  int zlistlen = msg->zlistlen;
3891  int *zlist = msg->zlist;
3892  float *qmsg = msg->qgrid;
3893  int g;
3894  for ( g=0; g<numGrids; ++g ) {
3895  char *f = f_arr + g*fsize;
3896  float **q = q_arr + g*fsize;
3897  for ( int i=ibegin; i<iend; ++i ) {
3898  for ( int j=jbegin; j<jend; ++j ) {
3899  if( f[i*dim2+j] ) {
3900  f[i*dim2+j] = 0;
3901  for ( int k=0; k<zlistlen; ++k ) {
3902  q[i*dim2+j][zlist[k]] = *(qmsg++);
3903  }
3904  for (int h=0; h<myGrid.order-1; ++h) {
3905  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3906  }
3907  }
3908  }
3909  }
3910  }
3911 }
3912 
3913 
3914 void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {
3915 
3916  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3917 
3918  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3919 
3920  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3921  for (int j=first; j<=last; j++) {
3922  int pe = gridPeOrder[j]; // different order
3923  if ( ! recipPeDest[pe] && ! errors ) continue;
3924  int start = pe * bsize;
3925  int len = bsize;
3926  if ( start >= qsize ) { start = 0; len = 0; }
3927  if ( start + len > qsize ) { len = qsize - start; }
3928  int zdim = myGrid.dim3;
3929  int fstart = start / zdim;
3930  int flen = len / zdim;
3931  int fcount = 0;
3932  int i;
3933 
3934  int g;
3935  for ( g=0; g<numGrids; ++g ) {
3936  char *f = f_arr + fstart + g*fsize;
3937 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3938  if ( offload ) {
3939  int errcount = 0;
3940  for ( i=0; i<flen; ++i ) {
3941  f[i] = ffz_host[fstart+i];
3942  fcount += f[i];
3943  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3944  }
3945  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3946  } else
3947 #endif
3948  for ( i=0; i<flen; ++i ) {
3949  fcount += f[i];
3950  }
3951  if ( ! recipPeDest[pe] ) {
3952  int errfound = 0;
3953  for ( i=0; i<flen; ++i ) {
3954  if ( f[i] == 3 ) {
3955  errfound = 1;
3956  break;
3957  }
3958  }
3959  if ( errfound ) {
3960  iout << iERROR << "Stray PME grid charges detected: "
3961  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3962  int iz = -1;
3963  for ( i=0; i<flen; ++i ) {
3964  if ( f[i] == 3 ) {
3965  f[i] = 2;
3966  int jz = (i+fstart)/myGrid.K2;
3967  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3968  }
3969  }
3970  iout << "\n" << endi;
3971  }
3972  }
3973  }
3974 
3975 #ifdef NETWORK_PROGRESS
3976  CmiNetworkProgress();
3977 #endif
3978 
3979  if ( ! recipPeDest[pe] ) continue;
3980 
3981  int zlistlen = 0;
3982  for ( i=0; i<myGrid.K3; ++i ) {
3983  if ( fz_arr[i] ) ++zlistlen;
3984  }
3985 
3986  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3987  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3988 
3989  msg->sourceNode = sourcepe;
3990  msg->lattice = lattice;
3991  msg->start = fstart;
3992  msg->len = flen;
3993  msg->zlistlen = zlistlen;
3994  int *zlist = msg->zlist;
3995  zlistlen = 0;
3996  for ( i=0; i<myGrid.K3; ++i ) {
3997  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3998  }
3999  float *qmsg = msg->qgrid;
4000  for ( g=0; g<numGrids; ++g ) {
4001  char *f = f_arr + fstart + g*fsize;
4002  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
4003  float **q = q_arr + fstart + g*fsize;
4004  for ( i=0; i<flen; ++i ) {
4005  if ( f[i] ) {
4006  for (int h=0; h<myGrid.order-1; ++h) {
4007  q[i][h] += q[i][myGrid.K3+h];
4008  }
4009  for ( int k=0; k<zlistlen; ++k ) {
4010  *(qmsg++) = q[i][zlist[k]];
4011  }
4012  }
4013  }
4014  }
4015 
4016  msg->sequence = compute_sequence;
4017  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
4018  pmeProxy[gridPeMap[pe]].recvGrid(msg);
4019  }
4020 
4021 }
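// [Example] The slab arithmetic above: grid PE 'pe' owns floats
// [pe*bsize, pe*bsize+bsize) clamped to the padded grid size qsize, and
// fstart/flen are that range in whole z-lines of zdim floats each. A
// tiny sketch of the clamping (Slab/slab_for_pe are hypothetical names):

struct Slab { int start, len; };

Slab slab_for_pe(int pe, int bsize, int qsize) {
  Slab s = { pe * bsize, bsize };
  if ( s.start >= qsize ) { s.start = 0; s.len = 0; }          // wholly past the end
  else if ( s.start + s.len > qsize ) s.len = qsize - s.start; // partial tail slab
  return s;
}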
4022 
4023 void ComputePmeMgr::sendDataHelper(int iter) {
4024  nodePmeMgr->sendDataHelper(iter);
4025 }
4026 
4027 void NodePmeMgr::sendDataHelper(int iter) {
4028 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4029  ComputePmeMgr *obj = masterPmeMgr;
4030  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
4031 #else
4032  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
4033 #endif
4034 }
4035 
4036 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
4037 
4038  sendDataHelper_lattice = &lattice;
4039  sendDataHelper_sequence = sequence;
4040  sendDataHelper_sourcepe = CkMyPe();
4041  sendDataHelper_errors = strayChargeErrors;
4042  strayChargeErrors = 0;
4043 
4044 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4045  if ( offload ) {
4046  for ( int i=0; i < numGridPes; ++i ) {
4047  int pe = gridPeOrder[i]; // different order
4048  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
4049 #if CMK_MULTICORE
4050  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
4051  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
4052 #else
4053  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
4054 #endif
4055  }
4056  } else
4057 #endif
4058  {
4059  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
4060  }
4061 
4062 }
4063 
4064 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
4065 
4066  int zdim = myGrid.dim3;
4067  int flen = msg->len;
4068  int fstart = msg->start;
4069  int zlistlen = msg->zlistlen;
4070  int *zlist = msg->zlist;
4071  float *qmsg = msg->qgrid;
4072  int g;
4073  for ( g=0; g<numGrids; ++g ) {
4074  char *f = msg->fgrid + g*flen;
4075  float **q = q_arr + fstart + g*fsize;
4076  for ( int i=0; i<flen; ++i ) {
4077  if ( f[i] ) {
4078  f[i] = 0;
4079  for ( int k=0; k<zlistlen; ++k ) {
4080  q[i][zlist[k]] = *(qmsg++);
4081  }
4082  for (int h=0; h<myGrid.order-1; ++h) {
4083  q[i][myGrid.K3+h] = q[i][h];
4084  }
4085  }
4086  }
4087  }
4088 }
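// [Example] copyResults() above mirrors the first order-1 z entries into
// the ghost region past K3, the inverse of the fold done before sending,
// so force interpolation can read a contiguous window without modular
// indexing. The core step, isolated (mirror_ghost_planes is a
// hypothetical helper):

void mirror_ghost_planes(float *zline, int K3, int order) {
  for ( int h = 0; h < order - 1; ++h )
    zline[K3 + h] = zline[h]; // periodic image of the first planes
}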
4089 
4090 void ComputePme::ungridForces() {
4091 
4092  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
4093 
4094  SimParameters *simParams = Node::Object()->simParameters;
4095 
4096  localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
4097  Vector *localResults = localResults_alloc.begin();
4098  Vector *gridResults;
4099 
4100  if ( alchOn || lesOn || selfOn || pairOn ) {
4101  for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
4102  gridResults = localResults + numLocalAtoms;
4103  } else {
4104  gridResults = localResults;
4105  }
4106 
4107  Vector pairForce = 0.;
4108  Lattice &lattice = patch->flags.lattice;
4109  int g = 0;
4110  if(!simParams->commOnly) {
4111  for ( g=0; g<numGrids; ++g ) {
4112 #ifdef NETWORK_PROGRESS
4113  CmiNetworkProgress();
4114 #endif
4115 
4116 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4117  if ( offload ) {
4118  int errfound = 0;
4119  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4120  // Neither isnan() nor x != x worked when testing on Cray; this does.
4121  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; } // CUDA NaN
4122  gridResults[i].x = f_data_host[3*i];
4123  gridResults[i].y = f_data_host[3*i+1];
4124  gridResults[i].z = f_data_host[3*i+2];
4125  }
4126  if ( errfound ) {
4127  int errcount = 0;
4128  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4129  float f = f_data_host[3*i];
4130  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { // CUDA NaN
4131  ++errcount;
4132  gridResults[i] = 0.;
4133  }
4134  }
4135  iout << iERROR << "Stray PME grid charges detected: "
4136  << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
4137  }
4138  } else
4139 #endif // NAMD_CUDA
4140  {
4141  myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
4142  }
4143  scale_forces(gridResults, numGridAtoms[g], lattice);
4144 
4145  if (LJPMEOn) {
4146  if (0==g) {
4147  // finished loop g==0, next loop gathers
4148  // LJ-PME force contributions into upper buffer
4149  gridResults += numLocalAtoms;
4150  } else {
4151  // sum LJ-PME forces into electrostatic forces buffer
4152  for (int i=0; i < numLocalAtoms; i++) {
4153  localResults[i] += gridResults[i];
4154  }
4155  }
4156  } else if (alchOn) {
4157  float scale = 1.;
4158  BigReal elecLambdaUp, elecLambdaDown;
4159  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4160  myMgr->alchLambda = alchLambda;
4161  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4162  myMgr->alchLambda2 = alchLambda2;
4163  elecLambdaUp = simParams->getElecLambda(alchLambda);
4164  elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
4165 
4166  if ( g == 0 ) scale = elecLambdaUp;
4167  else if ( g == 1 ) scale = elecLambdaDown;
4168  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4169 
4170  if (alchDecouple) {
4171  if ( g == 2 ) scale = 1 - elecLambdaUp;
4172  else if ( g == 3 ) scale = 1 - elecLambdaDown;
4173  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4174  }
4175  int nga = 0;
4176  if (!alchDecouple) {
4177  if (g < 2 ) {
4178  for(int i=0; i<numLocalAtoms; ++i) {
4179  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
4180  // (g=0: only partition 0 and partition 1 and partition 3)
4181  // (g=1: only partition 0 and partition 2 and partition 4)
4182  localResults[i] += gridResults[nga++] * scale;
4183  }
4184  }
4185  } else {
4186  for(int i=0; i<numLocalAtoms; ++i) {
4187  if ( localPartition[i] == 0 ) {
4188  // (g=2: only partition 0)
4189  localResults[i] += gridResults[nga++] * scale;
4190  }
4191  }
4192  }
4193  } else { // alchDecouple
4194  if ( g < 2 ) {
4195  for(int i=0; i<numLocalAtoms; ++i) {
4196  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4197  // g = 0: partition 0 or partition 1
4198  // g = 1: partition 0 or partition 2
4199  localResults[i] += gridResults[nga++] * scale;
4200  }
4201  }
4202  }
4203  else {
4204  for(int i=0; i<numLocalAtoms; ++i) {
4205  if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
4206  // g = 2: partition 1 only
4207  // g = 3: partition 2 only
4208  // g = 4: partition 0 only
4209  localResults[i] += gridResults[nga++] * scale;
4210  }
4211  }
4212  }
4213  }
4214  } else if ( lesOn ) {
4215  float scale = 1.;
4216  if ( alchFepOn ) {
4217  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4218  myMgr->alchLambda = alchLambda;
4219  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4220  myMgr->alchLambda2 = alchLambda2;
4221  if ( g == 0 ) scale = alchLambda;
4222  else if ( g == 1 ) scale = 1. - alchLambda;
4223  } else if ( lesOn ) {
4224  scale = 1.0 / (float)lesFactor;
4225  }
4226  int nga = 0;
4227  for(int i=0; i<numLocalAtoms; ++i) {
4228  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4229  localResults[i] += gridResults[nga++] * scale;
4230  }
4231  }
4232  } else if ( selfOn ) {
4233  PmeParticle *lgd = localGridData[g];
4234  int nga = 0;
4235  for(int i=0; i<numLocalAtoms; ++i) {
4236  if ( localPartition[i] == 1 ) {
4237  pairForce += gridResults[nga]; // should add up to almost zero
4238  localResults[i] += gridResults[nga++];
4239  }
4240  }
4241  } else if ( pairOn ) {
4242  if ( g == 0 ) {
4243  int nga = 0;
4244  for(int i=0; i<numLocalAtoms; ++i) {
4245  if ( localPartition[i] == 1 ) {
4246  pairForce += gridResults[nga];
4247  }
4248  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
4249  localResults[i] += gridResults[nga++];
4250  }
4251  }
4252  } else if ( g == 1 ) {
4253  int nga = 0;
4254  for(int i=0; i<numLocalAtoms; ++i) {
4255  if ( localPartition[i] == g ) {
4256  pairForce -= gridResults[nga]; // should add up to almost zero
4257  localResults[i] -= gridResults[nga++];
4258  }
4259  }
4260  } else {
4261  int nga = 0;
4262  for(int i=0; i<numLocalAtoms; ++i) {
4263  if ( localPartition[i] == g ) {
4264  localResults[i] -= gridResults[nga++];
4265  }
4266  }
4267  }
4268  }
4269  }
4270  }
4271 
4272  Vector *results_ptr = localResults;
4273 
4274  // add in forces
4275  {
4276  Results *r = forceBox->open();
4277  Force *f = r->f[Results::slow];
4278  int numAtoms = patch->getNumAtoms();
4279 
4280  if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
4281  for(int i=0; i<numAtoms; ++i) {
4282  f[i].x += results_ptr->x;
4283  f[i].y += results_ptr->y;
4284  f[i].z += results_ptr->z;
4285  ++results_ptr;
4286  }
4287  }
4288  forceBox->close(&r);
4289  }
4290 
4291  if ( pairOn || selfOn ) {
4292  ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
4293  }
4294 
4295 }
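// [Example] The per-grid scale factors applied in ungridForces() above
// form a fixed ladder over the alchemical grids. Factored out as a pure
// function (grid_scale is a hypothetical name; lUp/lDown play the roles
// of elecLambdaUp/elecLambdaDown):

double grid_scale(int g, double lUp, double lDown, bool decouple) {
  if ( g == 0 ) return lUp;
  if ( g == 1 ) return lDown;
  if ( ! decouple )
    return -(lUp + lDown - 1.);  // g == 2: remove double-counted core grid
  if ( g == 2 ) return 1. - lUp;   // decoupled partition 1 self-interaction
  if ( g == 3 ) return 1. - lDown; // decoupled partition 2 self-interaction
  return -(lUp + lDown - 1.);      // g == 4
}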
4296 
4297 void ComputePmeMgr::submitReductions() {
4298 
4299  SimParameters *simParams = Node::Object()->simParameters;
4300 
4301  for ( int g=0; g<numGrids; ++g ) {
4302  double scale = 1.;
4303  if (alchOn) {
4304  BigReal elecLambdaUp, elecLambdaDown;
4305  // alchLambda set on each step in ComputePme::ungridForces()
4306  if ( alchLambda < 0 || alchLambda > 1 ) {
4307  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4308  }
4309  elecLambdaUp = simParams->getElecLambda(alchLambda);
4310  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4311  if ( g == 0 ) scale = elecLambdaUp;
4312  else if ( g == 1 ) scale = elecLambdaDown;
4313  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4314  if (alchDecouple) {
4315  if ( g == 2 ) scale = 1-elecLambdaUp;
4316  else if ( g == 3 ) scale = 1-elecLambdaDown;
4317  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4318  }
4319  } else if ( lesOn ) {
4320  scale = 1.0 / lesFactor;
4321  } else if ( pairOn ) {
4322  scale = ( g == 0 ? 1. : -1. );
4323  }
4324  if ( LJPMEOn && 1==g ) {
4325  reduction->item(REDUCTION_LJ_ENERGY_SLOW) += evir[g][0] * scale;
4326  } else {
4327  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4328  }
4329  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4330  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4331  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4332  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4333  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4334  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4335  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4336  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4337  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4338 
4339  if (alchFepOn) {
4340  double scale2 = 0.;
4341  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4342  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4343  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4344  if ( g == 0 ) scale2 = elecLambda2Up;
4345  else if ( g == 1 ) scale2 = elecLambda2Down;
4346  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4347  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4348  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4349  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4350  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4351  }
4352 
4353  if (alchThermIntOn) {
4354 
4355  // no decoupling:
4356  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4357  // (interactions between all atoms [partition 0 OR partition 1],
4358  // minus all [within partition 0])
4359  // U = elecLambdaUp * (U[0] - U[2])
4360  // dU/dl = U[0] - U[2];
4361 
4362  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4363  // (interactions between all atoms [partition 0 OR partition 2],
4364  // minus all [within partition 0])
4365  // U = elecLambdaDown * (U[1] - U[2])
4366  // dU/dl = U[1] - U[2];
4367 
4368  // alchDecouple:
4369  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4370  // (interactions between all atoms [partition 0 OR partition 1]
4371  // minus all [within partition 1] minus all [within partition 0]
4372  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4373  // dU/dl = U[0] - U[2] - U[4];
4374 
4375  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4376  // (interactions between all atoms [partition 0 OR partition 2]
4377  // minus all [within partition 2] minus all [within partition 0]
4378  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4379  // dU/dl = U[1] - U[3] - U[4];
4380 
4381 
4382  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4383  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4384  if (!alchDecouple) {
4385  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4386  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4387  }
4388  else { // alchDecouple
4389  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4390  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4391  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4392  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4393  }
4394  }
4395  }
4396 
4397  alchLambda = -1.; // illegal value to catch if not updated
4398 
4399  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4400  reduction->submit();
4401 
4402  for ( int i=0; i<heldComputes.size(); ++i ) {
4403  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4404  }
4405  heldComputes.resize(0);
4406 }
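// [Example] The reduction above assumes the PmeReduction layout
// evir[g][0] = energy and evir[g][1..6] = the upper triangle
// (xx,xy,xz,yy,yz,zz) of the symmetric virial, mirrored into a full 3x3
// on submission. A sketch of that expansion (expand_virial is a
// hypothetical helper):

void expand_virial(const double evir[7], double v[3][3]) {
  v[0][0] = evir[1]; v[0][1] = evir[2]; v[0][2] = evir[3];
  v[1][0] = evir[2]; v[1][1] = evir[4]; v[1][2] = evir[5];
  v[2][0] = evir[3]; v[2][1] = evir[5]; v[2][2] = evir[6];
}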
4407 
4408 #if USE_TOPOMAP
4409 
4410 #define NPRIMES 8
4411 const static unsigned int NAMDPrimes[] = {
4412  3,
4413  5,
4414  7,
4415  11,
4416  13,
4417  17,
4418  19,
4419  23,
4420  29,
4421  31,
4422  37,
4423  59,
4424  73,
4425  93,
4426  113,
4427  157,
4428  307,
4429  617,
4430  1217 // This should be enough for 64K nodes of BGL.
4431 };
4432 
4433 #include "RecBisection.h"
4434 
4435 /***-----------------------------------------------------**********
4436  The Orthogonal Recursive Bisection strategy, which allocates PME
4437  objects close to the patches they communicate, and at the same
4438  time spreads them around the grid
4439 ****----------------------------------------------------------****/
4440 
4441 bool generateBGLORBPmePeList(int *pemap, int numPes,
4442  int *block_pes, int nbpes) {
4443 
4444  PatchMap *pmap = PatchMap::Object();
4445  int *pmemap = new int [CkNumPes()];
4446 
4447  if (pemap == NULL)
4448  return false;
4449 
4450  TopoManager tmgr;
4451 
4452  memset(pmemap, 0, sizeof(int) * CkNumPes());
4453 
4454  for(int count = 0; count < CkNumPes(); count++) {
4455  if(count < nbpes)
4456  pmemap[block_pes[count]] = 1;
4457 
4458  if(pmap->numPatchesOnNode(count)) {
4459  pmemap[count] = 1;
4460 
4461  //Assumes an XYZT mapping !!
4462  if(tmgr.hasMultipleProcsPerNode()) {
4463  pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
4464  }
4465  }
4466  }
4467 
4468  if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
4469  //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
4470  return false;
4471 
4472  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4473  Node *node = nd.ckLocalBranch();
4474  SimParameters *simParams = node->simParameters;
4475 
4476  //first split PME processors into patch groups
4477 
4478  int xsize = 0, ysize = 0, zsize = 0;
4479 
4480  xsize = tmgr.getDimNX();
4481  ysize = tmgr.getDimNY();
4482  zsize = tmgr.getDimNZ();
4483 
4484  int nx = xsize, ny = ysize, nz = zsize;
4485  DimensionMap dm;
4486 
4487  dm.x = 0;
4488  dm.y = 1;
4489  dm.z = 2;
4490 
4491  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
4492 
4493  //group size processors have to be allocated to each YZ plane
4494  int group_size = numPes/nx;
4495  if(numPes % nx)
4496  group_size ++;
4497 
4498  int my_prime = NAMDPrimes[0];
4499  int density = (ny * nz)/group_size + 1;
4500  int count = 0;
4501 
4502  //Choose a suitable prime Number
4503  for(count = 0; count < NPRIMES; count ++) {
4504  //Find a prime just greater than the density
4505  if(density < NAMDPrimes[count]) {
4506  my_prime = NAMDPrimes[count];
4507  break;
4508  }
4509  }
4510 
4511  if(count == NPRIMES)
4512  my_prime = NAMDPrimes[NPRIMES-1];
4513 
4514  //int gcount = numPes/2;
4515  int gcount = 0;
4516  int npme_pes = 0;
4517 
4518  int coord[3];
4519 
4520  for(int x = 0; x < nx; x++) {
4521  coord[0] = (x + nx/2)%nx;
4522 
4523  for(count=0; count < group_size && npme_pes < numPes; count++) {
4524  int dest = (count + 1) * my_prime;
4525  dest = dest % (ny * nz);
4526 
4527  coord[2] = dest / ny;
4528  coord[1] = dest - coord[2] * ny;
4529 
4530  //Locate where in the actual grid the processor is
4531  int destPe = coord[dm.x] + coord[dm.y] * xsize +
4532  coord[dm.z] * xsize* ysize;
4533 
4534  if(pmemap[destPe] == 0) {
4535  pemap[gcount++] = destPe;
4536  pmemap[destPe] = 1;
4537 
4538  if(tmgr.hasMultipleProcsPerNode())
4539  pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
4540 
4541  npme_pes ++;
4542  }
4543  else {
4544  for(int pos = 1; pos < ny * nz; pos++) {
4545 
4546  coord[2] += pos / ny;
4547  coord[1] += pos % ny;
4548 
4549  coord[2] = coord[2] % nz;
4550  coord[1] = coord[1] % ny;
4551 
4552  int newdest = coord[dm.x] + coord[dm.y] * xsize +
4553  coord[dm.z] * xsize * ysize;
4554 
4555  if(pmemap[newdest] == 0) {
4556  pemap[gcount++] = newdest;
4557  pmemap[newdest] = 1;
4558 
4559  if(tmgr.hasMultipleProcsPerNode())
4560  pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
4561 
4562  npme_pes ++;
4563  break;
4564  }
4565  }
4566  }
4567  }
4568 
4569  if(gcount == numPes)
4570  gcount = 0;
4571 
4572  if(npme_pes >= numPes)
4573  break;
4574  }
4575 
4576  delete [] pmemap;
4577 
4578  if(npme_pes != numPes)
4579  //NAMD_bug("ORB PME allocator failed\n");
4580  return false;
4581 
4582  return true;
4583 }
4584 
4585 #endif
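// [Example] The ORB allocator above scatters group_size PME ranks across
// an ny*nz plane by stepping with a prime stride, so consecutive picks
// land far apart and the walk visits distinct cells before repeating
// (distinct as long as the prime does not divide the plane size). A
// minimal sketch (prime_stride_picks is a hypothetical name):

#include <vector>

std::vector<int> prime_stride_picks(int plane_cells, int count, int prime) {
  std::vector<int> picks;
  for ( int c = 0; c < count; ++c )
    picks.push_back(((c + 1) * prime) % plane_cells);
  return picks;
}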
4586 
4587 template <class T> class PmePencil : public T {
4588 public:
4589  PmePencil() {
4590  data = 0;
4591  work = 0;
4592  send_order = 0;
4593  needs_reply = 0;
4594 #if USE_PERSISTENT
4595  trans_handle = untrans_handle = ungrid_handle = NULL;
4596 #endif
4597  }
4598  ~PmePencil() {
4599 #ifdef NAMD_FFTW
4600  fftwf_free(data);
4601 #endif
4602  delete [] work;
4603  delete [] send_order;
4604  delete [] needs_reply;
4605  }
4606  void base_init(PmePencilInitMsg *msg) {
4607  imsg=0;
4608  imsgb=0;
4609  hasData=0;
4610  initdata = msg->data;
4611  }
4612  void order_init(int nBlocks) {
4613  send_order = new int[nBlocks];
4614  for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
4615  if ( Node::Object()->simParameters->PMESendOrder ) {
4616  std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
4617  } else {
4618  Random rand(CkMyPe());
4619  rand.reorder(send_order,nBlocks);
4620  }
4621  needs_reply = new int[nBlocks];
4622  offload = 0;
4623  }
4624  PmePencilInitMsgData initdata;
4625  Lattice lattice;
4626  PmeReduction evir;
4627  int sequence; // used for priorities
4628 #ifndef CmiMemoryAtomicType
4629  typedef int AtomicInt;
4630 #else
4631  typedef CmiMemoryAtomicInt AtomicInt;
4632 #endif
4633  AtomicInt imsg; // used in sdag code
4634  AtomicInt imsgb; // Node par uses distinct counter for back path
4635  int hasData; // used in message elimination
4636  int offload;
4637  float *data;
4638  float *work;
4639  int *send_order;
4640  int *needs_reply;
4641 #if USE_PERSISTENT
4642  PersistentHandle *trans_handle;
4643  PersistentHandle *untrans_handle;
4644  PersistentHandle *ungrid_handle;
4645 #endif
4646 };
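// [Example] order_init() above permutes send_order (bit-reversed sort or
// random shuffle) so the pencils do not all message block 0 first and
// serialize on one destination. A sketch of a bit-reversal ordering in
// the spirit of sortop_bit_reversed (rev_bits/make_send_order are
// hypothetical names):

#include <algorithm>
#include <vector>

static unsigned rev_bits(unsigned v) {  // reverse all 32 bits
  unsigned r = 0;
  for ( int i = 0; i < 32; ++i ) { r = (r << 1) | (v & 1); v >>= 1; }
  return r;
}

void make_send_order(std::vector<int> &order, int nBlocks) {
  order.resize(nBlocks);
  for ( int i = 0; i < nBlocks; ++i ) order[i] = i;
  std::sort(order.begin(), order.end(),
            [](int a, int b) { return rev_bits(a) < rev_bits(b); });
}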
4647 
4648 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
4649 public:
4650  PmeZPencil_SDAG_CODE
4651  PmeZPencil() { __sdag_init(); setMigratable(false); }
4652  PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
4653  ~PmeZPencil() {
4654  #ifdef NAMD_FFTW
4655  #ifdef NAMD_FFTW_3
4656  delete [] forward_plans;
4657  delete [] backward_plans;
4658  #endif
4659  #endif
4660  }
4661  void fft_init();
4662  void recv_grid(const PmeGridMsg *);
4663  void forward_fft();
4664  void send_trans();
4665  void send_subset_trans(int fromIdx, int toIdx);
4666  void recv_untrans(const PmeUntransMsg *);
4667  void recvNodeAck(PmeAckMsg *);
4668  void node_process_untrans(PmeUntransMsg *);
4669  void node_process_grid(PmeGridMsg *);
4670  void backward_fft();
4671  void send_ungrid(PmeGridMsg *);
4672  void send_all_ungrid();
4673  void send_subset_ungrid(int fromIdx, int toIdx);
4674 private:
4675  ResizeArray<PmeGridMsg *> grid_msgs;
4676  ResizeArray<int> work_zlist;
4677 #ifdef NAMD_FFTW
4678 #ifdef NAMD_FFTW_3
4679  fftwf_plan forward_plan, backward_plan;
4680 
4681  //for ckloop usage
4682  int numPlans;
4683  fftwf_plan *forward_plans, *backward_plans;
4684 #else
4685  rfftwnd_plan forward_plan, backward_plan;
4686 #endif
4687 #endif
4688 
4689  int nx, ny;
4690 #if USE_PERSISTENT
4691  void setup_persistent() {
4692  int hd = 1;// ( hasData ? 1 : 0 );
4693  int zBlocks = initdata.zBlocks;
4694  int block3 = initdata.grid.block3;
4695  int dim3 = initdata.grid.dim3;
4696  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4697  CmiAssert(yPencil_local);
4698  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
4699  for ( int isend=0; isend<zBlocks; ++isend ) {
4700  int kb = send_order[isend];
4701  int nz1 = block3;
4702  if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
4703  int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
4704  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
4705  int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
4706  int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
4707  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4708  }
4709  }
4710 
4711  void setup_ungrid_persistent()
4712  {
4713  ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
4714  int limsg;
4715  for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
4716  int peer = grid_msgs[limsg]->sourceNode;
4717  //ungrid_handle[limsg] = CmiCreatePersistent(peer, 0);
4718  }
4719  imsg = limsg;
4720  }
4721 #endif
4722 };
4723 
4724 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
4725 public:
4726  PmeYPencil_SDAG_CODE
4727  PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
4728  PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
4729  void fft_init();
4730  void recv_trans(const PmeTransMsg *);
4731  void forward_fft();
4732  void forward_subset_fft(int fromIdx, int toIdx);
4733  void send_trans();
4734  void send_subset_trans(int fromIdx, int toIdx);
4735  void recv_untrans(const PmeUntransMsg *);
4736  void node_process_trans(PmeTransMsg *);
4737  void recvNodeAck(PmeAckMsg *);
4738  void node_process_untrans(PmeUntransMsg *);
4739  void backward_fft();
4740  void backward_subset_fft(int fromIdx, int toIdx);
4741  void send_untrans();
4742  void send_subset_untrans(int fromIdx, int toIdx);
4743 private:
4744 #ifdef NAMD_FFTW
4745 #ifdef NAMD_FFTW_3
4746  fftwf_plan forward_plan, backward_plan;
4747 #else
4748  fftw_plan forward_plan, backward_plan;
4749 #endif
4750 #endif
4751 
4752  int nx, nz;
4753 #if USE_PERSISTENT
4754  void setup_persistent() {
4755  int yBlocks = initdata.yBlocks;
4756  int block2 = initdata.grid.block2;
4757  int K2 = initdata.grid.K2;
4758  int hd = 1;
4759  CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
4760  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4761  for ( int isend=0; isend<yBlocks; ++isend ) {
4762  int jb = send_order[isend];
4763  int ny1 = block2;
4764  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4765  int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
4766  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4767  int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
4768  int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
4769  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4770  }
4771 
4772  CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
4773  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4774  for ( int isend=0; isend<yBlocks; ++isend ) {
4775  int jb = send_order[isend];
4776  int ny1 = block2;
4777  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4778  int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
4779  int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
4780  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4781  int compress_size = sizeof(float)*nx*ny1*nz*2;
4782  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4783  }
4784  }
4785 #endif
4786 };
4787 
4788 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
4789 public:
4790  PmeXPencil_SDAG_CODE
4791  PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
4792  PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
4793  ~PmeXPencil() {
4794  #ifdef NAMD_FFTW
4795  #ifdef NAMD_FFTW_3
4796  delete [] forward_plans;
4797  delete [] backward_plans;
4798  #endif
4799  #endif
4800  }
4801  void fft_init();
4802  void recv_trans(const PmeTransMsg *);
4803  void forward_fft();
4804  void pme_kspace();
4805  void backward_fft();
4806  void send_untrans();
4807  void send_subset_untrans(int fromIdx, int toIdx);
4808  void node_process_trans(PmeTransMsg *);
4809 #ifdef NAMD_FFTW
4810 #ifdef NAMD_FFTW_3
4811  fftwf_plan forward_plan, backward_plan;
4812 
4813  int numPlans;
4814  fftwf_plan *forward_plans, *backward_plans;
4815 #else
4816  fftw_plan forward_plan, backward_plan;
4817 #endif
4818 #endif
4819  int ny, nz;
4820  int recipEvirPe;
4821  void evir_init();
4822  PmeKSpace *myKSpace;
4823 #if USE_PERSISTENT
4824  void setup_persistent() {
4825  int xBlocks = initdata.xBlocks;
4826  int block1 = initdata.grid.block1;
4827  int K1 = initdata.grid.K1;
4828  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4829  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
4830  for ( int isend=0; isend<xBlocks; ++isend ) {
4831  int ib = send_order[isend];
4832  int nx1 = block1;
4833  if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
4834  int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
4835  int size = sizeof(PmeUntransMsg) +
4836  sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4837  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4838  int compress_size = sizeof(float)*nx1*ny*nz*2;
4839  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4840  }
4841  }
4842 #endif
4843 
4844 };
4845 
4846 void PmeXPencil::evir_init() {
4847  recipEvirPe = findRecipEvirPe();
4848  initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
4849 }
4850 
4852  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4853  Node *node = nd.ckLocalBranch();
4854  SimParameters *simParams = node->simParameters;
4855 
4856 #if USE_NODE_PAR_RECEIVE
4857  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
4858 #endif
4859 
4860  int K1 = initdata.grid.K1;
4861  int K2 = initdata.grid.K2;
4862  int K3 = initdata.grid.K3;
4863  int dim3 = initdata.grid.dim3;
4864  int block1 = initdata.grid.block1;
4865  int block2 = initdata.grid.block2;
4866 
4867  nx = block1;
4868  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4869  ny = block2;
4870  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
4871 
4872 #ifdef NAMD_FFTW
4873  CmiLock(ComputePmeMgr::fftw_plan_lock);
4874 
4875  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
4876  work = new float[dim3];
4877 
4878  order_init(initdata.zBlocks);
4879 
4880 #ifdef NAMD_FFTW_3
4881  /* need array of sizes for the how many */
4882 
4883  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4884  int sizeLines=nx*ny;
4885  int planLineSizes[1];
4886  planLineSizes[0]=K3;
4887  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
4888  int ndimHalf=ndim/2;
4889  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
4890  (float *) data, NULL, 1,
4891  ndim,
4892  (fftwf_complex *) data, NULL, 1,
4893  ndimHalf,
4894  fftwFlags);
4895 
4896  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
4897  (fftwf_complex *) data, NULL, 1,
4898  ndimHalf,
4899  (float *) data, NULL, 1,
4900  ndim,
4901  fftwFlags);
4902 #if CMK_SMP && USE_CKLOOP
4903  if(simParams->useCkLoop) {
4904  //How many FFT plans should be created? This is a grain-size tradeoff.
4905  //Currently we choose min(nx, ny) as the coarse grain.
4906  numPlans = (nx<=ny?nx:ny);
4907  if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
4908  if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
4909  int howmany = sizeLines/numPlans;
4910  forward_plans = new fftwf_plan[numPlans];
4911  backward_plans = new fftwf_plan[numPlans];
4912  for(int i=0; i<numPlans; i++) {
4913  int dimStride = i*ndim*howmany;
4914  int dimHalfStride = i*ndimHalf*howmany;
4915  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
4916  ((float *)data)+dimStride, NULL, 1,
4917  ndim,
4918  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4919  ndimHalf,
4920  fftwFlags);
4921 
4922  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
4923  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4924  ndimHalf,
4925  ((float *)data)+dimStride, NULL, 1,
4926  ndim,
4927  fftwFlags);
4928  }
4929  }else
4930 #endif
4931  {
4932  forward_plans = NULL;
4933  backward_plans = NULL;
4934  }
4935 #else
4936  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
4937  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4938  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4939  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
4940  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4941  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4942 #endif
4943  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4944 #else
4945  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4946 #endif
4947 
4948 #if USE_NODE_PAR_RECEIVE
4949  evir = 0.;
4950  memset(data, 0, sizeof(float) * nx*ny*dim3);
4951 #endif
4952 }
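// [Example] The plan built in fft_init() above batches nx*ny real
// z-lines of length K3, stored with stride dim3 = 2*(K3/2+1) so the
// transform runs in place. A freestanding sketch of the same plan shape
// (plan_z_lines is a hypothetical helper):

#include <fftw3.h>

fftwf_plan plan_z_lines(float *data, int K3, int howmany) {
  int n[1] = { K3 };
  int dist = 2 * (K3/2 + 1);  // padded in-place line length, like dim3
  return fftwf_plan_many_dft_r2c(
      1, n, howmany,
      data,                 NULL, 1, dist,     // real input, unit stride
      (fftwf_complex*)data, NULL, 1, dist/2,   // complex output, same storage
      FFTW_ESTIMATE);
}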
4953 
4954 void PmeYPencil::fft_init() {
4955  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4956  Node *node = nd.ckLocalBranch();
4957  SimParameters *simParams = node->simParameters;
4958 
4959 #if USE_NODE_PAR_RECEIVE
4960  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
4961 #endif
4962 
4963  int K1 = initdata.grid.K1;
4964  int K2 = initdata.grid.K2;
4965  int dim2 = initdata.grid.dim2;
4966  int dim3 = initdata.grid.dim3;
4967  int block1 = initdata.grid.block1;
4968  int block3 = initdata.grid.block3;
4969 
4970  nx = block1;
4971  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4972  nz = block3;
4973  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
4974 
4975 #ifdef NAMD_FFTW
4976  CmiLock(ComputePmeMgr::fftw_plan_lock);
4977 
4978  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
4979  work = new float[2*K2];
4980 
4981  order_init(initdata.yBlocks);
4982 
4983 #ifdef NAMD_FFTW_3
4984  /* need array of sizes for the dimensions */
4985  /* ideally this should be implementable as a single multidimensional
4986  * plan, but that has proven tricky to implement, so we maintain the
4987  * loop of 1d plan executions. */
4988  int sizeLines=nz;
4989  int planLineSizes[1];
4990  planLineSizes[0]=K2;
4991  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4992  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4993  (fftwf_complex *) data, NULL, sizeLines, 1,
4994  (fftwf_complex *) data, NULL, sizeLines, 1,
4995  FFTW_FORWARD,
4996  fftwFlags);
4997  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4998  (fftwf_complex *) data, NULL, sizeLines, 1,
4999  (fftwf_complex *) data, NULL, sizeLines, 1,
5000  FFTW_BACKWARD,
5001  fftwFlags);
5002  CkAssert(forward_plan != NULL);
5003  CkAssert(backward_plan != NULL);
5004 #else
5005  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
5006  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5007  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5008  nz, (fftw_complex *) work, 1);
5009  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
5010  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5011  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5012  nz, (fftw_complex *) work, 1);
5013 #endif
5014  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
5015 #else
5016  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
5017 #endif
5018 
5019 #if USE_NODE_PAR_RECEIVE
5020  evir = 0;
5021  CmiMemoryWriteFence();
5022 #endif
5023 }
5024 
5025 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
5026 {
5027  if ( msg->hasData ) hasData = 1;
5028  needs_reply[msg->sourceNode] = msg->hasData;
5029  recv_trans(msg);
5030  int limsg;
5031  CmiMemoryAtomicFetchAndInc(imsg,limsg);
5032  if(limsg+1 == initdata.yBlocks)
5033  {
5034  if ( hasData ) {
5035  forward_fft();
5036  }
5037  send_trans();
5038  imsg=0;
5039  CmiMemoryWriteFence();
5040  }
5041 }
5042 
5043 void PmeYPencil::recvNodeAck(PmeAckMsg *msg) {
5044  delete msg;
5045  node_process_untrans(NULL);
5046 }
5047 
5048 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
5049 {
5050  if ( msg ) {
5051  if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
5052  recv_untrans(msg);
5053  } else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
5054  int limsg;
5055  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
5056  if(limsg+1 == initdata.yBlocks)
5057  {
5058  if ( hasData ) {
5059  backward_fft();
5060  }
5061  hasData=0;
5062  imsgb=0;
5063  CmiMemoryWriteFence();
5064  send_untrans();
5065  }
5066 }
5067 
5068 #define DEBUG_NODE_PAR_RECV 0
5069 
5070 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
5071  // CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5072  PmeXPencil *target=xPencilObj.get(msg->destElem);
5073 #if DEBUG_NODE_PAR_RECV
5074  if(target == NULL)
5075  CkAbort("xpencil in recvXTrans not found, debug registration");
5076 #endif
5077  target->node_process_trans(msg);
5078  delete msg;
5079 }
5080 
5081 
5082 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
5083  // CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5084  PmeYPencil *target=yPencilObj.get(msg->destElem);
5085 #if DEBUG_NODE_PAR_RECV
5086  if(target == NULL)
5087  CkAbort("ypencil in recvYTrans not found, debug registration");
5088 #endif
5089  target->node_process_trans(msg);
5090  delete msg;
5091  }
5092 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
5093  // CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5094  PmeYPencil *target=yPencilObj.get(msg->destElem);
5095 #if DEBUG_NODE_PAR_RECV
5096  if(target == NULL)
5097  CkAbort("ypencil in recvYUntrans not found, debug registration");
5098 #endif
5099  target->node_process_untrans(msg);
5100  delete msg;
5101  }
5102 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
5103  //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5104  PmeZPencil *target=zPencilObj.get(msg->destElem);
5105 #if DEBUG_NODE_PAR_RECV
5106  if(target == NULL)
5107  CkAbort("zpencil in recvZUntrans not found, debug registration");
5108 #endif
5109  target->node_process_untrans(msg);
5110  delete msg;
5111 }
5112 
5113 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
5114  //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5115  PmeZPencil *target=zPencilObj.get(msg->destElem);
5116 #if DEBUG_NODE_PAR_RECV
5117  if(target == NULL){
5118  CkAbort("zpencil in recvZGrid not found, debug registration");
5119  }
5120 #endif
5121  target->node_process_grid(msg); //msg is stored inside node_process_grid
5122 }
5123 
5124 void PmeXPencil::fft_init() {
5125  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
5126  Node *node = nd.ckLocalBranch();
5127  SimParameters *simParams = node->simParameters;
5128 #if USE_NODE_PAR_RECEIVE
5129  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
5130 #endif
5131 
5132  int K1 = initdata.grid.K1;
5133  int K2 = initdata.grid.K2;
5134  int dim3 = initdata.grid.dim3;
5135  int block2 = initdata.grid.block2;
5136  int block3 = initdata.grid.block3;
5137 
5138  ny = block2;
5139  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
5140  nz = block3;
5141  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
5142 
5143 #ifdef NAMD_FFTW
5144  CmiLock(ComputePmeMgr::fftw_plan_lock);
5145 
5146  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
5147  work = new float[2*K1];
5148 
5149  order_init(initdata.xBlocks);
5150 
5151 #ifdef NAMD_FFTW_3
5152  /* need array of sizes for the how many */
5153  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
5154  int sizeLines=ny*nz;
5155  int planLineSizes[1];
5156  planLineSizes[0]=K1;
5157  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5158  (fftwf_complex *) data, NULL, sizeLines, 1,
5159  (fftwf_complex *) data, NULL, sizeLines, 1,
5160  FFTW_FORWARD,
5161  fftwFlags);
5162  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5163  (fftwf_complex *) data, NULL, sizeLines, 1,
5164  (fftwf_complex *) data, NULL, sizeLines, 1,
5165  FFTW_BACKWARD,
5166  fftwFlags);
5167 
5168 #if CMK_SMP && USE_CKLOOP
5169  if(simParams->useCkLoop) {
5170  //How many FFT plans should be created? This is a grain-size tradeoff.
5171  //Currently we choose min(ny, nz) as the coarse grain.
5172  numPlans = (ny<=nz?ny:nz);
5173  // limit attempted parallelism due to false sharing
5174  //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
5175  //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
5176  if ( sizeLines/numPlans < 4 ) numPlans = 1;
5177  int howmany = sizeLines/numPlans;
5178  forward_plans = new fftwf_plan[numPlans];
5179  backward_plans = new fftwf_plan[numPlans];
5180  for(int i=0; i<numPlans; i++) {
5181  int curStride = i*howmany;
5182  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5183  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5184  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5185  FFTW_FORWARD,
5186  fftwFlags);
5187 
5188  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5189  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5190  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5191  FFTW_BACKWARD,
5192  fftwFlags);
5193  }
5194  }else
5195 #endif
5196  {
5197  forward_plans = NULL;
5198  backward_plans = NULL;
5199  }
5200 #else
5201  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
5202  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5203  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5204  ny*nz, (fftw_complex *) work, 1);
5205  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
5206  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5207  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5208  ny*nz, (fftw_complex *) work, 1);
5209 #endif
5210  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
5211 #else
5212  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
5213 #endif
5214 
5215  myKSpace = new PmeKSpace(initdata.grid,
5216  thisIndex.y*block2, thisIndex.y*block2 + ny,
5217  thisIndex.z*block3, thisIndex.z*block3 + nz);
5218 
5219 }
5220 
5221 // #define FFTCHECK // run a grid of integers through the fft
5222 // #define ZEROCHECK // check for suspicious zeros in fft
5223 
5224 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
5225 
5226  int dim3 = initdata.grid.dim3;
5227  if ( imsg == 0 ) {
5228  lattice = msg->lattice;
5229  sequence = msg->sequence;
5230 #if ! USE_NODE_PAR_RECEIVE
5231  memset(data, 0, sizeof(float)*nx*ny*dim3);
5232 #endif
5233  }
5234 
5235  if ( ! msg->hasData ) return;
5236 
5237  int zlistlen = msg->zlistlen;
5238 #ifdef NAMD_KNL
5239  int * __restrict msg_zlist = msg->zlist;
5240  int * __restrict zlist = (int*)__builtin_assume_aligned(work_zlist.begin(),
5241  64);
5242  for ( int k=0; k<zlistlen; ++k ) {
5243  zlist[k] = msg_zlist[k];
5244  }
5245 #else
5246  int * __restrict zlist = msg->zlist;
5247 #endif
5248  char * __restrict fmsg = msg->fgrid;
5249  float * __restrict qmsg = msg->qgrid;
5250  float * __restrict d = data;
5251  int numGrids = 1; // pencil FFT doesn't support multiple grids
5252  for ( int g=0; g<numGrids; ++g ) {
5253  for ( int i=0; i<nx; ++i ) {
5254  for ( int j=0; j<ny; ++j, d += dim3 ) {
5255  if( *(fmsg++) ) {
5256  #pragma ivdep
5257  for ( int k=0; k<zlistlen; ++k ) {
5258  d[zlist[k]] += *(qmsg++);
5259  }
5260  }
5261  }
5262  }
5263  }
5264 }
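// [Example] recv_grid() above scatter-adds each flagged column's packed
// values into the local padded grid through the shared zlist;
// accumulation (+=) matters because many patches contribute to the same
// pencil. The inner step, isolated (scatter_add_column is a hypothetical
// helper):

void scatter_add_column(float *zline, const float *packed,
                        const int *zlist, int zlistlen) {
  for ( int k = 0; k < zlistlen; ++k )
    zline[zlist[k]] += packed[k];
}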
5265 
5266 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
5267 #ifdef NAMD_FFTW
5268 #ifdef NAMD_FFTW_3
5269  fftwf_plan *plans = (fftwf_plan *)param;
5270  for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
5271 #endif
5272 #endif
5273 }
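// [Example] PmeXZPencilFFT() above is the CkLoop worker shape used
// throughout this file: a serial loop over plans becomes a helper taking
// (first,last,result,paraNum,param) that CkLoop_Parallelize splits over
// the node's PEs, as in the forward_fft() call sites. A sketch of the
// conversion (DoubleItems is a hypothetical worker; stand-in work only):

static void DoubleItems(int first, int last, void *result,
                        int paraNum, void *param) {
  float *items = (float *)param;
  for ( int i = first; i <= last; i++ ) items[i] *= 2.0f;
}
// serial form:   for ( int i = 0; i < N; ++i ) items[i] *= 2.0f;
// parallel form: CkLoop_Parallelize(DoubleItems, 1, items, CkMyNodeSize(), 0, N-1);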
5274 
5275 void PmeZPencil::forward_fft() {
5276  evir = 0.;
5277 #ifdef FFTCHECK
5278  int dim3 = initdata.grid.dim3;
5279  int K3 = initdata.grid.K3;
5280  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
5281  float *d = data;
5282  for ( int i=0; i<nx; ++i ) {
5283  for ( int j=0; j<ny; ++j, d += dim3 ) {
5284  for ( int k=0; k<dim3; ++k ) {
5285  d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
5286  }
5287  }
5288  }
5289 #endif
5290 #ifdef NAMD_FFTW
5291 #ifdef MANUAL_DEBUG_FFTW3
5292  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5293 #endif
5294 #ifdef NAMD_FFTW_3
5295 #if CMK_SMP && USE_CKLOOP
5296  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5297  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5298  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5299  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5300  //transform the above loop
5301  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5302  return;
5303  }
5304 #endif
5305  fftwf_execute(forward_plan);
5306 #else
5307  rfftwnd_real_to_complex(forward_plan, nx*ny,
5308  data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
5309 #endif
5310 #ifdef MANUAL_DEBUG_FFTW3
5311  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5312 #endif
5313 
5314 #endif
5315 #ifdef ZEROCHECK
5316  int dim3 = initdata.grid.dim3;
5317  int K3 = initdata.grid.K3;
5318  float *d = data;
5319  for ( int i=0; i<nx; ++i ) {
5320  for ( int j=0; j<ny; ++j, d += dim3 ) {
5321  for ( int k=0; k<dim3; ++k ) {
5322  if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
5323  thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
5324  }
5325  }
5326  }
5327 #endif
5328 }
5329 
5330 /* A single task for partitioned PmeZPencil::send_trans work */
5331 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5332  PmeZPencil *zpencil = (PmeZPencil *)param;
5333  zpencil->send_subset_trans(first, last);
5334 }
5335 
5336 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
5337  int zBlocks = initdata.zBlocks;
5338  int block3 = initdata.grid.block3;
5339  int dim3 = initdata.grid.dim3;
5340  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5341  int kb = send_order[isend];
5342  int nz = block3;
5343  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5344  int hd = ( hasData ? 1 : 0 );
5345  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5346  msg->lattice = lattice;
5347  msg->sourceNode = thisIndex.y;
5348  msg->hasData = hasData;
5349  msg->nx = ny;
5350  if ( hasData ) {
5351  float *md = msg->qgrid;
5352  const float *d = data;
5353  for ( int i=0; i<nx; ++i ) {
5354  for ( int j=0; j<ny; ++j, d += dim3 ) {
5355  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5356  *(md++) = d[2*k];
5357  *(md++) = d[2*k+1];
5358  }
5359  }
5360  }
5361  }
5362  msg->sequence = sequence;
5363  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5364 
5365  CmiEnableUrgentSend(1);
5366 #if USE_NODE_PAR_RECEIVE
5367  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5368 #if Y_PERSIST
5369  CmiUsePersistentHandle(&trans_handle[isend], 1);
5370 #endif
5371  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5372 #if Y_PERSIST
5373  CmiUsePersistentHandle(NULL, 0);
5374 #endif
5375 #else
5376 #if Y_PERSIST
5377  CmiUsePersistentHandle(&trans_handle[isend], 1);
5378 #endif
5379  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5380 #if Y_PERSIST
5381  CmiUsePersistentHandle(NULL, 0);
5382 #endif
5383 #endif
5384  CmiEnableUrgentSend(0);
5385  }
5386 }
5387 
5388 void PmeZPencil::send_trans() {
5389 #if USE_PERSISTENT
5390  if (trans_handle == NULL) setup_persistent();
5391 #endif
5392 #if CMK_SMP && USE_CKLOOP
5393  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5394  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5395  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5402  //send_subset_trans(0, initdata.zBlocks-1);
5403  CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
5404  return;
5405  }
5406 #endif
5407  int zBlocks = initdata.zBlocks;
5408  int block3 = initdata.grid.block3;
5409  int dim3 = initdata.grid.dim3;
5410  for ( int isend=0; isend<zBlocks; ++isend ) {
5411  int kb = send_order[isend];
5412  int nz = block3;
5413  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5414  int hd = ( hasData ? 1 : 0 );
5415  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5416  msg->lattice = lattice;
5417  msg->sourceNode = thisIndex.y;
5418  msg->hasData = hasData;
5419  msg->nx = ny;
5420  if ( hasData ) {
5421  float *md = msg->qgrid;
5422  const float *d = data;
5423  for ( int i=0; i<nx; ++i ) {
5424  for ( int j=0; j<ny; ++j, d += dim3 ) {
5425  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5426  *(md++) = d[2*k];
5427  *(md++) = d[2*k+1];
5428  }
5429  }
5430  }
5431  }
5432  msg->sequence = sequence;
5433  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5434 
5435  CmiEnableUrgentSend(1);
5436 #if USE_NODE_PAR_RECEIVE
5437  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5438 #if Y_PERSIST
5439  CmiUsePersistentHandle(&trans_handle[isend], 1);
5440 #endif
5441  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5442 #if Y_PERSIST
5443  CmiUsePersistentHandle(NULL, 0);
5444 #endif
5445 #else
5446 #if Y_PERSIST
5447  CmiUsePersistentHandle(&trans_handle[isend], 1);
5448 #endif
5449  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5450 #if Y_PERSIST
5451  CmiUsePersistentHandle(NULL, 0);
5452 #endif
5453 #endif
5454  CmiEnableUrgentSend(0);
5455  }
5456 }
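
// Illustrative sketch (not from the NAMD sources): CkLoop_Parallelize above
// hands PmeZPencilSendTrans contiguous [first,last] sub-ranges of
// 0..zBlocks-1, one per worker. A plain C++ rendering of one balanced
// split; chunk_range is an editorial name, not a Charm++ API.
#include <utility>
static std::pair<int,int> chunk_range(int chunk, int numChunks, int n) {
  int base = n / numChunks, rem = n % numChunks;          // rem chunks get one extra item
  int first = chunk * base + (chunk < rem ? chunk : rem);
  int last  = first + base + (chunk < rem ? 1 : 0) - 1;
  return std::make_pair(first, last);                     // inclusive bounds, as send_subset_trans expects
}
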
5457 
5458 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
5459  if ( imsg == 0 ) {
5460  lattice = msg->lattice;
5461  sequence = msg->sequence;
5462  }
5463  int block2 = initdata.grid.block2;
5464  int K2 = initdata.grid.K2;
5465  int jb = msg->sourceNode;
5466  int ny = msg->nx;
5467  if ( msg->hasData ) {
5468  const float *md = msg->qgrid;
5469  float *d = data;
5470  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5471  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5472  for ( int k=0; k<nz; ++k ) {
5473 #ifdef ZEROCHECK
5474  if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
5475  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5476 #endif
5477  d[2*(j*nz+k)] = *(md++);
5478  d[2*(j*nz+k)+1] = *(md++);
5479  }
5480  }
5481  }
5482  } else {
5483  float *d = data;
5484  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5485  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5486  for ( int k=0; k<nz; ++k ) {
5487  d[2*(j*nz+k)] = 0;
5488  d[2*(j*nz+k)+1] = 0;
5489  }
5490  }
5491  }
5492  }
5493 }
5494 
5495 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
5496  PmeYPencil *ypencil = (PmeYPencil *)param;
5497  ypencil->forward_subset_fft(first, last);
5498 }
5499 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
5500 #ifdef NAMD_FFTW
5501 #ifdef NAMD_FFTW_3
5502  for(int i=fromIdx; i<=toIdx; i++){
5503  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5504  * nz * initdata.grid.K2,
5505  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5506  }
5507 #endif
5508 #endif
5509 }
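
// Illustrative sketch (not from the NAMD sources): forward_subset_fft above
// reuses one precomputed plan for successive x-slabs through FFTW3's
// new-array execute interface; per the FFTW manual, the arrays must match
// the layout and alignment used at planning time. fft_slabs is an
// editorial name.
#ifdef NAMD_FFTW_3
static void fft_slabs(fftwf_plan plan, fftwf_complex *base,
                      int slabStride, int fromIdx, int toIdx) {
  for ( int i=fromIdx; i<=toIdx; ++i )
    fftwf_execute_dft(plan, base + i*slabStride, base + i*slabStride);  // in place
}
#endif
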
5510 
5511 void PmeYPencil::forward_fft() {
5512  evir = 0.;
5513 #ifdef NAMD_FFTW
5514 #ifdef MANUAL_DEBUG_FFTW3
5515  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5516 #endif
5517 
5518 #ifdef NAMD_FFTW_3
5519 #if CMK_SMP && USE_CKLOOP
5520  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5521  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5522  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5523  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
5524  return;
5525  }
5526 #endif
5527  //the CkLoop branch above parallelizes the following serial loop
5528  for ( int i=0; i<nx; ++i ) {
5529  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5530  * nz * initdata.grid.K2,
5531  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5532  }
5533 #else
5534  for ( int i=0; i<nx; ++i ) {
5535  fftw(forward_plan, nz,
5536  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
5537  nz, 1, (fftw_complex *) work, 1, 0);
5538  }
5539 #endif
5540 #ifdef MANUAL_DEBUG_FFTW3
5541  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5542 #endif
5543 
5544 #endif
5545 }
5546 
5547 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5548  PmeYPencil *ypencil = (PmeYPencil *)param;
5549  ypencil->send_subset_trans(first, last);
5550 }
5551 
5552 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
5553  int yBlocks = initdata.yBlocks;
5554  int block2 = initdata.grid.block2;
5555  int K2 = initdata.grid.K2;
5556  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5557  int jb = send_order[isend];
5558  int ny = block2;
5559  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5560  int hd = ( hasData ? 1 : 0 );
5561  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5562  msg->lattice = lattice;
5563  msg->sourceNode = thisIndex.x;
5564  msg->hasData = hasData;
5565  msg->nx = nx;
5566  if ( hasData ) {
5567  float *md = msg->qgrid;
5568  const float *d = data;
5569  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5570  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5571  for ( int k=0; k<nz; ++k ) {
5572  *(md++) = d[2*(j*nz+k)];
5573  *(md++) = d[2*(j*nz+k)+1];
5574  #ifdef ZEROCHECK
5575  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5576  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5577  #endif
5578  }
5579  }
5580  }
5581  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5582  thisIndex.x, jb, thisIndex.z);
5583  }
5584  msg->sequence = sequence;
5585  SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
5586  CmiEnableUrgentSend(1);
5587 #if USE_NODE_PAR_RECEIVE
5588  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5589 #if X_PERSIST
5590  CmiUsePersistentHandle(&trans_handle[isend], 1);
5591 #endif
5592  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5593 #if X_PERSIST
5594  CmiUsePersistentHandle(NULL, 0);
5595 #endif
5596 #else
5597 #if X_PERSIST
5598  CmiUsePersistentHandle(&trans_handle[isend], 1);
5599 #endif
5600  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5601 #if X_PERSIST
5602  CmiUsePersistentHandle(NULL, 0);
5603 #endif
5604 #endif
5605  CmiEnableUrgentSend(0);
5606  }
5607 }
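
// Illustrative sketch (not from the NAMD sources): the varsize message
// allocations above size the qgrid payload as hd*nx*ny*nz*2 floats, so a
// pencil with hasData==0 allocates a zero-length payload and ships only
// the header. qgrid_floats is an editorial name.
static int qgrid_floats(int hasData, int nx, int ny, int nz) {
  int hd = ( hasData ? 1 : 0 );
  return hd * nx * ny * nz * 2;   // two floats (re,im) per complex grid point
}
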
5608 
5609 void PmeYPencil::send_trans() {
5610 #if USE_PERSISTENT
5611  if (trans_handle == NULL) setup_persistent();
5612 #endif
5613 #if CMK_SMP && USE_CKLOOP
5614  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5615  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5616  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5623  //send_subset_trans(0, initdata.yBlocks-1);
5624  CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
5625  return;
5626  }
5627 #endif
5628  int yBlocks = initdata.yBlocks;
5629  int block2 = initdata.grid.block2;
5630  int K2 = initdata.grid.K2;
5631  for ( int isend=0; isend<yBlocks; ++isend ) {
5632  int jb = send_order[isend];
5633  int ny = block2;
5634  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5635  int hd = ( hasData ? 1 : 0 );
5636  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5637  msg->lattice = lattice;
5638  msg->sourceNode = thisIndex.x;
5639  msg->hasData = hasData;
5640  msg->nx = nx;
5641  if ( hasData ) {
5642  float *md = msg->qgrid;
5643  const float *d = data;
5644  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5645  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5646  for ( int k=0; k<nz; ++k ) {
5647  *(md++) = d[2*(j*nz+k)];
5648  *(md++) = d[2*(j*nz+k)+1];
5649 #ifdef ZEROCHECK
5650  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5651  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5652 #endif
5653  }
5654  }
5655  }
5656  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5657  thisIndex.x, jb, thisIndex.z);
5658  }
5659  msg->sequence = sequence;
5660  SET_PRIORITY(msg,sequence,PME_TRANS2_PRIORITY)
5661  CmiEnableUrgentSend(1);
5662 #if USE_NODE_PAR_RECEIVE
5663  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5664 #if X_PERSIST
5665  CmiUsePersistentHandle(&trans_handle[isend], 1);
5666 #endif
5667  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5668 #if X_PERSIST
5669  CmiUsePersistentHandle(NULL, 0);
5670 #endif
5671 #else
5672 #if X_PERSIST
5673  CmiUsePersistentHandle(&trans_handle[isend], 1);
5674 #endif
5675  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5676 #if X_PERSIST
5677  CmiUsePersistentHandle(NULL, 0);
5678 #endif
5679 
5680 #endif
5681  CmiEnableUrgentSend(0);
5682  }
5683 }
5684 
5685 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
5686 {
5687  if(msg->hasData) hasData=1;
5688  needs_reply[msg->sourceNode] = msg->hasData;
5689  recv_trans(msg);
5690  int limsg;
5691  CmiMemoryAtomicFetchAndInc(imsg,limsg);
5692  if(limsg+1 == initdata.xBlocks)
5693  {
5694  if(hasData){
5695  forward_fft();
5696  pme_kspace();
5697  backward_fft();
5698  }
5699  send_untrans();
5700  imsg=0;
5701  CmiMemoryWriteFence();
5702  }
5703 }
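
// Illustrative sketch (not from the NAMD sources): the gate above works
// because fetch-and-increment returns the pre-increment count, so exactly
// one arrival sees limsg+1 == expected and runs the FFT/k-space phase.
// Rendered with std::atomic; the source uses CmiMemoryAtomicFetchAndInc
// plus explicit memory fences.
#include <atomic>
static bool last_arrival(std::atomic<int> &counter, int expected) {
  int prev = counter.fetch_add(1);                        // value before increment
  if ( prev + 1 == expected ) { counter.store(0); return true; }
  return false;
}
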
5704 
5705 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
5706  if ( imsg == 0 ) {
5707  lattice = msg->lattice;
5708  sequence = msg->sequence;
5709  }
5710  int block1 = initdata.grid.block1;
5711  int K1 = initdata.grid.K1;
5712  int ib = msg->sourceNode;
5713  int nx = msg->nx;
5714  if ( msg->hasData ) {
5715  const float *md = msg->qgrid;
5716  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5717  float *d = data + i*ny*nz*2;
5718  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5719  for ( int k=0; k<nz; ++k ) {
5720 #ifdef ZEROCHECK
5721  if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
5722  ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
5723 #endif
5724  d[2*k] = *(md++);
5725  d[2*k+1] = *(md++);
5726  }
5727  }
5728  }
5729  } else {
5730  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5731  float *d = data + i*ny*nz*2;
5732  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5733  for ( int k=0; k<nz; ++k ) {
5734  d[2*k] = 0;
5735  d[2*k+1] = 0;
5736  }
5737  }
5738  }
5739  }
5740 }
5741 
5742 void PmeXPencil::forward_fft() {
5743 #ifdef NAMD_FFTW
5744 
5745 #ifdef MANUAL_DEBUG_FFTW3
5746  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5747 #endif
5748 
5749 #ifdef NAMD_FFTW_3
5750 #if CMK_SMP && USE_CKLOOP
5751  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5752  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5753  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5754  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5755  //the CkLoop call below parallelizes the serial loop above
5756  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5757  return;
5758  }
5759 #endif
5760  fftwf_execute(forward_plan);
5761 #else
5762  fftw(forward_plan, ny*nz,
5763  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5764 #endif
5765 #ifdef MANUAL_DEBUG_FFTW3
5766  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5767 #endif
5768 
5769 #endif
5770 }
5771 
5772 void PmeXPencil::pme_kspace() {
5773 
5774  evir = 0.;
5775 
5776 #ifdef FFTCHECK
5777  return;
5778 #endif
5779 
5780  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
5781 
5782  int useCkLoop = 0;
5783 #if CMK_SMP && USE_CKLOOP
5784  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
5785  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
5786  useCkLoop = 1;
5787  }
5788 #endif
5789 
5790  // XXX will need to extend pencil decomposition to support LJ-PME
5791  int numGrids = 1;
5792  for ( int g=0; g<numGrids; ++g ) {
5793  evir[0] = myKSpace->compute_energy(data+0*g,
5794  lattice, ewaldcof, &(evir[1]), useCkLoop);
5795  }
5796 
5797 #if USE_NODE_PAR_RECEIVE
5798  CmiMemoryWriteFence();
5799 #endif
5800 }
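
// Illustrative sketch (not from the NAMD sources): compute_energy above
// returns the reciprocal-space energy and fills six virial components
// through its virial argument, so the 7-element evir reduction holds
// [energy, virial x 6]. pack_evir is an editorial name.
static void pack_evir(double evir[7], double energy, const double virial[6]) {
  evir[0] = energy;                                       // reciprocal-space energy
  for ( int c=0; c<6; ++c ) evir[1+c] = virial[c];        // symmetric virial tensor
}
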
5801 
5802 void PmeXPencil::backward_fft() {
5803 #ifdef NAMD_FFTW
5804 #ifdef MANUAL_DEBUG_FFTW3
5805  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5806 #endif
5807 
5808 #ifdef NAMD_FFTW_3
5809 #if CMK_SMP && USE_CKLOOP
5810  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5811  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
5812  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5813  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
5814  //the CkLoop call below parallelizes the serial loop above
5815  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5816  return;
5817  }
5818 #endif
5819  fftwf_execute(backward_plan);
5820 #else
5821  fftw(backward_plan, ny*nz,
5822  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5823 #endif
5824 #ifdef MANUAL_DEBUG_FFTW3
5825  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5826 #endif
5827 #endif
5828 }
5829 
5830 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
5831  PmeXPencil *xpencil = (PmeXPencil *)param;
5832  xpencil->send_subset_untrans(first, last);
5833 }
5834 
5835 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx){
5836  int xBlocks = initdata.xBlocks;
5837  int block1 = initdata.grid.block1;
5838  int K1 = initdata.grid.K1;
5839 
5840  for(int isend=fromIdx; isend<=toIdx; isend++) {
5841  int ib = send_order[isend];
5842  if ( ! needs_reply[ib] ) {
5843  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5844  CmiEnableUrgentSend(1);
5845  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5846 #if USE_NODE_PAR_RECEIVE
5847  initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
5848 #else
5849  initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
5850 #endif
5851  CmiEnableUrgentSend(0);
5852  continue;
5853  }
5854  int nx = block1;
5855  if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5856  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
5857  msg->sourceNode = thisIndex.y;
5858  msg->ny = ny;
5859  float *md = msg->qgrid;
5860  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5861  float *d = data + i*ny*nz*2;
5862  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5863  for ( int k=0; k<nz; ++k ) {
5864  *(md++) = d[2*k];
5865  *(md++) = d[2*k+1];
5866  }
5867  }
5868  }
5869  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5870  CmiEnableUrgentSend(1);
5871 #if USE_NODE_PAR_RECEIVE
5872  msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5873 #if Y_PERSIST
5874  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5875 #endif
5876  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
5877 #if Y_PERSIST
5878  CmiUsePersistentHandle(NULL, 0);
5879 #endif
5880 #else
5881 #if Y_PERSIST
5882  // CmiUsePersistentHandle(&untrans_handle[isend], 1);
5883 #endif
5884  initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
5885 #if Y_PERSIST
5886  // CmiUsePersistentHandle(NULL, 0);
5887 #endif
5888 #endif
5889  CmiEnableUrgentSend(0);
5890  }
5891 }
5892 
5893 void PmeXPencil::send_untrans() {
5894 
5895  { // send energy and virial
5896  int numGrids = 1;
5897  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
5898  newmsg->evir[0] = evir;
5899  SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
5900  CmiEnableUrgentSend(1);
5901  initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
5902  CmiEnableUrgentSend(0);
5903  }
5904 
5905 #if USE_PERSISTENT
5906  if (untrans_handle == NULL) setup_persistent();
5907 #endif
5908 #if CMK_SMP && USE_CKLOOP
5909  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5910  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
5911  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5912  int xBlocks = initdata.xBlocks;
5913 
5914 #if USE_NODE_PAR_RECEIVE
5915  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
5916  CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
5917 #else
5918  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
5919  CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 0); //not sync
5920 #endif
5921  return;
5922  }
5923 #endif
5924  int xBlocks = initdata.xBlocks;
5925  int block1 = initdata.grid.block1;
5926  int K1 = initdata.grid.K1;
5927  for ( int isend=0; isend<xBlocks; ++isend ) {
5928  int ib = send_order[isend];
5929  if ( ! needs_reply[ib] ) {
5930  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5931  CmiEnableUrgentSend(1);
5932  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5933 #if USE_NODE_PAR_RECEIVE
5934  initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
5935 #else
5936  initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
5937 #endif
5938  CmiEnableUrgentSend(0);
5939  continue;
5940  }
5941  int nx = block1;
5942  if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5943  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
5944  msg->sourceNode = thisIndex.y;
5945  msg->ny = ny;
5946  float *md = msg->qgrid;
5947  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5948  float *d = data + i*ny*nz*2;
5949  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5950  for ( int k=0; k<nz; ++k ) {
5951  *(md++) = d[2*k];
5952  *(md++) = d[2*k+1];
5953  }
5954  }
5955  }
5956  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5957 
5958  CmiEnableUrgentSend(1);
5959 #if USE_NODE_PAR_RECEIVE
5960  msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5961 #if Y_PERSIST
5962  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5963 #endif
5964  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
5965 #if Y_PERSIST
5966  CmiUsePersistentHandle(NULL, 0);
5967 #endif
5968 #else
5969 #if Y_PERSIST
5970  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5971 #endif
5972  initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
5973 #if Y_PERSIST
5974  CmiUsePersistentHandle(NULL, 0);
5975 #endif
5976 #endif
5977  CmiEnableUrgentSend(0);
5978  }
5979 }
5980 
5981 void PmeYPencil::recv_untrans(const PmeUntransMsg *msg) {
5982  int block2 = initdata.grid.block2;
5983  int K2 = initdata.grid.K2;
5984  int jb = msg->sourceNode;
5985  int ny = msg->ny;
5986  const float *md = msg->qgrid;
5987  float *d = data;
5988  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5989 #if CMK_BLUEGENEL
5990  CmiNetworkProgress();
5991 #endif
5992  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5993  for ( int k=0; k<nz; ++k ) {
5994 #ifdef ZEROCHECK
5995  if ( (*md) == 0. ) CkPrintf("0 in XY at %d %d %d %d %d %d %d %d %d\n",
5996  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5997 #endif
5998  d[2*(j*nz+k)] = *(md++);
5999  d[2*(j*nz+k)+1] = *(md++);
6000  }
6001  }
6002  }
6003 }
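
// Illustrative sketch (not from the NAMD sources): the copy loops above
// address an ny-by-nz slab of interleaved complex floats, so element (j,k)
// occupies offsets 2*(j*nz+k) and 2*(j*nz+k)+1. store_complex is an
// editorial name.
static void store_complex(float *slab, int nz, int j, int k, float re, float im) {
  slab[2*(j*nz+k)]   = re;   // real part
  slab[2*(j*nz+k)+1] = im;   // imaginary part
}
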
6004 
6005 static inline void PmeYPencilBackwardFFT(int first, int last, void *result, int paraNum, void *param){
6006  PmeYPencil *ypencil = (PmeYPencil *)param;
6007  ypencil->backward_subset_fft(first, last);
6008 }
6009 
6010 void PmeYPencil::backward_subset_fft(int fromIdx, int toIdx) {
6011 #ifdef NAMD_FFTW
6012 #ifdef NAMD_FFTW_3
6013  for(int i=fromIdx; i<=toIdx; i++){
6014  fftwf_execute_dft(backward_plan,
6015  ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
6016  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
6017  }
6018 #endif
6019 #endif
6020 }
6021 
6022 void PmeYPencil::backward_fft() {
6023 #ifdef NAMD_FFTW
6024 #ifdef MANUAL_DEBUG_FFTW3
6025  dumpMatrixFloat3("bw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
6026 #endif
6027 
6028 #ifdef NAMD_FFTW_3
6029 #if CMK_SMP && USE_CKLOOP
6030  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6031  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
6032  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
6033  CkLoop_Parallelize(PmeYPencilBackwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
6034  return;
6035  }
6036 #endif
6037  //the CkLoop branch above parallelizes the following serial loop
6038  for ( int i=0; i<nx; ++i ) {
6039 #if CMK_BLUEGENEL
6040  CmiNetworkProgress();
6041 #endif
6042  fftwf_execute_dft(backward_plan,
6043  ((fftwf_complex *) data) + i * nz * initdata.grid.K2,
6044  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
6045  }
6046 #else
6047  for ( int i=0; i<nx; ++i ) {
6048 #if CMK_BLUEGENEL
6049  CmiNetworkProgress();
6050 #endif
6051  fftw(backward_plan, nz,
6052  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
6053  nz, 1, (fftw_complex *) work, 1, 0);
6054  }
6055 #endif
6056 
6057 #ifdef MANUAL_DEBUG_FFTW3
6058  dumpMatrixFloat3("bw_y_a", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
6059 #endif
6060 
6061 #endif
6062 }
6063 
6064 static inline void PmeYPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
6065  PmeYPencil *ypencil = (PmeYPencil *)param;
6066  ypencil->send_subset_untrans(first, last);
6067 }
6068 
6069 void PmeYPencil::send_subset_untrans(int fromIdx, int toIdx){
6070  int yBlocks = initdata.yBlocks;
6071  int block2 = initdata.grid.block2;
6072  int K2 = initdata.grid.K2;
6073 
6074  for(int isend=fromIdx; isend<=toIdx; isend++) {
6075  int jb = send_order[isend];
6076  if ( ! needs_reply[jb] ) {
6077  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
6078  CmiEnableUrgentSend(1);
6079  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6080 #if USE_NODE_PAR_RECEIVE
6081  initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
6082 #else
6083  initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
6084 #endif
6085  CmiEnableUrgentSend(0);
6086  continue;
6087  }
6088  int ny = block2;
6089  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
6090  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
6091  msg->sourceNode = thisIndex.z;
6092  msg->ny = nz;
6093  float *md = msg->qgrid;
6094  const float *d = data;
6095  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
6096  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
6097  for ( int k=0; k<nz; ++k ) {
6098  *(md++) = d[2*(j*nz+k)];
6099  *(md++) = d[2*(j*nz+k)+1];
6100  }
6101  }
6102  }
6103  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6104  CmiEnableUrgentSend(1);
6105 #if USE_NODE_PAR_RECEIVE
6106  msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
6107  // CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
6108 #if Z_PERSIST
6109  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6110 #endif
6111  initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
6112 #if Z_PERSIST
6113  CmiUsePersistentHandle(NULL, 0);
6114 #endif
6115 #else
6116 #if Z_PERSIST
6117  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6118 #endif
6119  initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
6120 #if Z_PERSIST
6121  CmiUsePersistentHandle(NULL, 0);
6122 #endif
6123 #endif
6124  CmiEnableUrgentSend(0);
6125  }
6126 }
6127 
6128 void PmeYPencil::send_untrans() {
6129 #if USE_PERSISTENT
6130  if (untrans_handle == NULL) setup_persistent();
6131 #endif
6132 #if CMK_SMP && USE_CKLOOP
6133  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6134  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
6135  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
6136  int yBlocks = initdata.yBlocks;
6137 
6138 #if USE_NODE_PAR_RECEIVE
6139  //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 1); //sync
6140  CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 1);
6141 #else
6142  //CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, yBlocks-1, 0); //not sync
6143  CkLoop_Parallelize(PmeYPencilSendUntrans, 1, (void *)this, yBlocks, 0, yBlocks-1, 0); //not sync
6144 #endif
6145  return;
6146  }
6147 #endif
6148  int yBlocks = initdata.yBlocks;
6149  int block2 = initdata.grid.block2;
6150  int K2 = initdata.grid.K2;
6151  for ( int isend=0; isend<yBlocks; ++isend ) {
6152  int jb = send_order[isend];
6153  if ( ! needs_reply[jb] ) {
6154  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
6155  CmiEnableUrgentSend(1);
6156  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6157 #if USE_NODE_PAR_RECEIVE
6158  initdata.zPencil(thisIndex.x,jb,0).recvNodeAck(msg);
6159 #else
6160  initdata.zPencil(thisIndex.x,jb,0).recvAck(msg);
6161 #endif
6162  CmiEnableUrgentSend(0);
6163  continue;
6164  }
6165  int ny = block2;
6166  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
6167  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
6168  msg->sourceNode = thisIndex.z;
6169  msg->ny = nz;
6170  float *md = msg->qgrid;
6171  const float *d = data;
6172  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
6173  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
6174  for ( int k=0; k<nz; ++k ) {
6175  *(md++) = d[2*(j*nz+k)];
6176  *(md++) = d[2*(j*nz+k)+1];
6177  }
6178  }
6179  }
6180  SET_PRIORITY(msg,sequence,PME_UNTRANS2_PRIORITY)
6181 
6182  CmiEnableUrgentSend(1);
6183 #if USE_NODE_PAR_RECEIVE
6184  msg->destElem=CkArrayIndex3D( thisIndex.x, jb, 0);
6185  // CkPrintf("[%d] sending to %d %d %d recvZUntrans on node %d\n", CkMyPe(), thisIndex.x, jb, 0, CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem)));
6186 #if Z_PERSIST
6187  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6188 #endif
6189  initdata.pmeNodeProxy[CmiNodeOf(initdata.zm.ckLocalBranch()->procNum(0,msg->destElem))].recvZUntrans(msg);
6190 #if Z_PERSIST
6191  CmiUsePersistentHandle(NULL, 0);
6192 #endif
6193 #else
6194 #if Z_PERSIST
6195  CmiUsePersistentHandle(&untrans_handle[isend], 1);
6196 #endif
6197  initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
6198 #if Z_PERSIST
6199  CmiUsePersistentHandle(NULL, 0);
6200 #endif
6201 #endif
6202  CmiEnableUrgentSend(0);
6203  }
6204 
6205 #if USE_NODE_PAR_RECEIVE
6206  evir = 0.;
6207  CmiMemoryWriteFence();
6208 #endif
6209 }
6210 
6211 void PmeZPencil::recv_untrans(const PmeUntransMsg *msg) {
6212 #if ! USE_NODE_PAR_RECEIVE
6213  if(imsg==0) evir=0.;
6214 #endif
6215 
6216  int block3 = initdata.grid.block3;
6217  int dim3 = initdata.grid.dim3;
6218  int kb = msg->sourceNode;
6219  int nz = msg->ny;
6220  const float *md = msg->qgrid;
6221  float *d = data;
6222  for ( int i=0; i<nx; ++i ) {
6223 #if CMK_BLUEGENEL
6224  CmiNetworkProgress();
6225 #endif
6226  for ( int j=0; j<ny; ++j, d += dim3 ) {
6227  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
6228 #ifdef ZEROCHECK
6229  if ( (*md) == 0. ) CkPrintf("0 in YZ at %d %d %d %d %d %d %d %d %d\n",
6230  thisIndex.x, thisIndex.y, kb, i, j, k, nx, ny, nz);
6231 #endif
6232  d[2*k] = *(md++);
6233  d[2*k+1] = *(md++);
6234  }
6235  }
6236  }
6237 }
6238 
6239 void PmeZPencil::backward_fft() {
6240 #ifdef NAMD_FFTW
6241 #ifdef MANUAL_DEBUG_FFTW3
6242  dumpMatrixFloat3("bw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
6243 #endif
6244 #ifdef NAMD_FFTW_3
6245 #if CMK_SMP && USE_CKLOOP
6246  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6247  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
6248  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
6249  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
6250  //the CkLoop call below parallelizes the serial loop above
6251  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
6252  return;
6253  }
6254 #endif
6255  fftwf_execute(backward_plan);
6256 #else
6257  rfftwnd_complex_to_real(backward_plan, nx*ny,
6258  (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
6259 #endif
6260 #ifdef MANUAL_DEBUG_FFTW3
6261  dumpMatrixFloat3("bw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
6262 #endif
6263 
6264 #endif
6265 
6266 #if CMK_BLUEGENEL
6267  CmiNetworkProgress();
6268 #endif
6269 
6270 #ifdef FFTCHECK
6271  int dim3 = initdata.grid.dim3;
6272  int K1 = initdata.grid.K1;
6273  int K2 = initdata.grid.K2;
6274  int K3 = initdata.grid.K3;
6275  float scale = 1. / (1. * K1 * K2 * K3);
6276  float maxerr = 0.;
6277  float maxstd = 0.;
6278  int mi, mj, mk; mi = mj = mk = -1;
6279  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
6280  const float *d = data;
6281  for ( int i=0; i<nx; ++i ) {
6282  for ( int j=0; j<ny; ++j, d += dim3 ) {
6283  for ( int k=0; k<K3; ++k ) {
6284  float std = 10. * (10. * (10. * std_base + i) + j) + k;
6285  float err = scale * d[k] - std;
6286  if ( fabsf(err) > fabsf(maxerr) ) {
6287  maxerr = err;
6288  maxstd = std;
6289  mi = i; mj = j; mk = k;
6290  }
6291  }
6292  }
6293  }
6294  CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
6295  thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
6296 #endif
6297 
6298 }
6299 
6300 static inline void PmeZPencilSendUngrid(int first, int last, void *result, int paraNum, void *param){
6301  //to take advantage of the interface, which allows at most 3 user params;
6302  //in that situation, no new parameter list needs to be created! -Chao Mei
6303  PmeZPencil *zpencil = (PmeZPencil *)param;
6304  zpencil->send_subset_ungrid(first, last);
6305 }
6306 
6307 void PmeZPencil::send_all_ungrid() {
6308 
6309 #if CMK_SMP && USE_CKLOOP
6310  int useCkLoop = Node::Object()->simParameters->useCkLoop;
6311  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
6312  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
6313  //TODO: determine the best value for numChunks
6314  CkLoop_Parallelize(PmeZPencilSendUngrid, 1, (void *)this, grid_msgs.size(), 0, grid_msgs.size()-1, 1); //has to sync
6315  return;
6316  }
6317 #endif
6318  send_subset_ungrid(0, grid_msgs.size()-1);
6319 }
6320 
6321 void PmeZPencil::send_subset_ungrid(int fromIdx, int toIdx){
6322  for (int limsg=fromIdx; limsg <=toIdx; ++limsg ) {
6323  PmeGridMsg *msg = grid_msgs[limsg];
6324  send_ungrid(msg);
6325  }
6326 }
6327 
6328 void PmeZPencil::send_ungrid(PmeGridMsg *msg) {
6329 
6330 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
6331  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
6332 #else
6333  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
6334 #endif
6335 
6336  int pe = msg->sourceNode;
6337  if ( ! msg->hasData ) {
6338  delete msg;
6339  PmeAckMsg *ackmsg = new (PRIORITY_SIZE) PmeAckMsg;
6340  SET_PRIORITY(ackmsg,sequence,UNGRID_PRIORITY)
6341  CmiEnableUrgentSend(1);
6342  initdata.pmeProxy[pe].recvAck(ackmsg);
6343  CmiEnableUrgentSend(0);
6344  return;
6345  }
6346  if ( ! hasData ) NAMD_bug("PmeZPencil::send_ungrid msg->hasData but not pencil->hasData");
6347  msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
6348  int dim3 = initdata.grid.dim3;
6349  int zlistlen = msg->zlistlen;
6350  int *zlist = msg->zlist;
6351  char *fmsg = msg->fgrid;
6352  float *qmsg = msg->qgrid;
6353  float *d = data;
6354  int numGrids = 1; // pencil FFT doesn't support multiple grids
6355  for ( int g=0; g<numGrids; ++g ) {
6356 #if CMK_BLUEGENEL
6357  CmiNetworkProgress();
6358 #endif
6359  for ( int i=0; i<nx; ++i ) {
6360  for ( int j=0; j<ny; ++j, d += dim3 ) {
6361  if( *(fmsg++) ) {
6362  for ( int k=0; k<zlistlen; ++k ) {
6363  *(qmsg++) = d[zlist[k]];
6364  }
6365  }
6366  }
6367  }
6368  }
6369  SET_PRIORITY(msg,sequence,UNGRID_PRIORITY)
6370  CmiEnableUrgentSend(1);
6371 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
6372  if ( offload ) {
6373  initdata.pmeNodeProxy[CkNodeOf(pe)].recvUngrid(msg);
6374  } else
6375 #endif
6376  initdata.pmeProxy[pe].recvUngrid(msg);
6377  CmiEnableUrgentSend(0);
6378 }
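
// Illustrative sketch (not from the NAMD sources): the repack above reads
// one flag byte per (i,j) column and, when it is set, copies only the z
// offsets listed in zlist, so each patch gets just the grid planes it
// touched. pack_column is an editorial name.
static float *pack_column(const float *col, const int *zlist, int zlistlen,
                          float *out) {
  for ( int k=0; k<zlistlen; ++k ) *(out++) = col[zlist[k]];
  return out;   // advanced write cursor, like qmsg in send_ungrid
}
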
6379 
6380 void PmeZPencil::node_process_grid(PmeGridMsg *msg)
6381 {
6382 #if USE_NODE_PAR_RECEIVE
6383  CmiLock(ComputePmeMgr::fftw_plan_lock);
6384  CmiMemoryReadFence();
6385 #endif
6386  recv_grid(msg);
6387  if(msg->hasData) hasData=msg->hasData;
6388  int limsg;
6389  CmiMemoryAtomicFetchAndInc(imsg,limsg);
6390  grid_msgs[limsg] = msg;
6391  // CkPrintf("[%d] PmeZPencil node_process_grid for %d %d %d has %d of %d imsg %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z, limsg, grid_msgs.size(), imsg);
6392  if(limsg+1 == grid_msgs.size())
6393  {
6394 
6395  if (hasData)
6396  {
6397  forward_fft();
6398  }
6399  send_trans();
6400  imsg=0;
6401  CmiMemoryWriteFence();
6402  // CkPrintf("[%d] PmeZPencil grid node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
6403  }
6404 #if USE_NODE_PAR_RECEIVE
6405  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
6406  CmiMemoryWriteFence();
6407 #endif
6408 }
6409 
6410 void PmeZPencil::recvNodeAck(PmeAckMsg *msg) {
6411  delete msg;
6412  node_process_untrans(0);
6413 }
6414 
6415 void PmeZPencil::node_process_untrans(PmeUntransMsg *msg)
6416 {
6417  if ( msg ) {
6418  if ( ! hasData ) NAMD_bug("PmeZPencil::node_process_untrans non-null msg but not hasData");
6419  recv_untrans(msg);
6420  } else if ( hasData ) NAMD_bug("PmeZPencil::node_process_untrans hasData but null msg");
6421 #if USE_NODE_PAR_RECEIVE
6422  CmiMemoryWriteFence();
6423  CmiLock(ComputePmeMgr::fftw_plan_lock);
6424 #endif
6425  int limsg;
6426  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
6427  if(limsg+1 == initdata.zBlocks)
6428  {
6429 #if USE_NODE_PAR_RECEIVE
6430  CmiMemoryReadFence();
6431 #endif
6432  if(hasData) {
6433  backward_fft();
6434  }
6435  send_all_ungrid();
6436  hasData=0;
6437  imsgb=0;
6438  evir = 0;
6439  memset(data, 0, sizeof(float) * nx*ny* initdata.grid.dim3);
6440  CmiMemoryWriteFence();
6441  // CkPrintf("[%d] PmeZPencil untrans node_zero imsg for %d %d %d\n",CkMyPe(),thisIndex.x,thisIndex.y,thisIndex.z);
6442  }
6443 #if USE_NODE_PAR_RECEIVE
6444  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
6445 #endif
6446 }
6447 
6448 void ComputePme::select(void)
6449 {
6450  if ( CkMyRank() ) return;
6451 
6452  SimParameters *simParams = Node::Object()->simParameters;
6453 
6454  alchOn = simParams->alchOn;
6455  alchFepOn = simParams->alchFepOn;
6456  alchThermIntOn = simParams->alchThermIntOn;
6457  alchDecouple = alchOn && simParams->alchDecouple;
6458  alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
6459  lesOn = simParams->lesOn;
6460  lesFactor = simParams->lesFactor;
6461  pairOn = simParams->pairInteractionOn;
6462  selfOn = simParams->pairInteractionSelf;
6463 
6464  LJPMEOn = (simParams->LJPMEOn && ! simParams->LJPMESerialOn);
6465 
6466  if ( alchOn ) {
6467  numGrids = 2;
6468  if (alchDecouple) numGrids += 2;
6469  if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
6470  } else if ( lesOn ) {
6471  numGrids = lesFactor;
6472  } else if ( pairOn ) {
6473  if ( selfOn ) pairOn = 0; // make pairOn and selfOn exclusive
6474  numGrids = (selfOn ? 1 : 3);
6475  } else {
6476  numGrids = 1;
6477  }
6478 
6479  if ( LJPMEOn ) {
6480  numGrids *= 2;
6481  }
6482 
6483 }
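
// Illustrative sketch (not from the NAMD sources): the grid-count rules of
// select() as a pure function. grid_count is an editorial name; the flags
// mirror the SimParameters fields read above.
static int grid_count(bool alch, bool decouple, bool extraAlchGrid,
                      bool les, int lesFactor, bool pair, bool self,
                      bool ljpme) {
  int n;
  if ( alch ) { n = 2; if ( decouple ) n += 2; if ( extraAlchGrid ) ++n; }
  else if ( les ) n = lesFactor;
  else if ( pair ) n = ( self ? 1 : 3 );
  else n = 1;
  if ( ljpme ) n *= 2;            // one dispersion grid per electrostatic grid
  return n;
}
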
6484 
6485 #include "ComputePmeMgr.def.h"
6486 