1 /**
2 ***  Copyright (c) 1995, 1996, 1997, 1998, 1999, 2000 by
3 ***  The Board of Trustees of the University of Illinois.
4 ***  All rights reserved.
5 **/
6 
7 #ifdef NAMD_FFTW
8 //#define MANUAL_DEBUG_FFTW3 1
9 #ifdef NAMD_FFTW_3
10 #include <fftw3.h>
11 #else
12 // fftw2 doesn't have these defined
13 #define fftwf_malloc fftw_malloc
14 #define fftwf_free fftw_free
15 #ifdef NAMD_FFTW_NO_TYPE_PREFIX
16 #include <fftw.h>
17 #include <rfftw.h>
18 #else
19 #include <sfftw.h>
20 #include <srfftw.h>
21 #endif
22 #endif
23 #endif
24 
25 #include <vector>
26 #include <algorithm>
27 #include <deque>
28 using namespace std;
29 
30 #include "InfoStream.h"
31 #include "Node.h"
32 #include "PatchMap.h"
33 #include "PatchMap.inl"
34 #include "AtomMap.h"
35 #include "ComputePme.h"
36 #include "ComputePmeMgr.decl.h"
37 #include "PmeBase.inl"
38 #include "PmeRealSpace.h"
39 #include "PmeKSpace.h"
40 #include "ComputeNonbondedUtil.h"
41 #include "PatchMgr.h"
42 #include "Molecule.h"
43 #include "ReductionMgr.h"
44 #include "ComputeMgr.h"
45 #include "ComputeMgr.decl.h"
46 // #define DEBUGM
47 #define MIN_DEBUG_LEVEL 3
48 #include "Debug.h"
49 #include "SimParameters.h"
50 #include "WorkDistrib.h"
51 #include "varsizemsg.h"
52 #include "Random.h"
53 #include "ckhashtable.h"
54 #include "Priorities.h"
55 #include "CudaUtils.h"
56 #include "ComputeMoa.h"
57 #include "ComputeMoaMgr.decl.h"
58 
59 //#define USE_RANDOM_TOPO 1
60 
61 //#define USE_TOPO_SFC 1
62 //#define USE_CKLOOP 1
63 //#include "TopoManager.h"
64 
65 #include "DeviceCUDA.h"
66 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
67 void cuda_errcheck(const char *msg) {
68  cudaError_t err;
69  if ((err = cudaGetLastError()) != cudaSuccess) {
70  char host[128];
71  gethostname(host, 128); host[127] = 0;
72  char devstr[128] = "";
73  int devnum;
74  if ( cudaGetDevice(&devnum) == cudaSuccess ) {
75  sprintf(devstr, " device %d", devnum);
76  }
77  cudaDeviceProp deviceProp;
78  if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
79  sprintf(devstr, " device %d pci %x:%x:%x", devnum,
80  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
81  }
82  char errmsg[1024];
83  sprintf(errmsg,"CUDA error %s on Pe %d (%s%s): %s", msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
84  NAMD_die(errmsg);
85  }
86 }
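// cuda_errcheck() is called right after each CUDA runtime call in this file,
// e.g. (hypothetical illustration, not a line from this file):
//   cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, streams[stream]);
//   cuda_errcheck("in cudaMemcpyAsync");
// so an asynchronous failure is reported with the host name, device number,
// and PCI address of the PE that triggered it rather than surfacing later.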
87 #ifdef NAMD_CUDA
88 #include <cuda_runtime.h>
89 #include <cuda.h>
90 #endif
91 #ifdef NAMD_HIP
92 #include "HipDefines.h"
93 #include <hip/hip_runtime.h>
94 #endif
95 void cuda_errcheck(const char *msg);
96 #ifdef WIN32
97 #define __thread __declspec(thread)
98 #endif
99 extern __thread DeviceCUDA *deviceCUDA;
100 #endif
101 
102 #include "ComputePmeCUDAKernel.h"
103 
104 #ifndef SQRT_PI
105 #define SQRT_PI 1.7724538509055160273 /* mathematica 15 digits*/
106 #endif
107 
108 #if CMK_PERSISTENT_COMM
109 #define USE_PERSISTENT 1
110 #endif
111 
112 #if USE_PERSISTENT
113 #define Z_PERSIST 1
114 #define Y_PERSIST 1
115 #define X_PERSIST 1
116 #endif
117 
118 #if (defined(NAMD_HIP) || defined(NAMD_CUDA)) && defined(MEM_OPT_VERSION)
119 #define USE_NODE_PAR_RECEIVE 1
120 #endif
121 
132 
133 char *pencilPMEProcessors;
134 
135 class PmeAckMsg : public CMessage_PmeAckMsg {
136 };
137 
138 class PmeGridMsg : public CMessage_PmeGridMsg {
139 public:
140 
141  int sourceNode;
142  int sequence;
143  int hasData;
144  Lattice lattice;
145  int start;
146  int len;
147  int zlistlen;
148  int *zlist;
149  char *fgrid;
150  float *qgrid;
151  CkArrayIndex3D destElem;
152 };
153 
154 class PmeTransMsg : public CMessage_PmeTransMsg {
155 public:
156 
157  int sourceNode;
158  int sequence;
159  int hasData;
160  Lattice lattice;
161  int x_start;
162  int nx;
163  float *qgrid;
164  CkArrayIndex3D destElem;
165 };
166 
167 class PmeSharedTransMsg : public CMessage_PmeSharedTransMsg {
168 public:
169  PmeTransMsg *msg;
170  int *count;
171  CmiNodeLock lock;
172 };
173 
174 class PmeUntransMsg : public CMessage_PmeUntransMsg {
175 public:
176 
177  int sourceNode;
178  int y_start;
179  int ny;
180  float *qgrid;
181  CkArrayIndex3D destElem;
182 };
183 
184 class PmeSharedUntransMsg : public CMessage_PmeSharedUntransMsg {
185 public:
186  PmeUntransMsg *msg;
187  int *count;
188  CmiNodeLock lock;
189 };
190 
191 class PmeEvirMsg : public CMessage_PmeEvirMsg {
192 public:
193  PmeReduction *evir;
194 };
195 
196 class PmePencilMap : public CBase_PmePencilMap {
197 public:
198  PmePencilMap(int i_a, int i_b, int n_b, int n, int *d)
199  : ia(i_a), ib(i_b), nb(n_b),
200  size(n), data(newcopyint(n,d)) {
201  }
202  virtual int registerArray(CkArrayIndexMax&, CkArrayID) {
203  //Return an ``arrayHdl'', given some information about the array
204  return 0;
205  }
206  virtual int procNum(int, const CkArrayIndex &i) {
207  //Return the home processor number for this element of this array
208  return data[ i.data()[ia] * nb + i.data()[ib] ];
209  }
210  virtual void populateInitial(int, CkArrayIndexMax &, void *msg, CkArrMgr *mgr) {
211  int mype = CkMyPe();
212  for ( int i=0; i < size; ++i ) {
213  if ( data[i] == mype ) {
214  CkArrayIndex3D ai(0,0,0);
215  ai.data()[ia] = i / nb;
216  ai.data()[ib] = i % nb;
217  if ( procNum(0,ai) != mype ) NAMD_bug("PmePencilMap is inconsistent");
218  if ( ! msg ) NAMD_bug("PmePencilMap multiple pencils on a pe?");
219  mgr->insertInitial(ai,msg);
220  msg = 0;
221  }
222  }
223  mgr->doneInserting();
224  if ( msg ) CkFreeMsg(msg);
225  }
226 private:
227  const int ia, ib, nb, size;
228  const int* const data;
229  static int* newcopyint(int n, int *d) {
230  int *newd = new int[n];
231  memcpy(newd, d, n*sizeof(int));
232  return newd;
233  }
234 };
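// Illustration (not part of the original source): a PmePencilMap built with
// ia=0, ib=1, nb=yBlocks stores the home PE of pencil (x,y) at
// data[x*yBlocks + y], so for a 2x3 pencil grid the flattened table is
//   (0,0)->data[0] (0,1)->data[1] (0,2)->data[2]
//   (1,0)->data[3] (1,1)->data[4] (1,2)->data[5]
// which is exactly how the zPencil map is constructed in initialize() below.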
235 
236 // use this idiom since messages don't have copy constructors
237 struct PmePencilInitMsgData {
238  PmeGrid grid;
239  int xBlocks, yBlocks, zBlocks;
240  CProxy_PmeXPencil xPencil;
241  CProxy_PmeYPencil yPencil;
242  CProxy_PmeZPencil zPencil;
243  CProxy_ComputePmeMgr pmeProxy;
244  CProxy_NodePmeMgr pmeNodeProxy;
245  CProxy_PmePencilMap xm;
246  CProxy_PmePencilMap ym;
247  CProxy_PmePencilMap zm;
248 };
249 
250 class PmePencilInitMsg : public CMessage_PmePencilInitMsg {
251 public:
252  PmePencilInitMsg(PmePencilInitMsgData &d) { data = d; }
253  PmePencilInitMsgData data;
254 };
255 
256 
257 struct LocalPmeInfo {
258  int nx, x_start;
259  int ny_after_transpose, y_start_after_transpose;
260 };
261 
262 struct NodePmeInfo {
263  int npe, pe_start, real_node;
264 };
265 
266 
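// Pick the PE that will collect the reciprocal-space energy/virial
// reductions, preferring locality: first this PE if it owns patches, then a
// PE on the same Charm++ node, then one on the same physical node, and
// finally any patch-owning PE in the partition.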
267 static int findRecipEvirPe() {
268  PatchMap *patchMap = PatchMap::Object();
269  {
270  int mype = CkMyPe();
271  if ( patchMap->numPatchesOnNode(mype) ) {
272  return mype;
273  }
274  }
275  {
276  int node = CmiMyNode();
277  int firstpe = CmiNodeFirst(node);
278  int nodeSize = CmiNodeSize(node);
279  int myrank = CkMyRank();
280  for ( int i=0; i<nodeSize; ++i ) {
281  int pe = firstpe + (myrank+i)%nodeSize;
282  if ( patchMap->numPatchesOnNode(pe) ) {
283  return pe;
284  }
285  }
286  }
287  {
288  int *pelist;
289  int nodeSize;
290  CmiGetPesOnPhysicalNode(CmiPhysicalNodeID(CkMyPe()), &pelist, &nodeSize);
291  int myrank;
292  for ( int i=0; i<nodeSize; ++i ) {
293  if ( pelist[i] == CkMyPe() ) myrank = i;
294  }
295  for ( int i=0; i<nodeSize; ++i ) {
296  int pe = pelist[(myrank+i)%nodeSize];
297  if ( patchMap->numPatchesOnNode(pe) ) {
298  return pe;
299  }
300  }
301  }
302  {
303  int mype = CkMyPe();
304  int npes = CkNumPes();
305  for ( int i=0; i<npes; ++i ) {
306  int pe = (mype+i)%npes;
307  if ( patchMap->numPatchesOnNode(pe) ) {
308  return pe;
309  }
310  }
311  }
312  NAMD_bug("findRecipEvirPe() failed!");
313  return -999; // should never happen
314 }
315 
316 
317 //Assigns gridPeMap and transPeMap to different set of processors.
318 void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes){
319  int ncpus = CkNumPes();
320 
321  for ( int i=0; i<numGridPes; ++i ) {
322  gridPeMap[i] = WorkDistrib::peDiffuseOrdering[ncpus - numGridPes + i];
323  }
324  std::sort(gridPeMap,gridPeMap+numGridPes);
325  int firstTransPe = ncpus - numGridPes - numTransPes;
326  if ( firstTransPe < 0 ) {
327  firstTransPe = 0;
328  // 0 should be first in list, skip if possible
329  if ( ncpus > numTransPes ) firstTransPe = 1;
330  }
331  for ( int i=0; i<numTransPes; ++i ) {
332  transPeMap[i] = WorkDistrib::peDiffuseOrdering[firstTransPe + i];
333  }
334  std::sort(transPeMap,transPeMap+numTransPes);
335 }
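// Illustration (hypothetical numbers): with ncpus=8, numGridPes=2, and
// numTransPes=3, the grid PEs take the last two slots of
// WorkDistrib::peDiffuseOrdering and the trans PEs take slots 3..5
// (firstTransPe = 8-2-3 = 3), so the two groups do not overlap unless the
// partition is too small to keep them apart.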
336 
337 #if USE_TOPOMAP
338 //Topology aware PME allocation
339 bool generateBGLORBPmePeList(int *pemap, int numPes, int *block_pes=0,
340  int nbpes=0);
341 #endif
342 
343 
344 int compare_bit_reversed(int a, int b) {
345  int d = a ^ b;
346  int c = 1;
347  if ( d ) while ( ! (d & c) ) {
348  c = c << 1;
349  }
350  return (a & c) - (b & c);
351 }
352 
353 inline bool less_than_bit_reversed(int a, int b) {
354  int d = a ^ b;
355  int c = 1;
356  if ( d ) while ( ! (d & c) ) {
357  c = c << 1;
358  }
359  return d && (b & c);
360 }
361 
362 struct sortop_bit_reversed {
363  inline bool operator() (int a, int b) const {
364  return less_than_bit_reversed(a,b);
365  }
366 };
367 
368 struct ijpair {
369  int i,j;
370  ijpair() {;}
371  ijpair(int I, int J) : i(I), j(J) {;}
372 };
373 
374 struct ijpair_sortop_bit_reversed {
375  inline bool operator() (const ijpair &a, const ijpair &b) const {
376  return ( less_than_bit_reversed(a.i,b.i)
377  || ( (a.i == b.i) && less_than_bit_reversed(a.j,b.j) ) );
378  }
379 };
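// Illustration (not part of the original source): sorting {0..7} with
// sortop_bit_reversed() yields 0,4,2,6,1,5,3,7, i.e. integers ordered by
// their bit-reversed values. less_than_bit_reversed() compares at the lowest
// differing bit, which is the most significant differing bit after reversal;
// the effect is to spread consecutive pencil indices across distant PEs
// instead of clustering them.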
380 
381 class ComputePmeMgr : public CBase_ComputePmeMgr, public ComputePmeUtil {
382 public:
383  friend class ComputePme;
384  friend class NodePmeMgr;
385  ComputePmeMgr();
386  ~ComputePmeMgr();
387 
388  void initialize(CkQdMsg*);
389  void initialize_pencils(CkQdMsg*);
390  void activate_pencils(CkQdMsg*);
391  void recvArrays(CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil);
392  void initialize_computes();
393 
394  void sendData(Lattice &, int sequence);
395  void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors);
400  void sendPencils(Lattice &, int sequence);
401  void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe);
402  void recvGrid(PmeGridMsg *);
403  void gridCalc1(void);
404  void sendTransBarrier(void);
405  void sendTransSubset(int first, int last);
406  void sendTrans(void);
407  void fwdSharedTrans(PmeTransMsg *);
408  void recvSharedTrans(PmeSharedTransMsg *);
409  void sendDataHelper(int);
410  void sendPencilsHelper(int);
411  void recvTrans(PmeTransMsg *);
412  void procTrans(PmeTransMsg *);
413  void gridCalc2(void);
414  #ifdef OPENATOM_VERSION
415  void gridCalc2Moa(void);
416  #endif // OPENATOM_VERSION
417  void gridCalc2R(void);
418  void fwdSharedUntrans(PmeUntransMsg *);
419  void recvSharedUntrans(PmeSharedUntransMsg *);
420  void sendUntrans(void);
421  void sendUntransSubset(int first, int last);
422  void recvUntrans(PmeUntransMsg *);
423  void procUntrans(PmeUntransMsg *);
424  void gridCalc3(void);
425  void sendUngrid(void);
426  void sendUngridSubset(int first, int last);
427  void recvUngrid(PmeGridMsg *);
428  void recvAck(PmeAckMsg *);
429  void copyResults(PmeGridMsg *);
430  void copyPencils(PmeGridMsg *);
431  void ungridCalc(void);
432  void recvRecipEvir(PmeEvirMsg *);
433  void addRecipEvirClient(void);
434  void submitReductions();
435 
436 #if 0 && USE_PERSISTENT
437  void setup_recvgrid_persistent();
438 #endif
439 
440  static CmiNodeLock fftw_plan_lock;
441  CmiNodeLock pmemgr_lock; // for accessing this object from other threads
442 
443 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
444  float *a_data_host;
445  float *a_data_dev;
446  float *f_data_host;
447  float *f_data_dev;
448  int cuda_atoms_count;
449  int cuda_atoms_alloc;
450  static CmiNodeLock cuda_lock;
451  void chargeGridSubmitted(Lattice &lattice, int sequence);
452  cudaEvent_t end_charges;
453  cudaEvent_t *end_forces;
454  int forces_count;
455  int forces_done_count;
456  double charges_time;
457  double forces_time;
458  int check_charges_count;
459  int check_forces_count;
460  int master_pe;
461  int this_pe;
462 
463  void cuda_submit_charges(Lattice &lattice, int sequence);
464  struct cuda_submit_charges_args {
465  ComputePmeMgr *mgr; Lattice *lattice; int sequence;
466  };
467  static std::deque<cuda_submit_charges_args> cuda_submit_charges_deque;
468  static bool cuda_busy;
469 
470  int chargeGridSubmittedCount;
471  void sendChargeGridReady();
472 #endif
473  Lattice *saved_lattice; // saved by chargeGridSubmitted
474  int saved_sequence; // saved by chargeGridSubmitted
475  void pollChargeGridReady();
476  void pollForcesReady();
477  void recvChargeGridReady();
478  void chargeGridReady(Lattice &lattice, int sequence);
479 
479 
480  ResizeArray<ComputePme*> pmeComputes;
481 
482 private:
483 
484 #if 0 && USE_PERSISTENT
485  PersistentHandle *recvGrid_handle;
486 #endif
487 
488  CProxy_ComputePmeMgr pmeProxy;
489  CProxy_ComputePmeMgr pmeProxyDir;
490  CProxy_NodePmeMgr pmeNodeProxy;
491  NodePmeMgr *nodePmeMgr;
492  ComputePmeMgr *masterPmeMgr;
493 
494  void addCompute(ComputePme *c) {
495  if ( ! pmeComputes.size() ) initialize_computes();
496  pmeComputes.add(c);
497  c->setMgr(this);
498  }
499 
500  ResizeArray<ComputePme*> heldComputes;
501  PmeGrid myGrid;
502  Lattice lattice;
503  PmeKSpace *myKSpace;
504  float *qgrid;
505  float *kgrid;
506 
507 #ifdef NAMD_FFTW
508 #ifdef NAMD_FFTW_3
509  fftwf_plan *forward_plan_x, *backward_plan_x;
510  fftwf_plan *forward_plan_yz, *backward_plan_yz;
511  fftwf_complex *work;
512 #else
513  fftw_plan forward_plan_x, backward_plan_x;
514  rfftwnd_plan forward_plan_yz, backward_plan_yz;
515  fftw_complex *work;
516 #endif
517 #else
518  float *work;
519 #endif
520 
521  int qsize, fsize, bsize;
522  int offload;
523  BigReal alchLambda; // set on each step in ComputePme::ungridForces()
524  BigReal alchLambda2; // set on each step in ComputePme::ungridForces()
525 
526  float **q_arr;
527  // q_list and q_count not used for offload
528  float **q_list;
529  int q_count;
530  char *f_arr;
531  char *fz_arr;
532  PmeReduction evir[PME_MAX_EVALS];
533  SubmitReduction *reduction;
534 
535  int noWorkCount;
536  int doWorkCount;
537  int ungridForcesCount;
538 
539 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
540 #define NUM_STREAMS 1
541  cudaStream_t streams[NUM_STREAMS];
542  int stream;
543 
544  float **q_arr_dev;
545  float **v_arr_dev;
546  float *q_data_host;
547  float *q_data_dev;
548  float *v_data_dev;
549  int *ffz_host;
550  int *ffz_dev;
551  int q_data_size;
552  int ffz_size;
553 
554  int f_data_mgr_alloc;
555  float *f_data_mgr_host;
556  float *f_data_mgr_dev;
557  float **afn_host;
558  float **afn_dev;
559 
560  float *bspline_coeffs_dev;
561  float *bspline_dcoeffs_dev;
562 #endif
563  int recipEvirCount; // used in compute only
564  int recipEvirClients; // used in compute only
565  int recipEvirPe; // used in trans only
566 
567  LocalPmeInfo *localInfo;
568  NodePmeInfo *gridNodeInfo;
569  NodePmeInfo *transNodeInfo;
570  int qgrid_size;
571  int qgrid_start;
572  int qgrid_len;
573  int fgrid_start;
574  int fgrid_len;
575 
576  int numSources;
577  int numGridPes;
578  int numTransPes;
579  int numGridNodes;
580  int numTransNodes;
581  int numDestRecipPes;
582  int myGridPe, myGridNode;
583  int myTransPe, myTransNode;
584  int *gridPeMap;
585  int *transPeMap;
586  int *recipPeDest;
587  int *gridPeOrder;
588  int *gridNodeOrder;
589  int *transNodeOrder;
590  int grid_count;
591  int trans_count;
592  int untrans_count;
593  int ungrid_count;
594  PmeGridMsg **gridmsg_reuse;
595  PmeReduction recip_evir2[PME_MAX_EVALS];
596 
597  int compute_sequence; // set from patch computes, used for priorities
598  int grid_sequence; // set from grid messages, used for priorities
599  int useBarrier;
600  int sendTransBarrier_received;
601 
602  int usePencils;
603  int xBlocks, yBlocks, zBlocks;
604  CProxy_PmeXPencil xPencil;
605  CProxy_PmeYPencil yPencil;
606  CProxy_PmeZPencil zPencil;
607  char *pencilActive;
608  ijpair *activePencils;
609  int numPencilsActive;
610  int strayChargeErrors;
611 };
612 
613 ResizeArray<ComputePme*>& getComputes(ComputePmeMgr *mgr) {
614  return mgr->pmeComputes ;
615 }
616 
617  CmiNodeLock ComputePmeMgr::fftw_plan_lock;
618 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
619  CmiNodeLock ComputePmeMgr::cuda_lock;
620  std::deque<ComputePmeMgr::cuda_submit_charges_args> ComputePmeMgr::cuda_submit_charges_deque;
621  bool ComputePmeMgr::cuda_busy;
622 #endif
623 
624 int isPmeProcessor(int p){
625  SimParameters *simParams = Node::Object()->simParameters;
626  if (simParams->usePMECUDA) {
627  return 0;
628  } else {
629  return pencilPMEProcessors[p];
630  }
631 }
632 
633 class NodePmeMgr : public CBase_NodePmeMgr {
634 public:
635  friend class ComputePmeMgr;
636  friend class ComputePme;
637  NodePmeMgr();
638  ~NodePmeMgr();
639  void initialize();
640  void sendDataHelper(int);
641  void sendPencilsHelper(int);
642  void recvTrans(PmeTransMsg *);
643  void recvUntrans(PmeUntransMsg *);
644  void registerXPencil(CkArrayIndex3D, PmeXPencil *);
645  void registerYPencil(CkArrayIndex3D, PmeYPencil *);
646  void registerZPencil(CkArrayIndex3D, PmeZPencil *);
647  void recvXTrans(PmeTransMsg *);
648  void recvYTrans(PmeTransMsg *);
649  void recvYUntrans(PmeUntransMsg *);
650  void recvZGrid(PmeGridMsg *);
651  void recvZUntrans(PmeUntransMsg *);
652 
653  void recvUngrid(PmeGridMsg *);
654 
655  void recvPencilMapProxies(CProxy_PmePencilMap _xm, CProxy_PmePencilMap _ym, CProxy_PmePencilMap _zm){
656  xm=_xm; ym=_ym; zm=_zm;
657  }
658  CProxy_PmePencilMap xm;
659  CProxy_PmePencilMap ym;
660  CProxy_PmePencilMap zm;
661 
662 private:
663  CProxy_ComputePmeMgr mgrProxy;
664  ComputePmeMgr *mgrObject;
665  ComputePmeMgr **mgrObjects;
666 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
667  ComputePmeMgr *masterPmeMgr;
668  int master_pe;
669 #endif
670  CProxy_PmeXPencil xPencil;
671  CProxy_PmeYPencil yPencil;
672  CProxy_PmeZPencil zPencil;
673  CkHashtableT<CkArrayIndex3D,PmeXPencil*> xPencilObj;
674  CkHashtableT<CkArrayIndex3D,PmeYPencil*> yPencilObj;
675  CkHashtableT<CkArrayIndex3D,PmeZPencil*> zPencilObj;
676 
677 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
678  cudaEvent_t end_charge_memset;
679  cudaEvent_t end_all_pme_kernels;
680  cudaEvent_t end_potential_memcpy;
681 #endif
682 };
683 
684 NodePmeMgr::NodePmeMgr() {
685  mgrObjects = new ComputePmeMgr*[CkMyNodeSize()];
686 }
687 
688 NodePmeMgr::~NodePmeMgr() {
689  delete [] mgrObjects;
690 }
691 
692 void NodePmeMgr::initialize() {
693  CProxy_ComputePmeMgr proxy = CkpvAccess(BOCclass_group).computePmeMgr;
694  mgrObjects[CkMyRank()] = proxy.ckLocalBranch();
695  if ( CkMyRank() == 0 ) {
696  mgrProxy = proxy;
697  mgrObject = proxy.ckLocalBranch();
698  }
699 }
700 
701 void NodePmeMgr::recvTrans(PmeTransMsg *msg) {
702  mgrObject->fwdSharedTrans(msg);
703 }
704 
705 void NodePmeMgr::recvUntrans(PmeUntransMsg *msg) {
706  mgrObject->fwdSharedUntrans(msg);
707 }
708 
709 void NodePmeMgr::recvUngrid(PmeGridMsg *msg) {
710 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
711  masterPmeMgr->recvUngrid(msg);
712 #else
713  NAMD_bug("NodePmeMgr::recvUngrid called in non-CUDA build.");
714 #endif
715 }
716 
717 void NodePmeMgr::registerXPencil(CkArrayIndex3D idx, PmeXPencil *obj)
718 {
719  CmiLock(ComputePmeMgr::fftw_plan_lock);
720  xPencilObj.put(idx)=obj;
721  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
722 }
723 void NodePmeMgr::registerYPencil(CkArrayIndex3D idx, PmeYPencil *obj)
724 {
725  CmiLock(ComputePmeMgr::fftw_plan_lock);
726  yPencilObj.put(idx)=obj;
727  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
728 }
729 void NodePmeMgr::registerZPencil(CkArrayIndex3D idx, PmeZPencil *obj)
730 {
731  CmiLock(ComputePmeMgr::fftw_plan_lock);
732  zPencilObj.put(idx)=obj;
733  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
734 }
735 
736 ComputePmeMgr::ComputePmeMgr() : pmeProxy(thisgroup),
737  pmeProxyDir(thisgroup) {
738 
739  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
740  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
741  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
742 
743  pmeNodeProxy.ckLocalBranch()->initialize();
744 
745  if ( CmiMyRank() == 0 ) {
746  fftw_plan_lock = CmiCreateLock();
747  }
748  pmemgr_lock = CmiCreateLock();
749 
750  myKSpace = 0;
751  kgrid = 0;
752  work = 0;
753  grid_count = 0;
754  trans_count = 0;
755  untrans_count = 0;
756  ungrid_count = 0;
757  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
758  useBarrier = 0;
759  sendTransBarrier_received = 0;
760  usePencils = 0;
761 
762 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
763  // offload has not been set so this happens on every run
764  if ( CmiMyRank() == 0 ) {
765  cuda_lock = CmiCreateLock();
766  }
767 
768 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
769  int leastPriority, greatestPriority;
770  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
771  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
772  //if ( CkMyNode() == 0 ) {
773  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
774  //}
775 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
776 #else
777 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
778 #endif
779 
780  stream = 0;
781  for ( int i=0; i<NUM_STREAMS; ++i ) {
782 #if 1
783  CUDA_STREAM_CREATE(&streams[i]);
784  cuda_errcheck("cudaStreamCreate");
785 #else
786  streams[i] = 0; // XXXX Testing!!!
787 #endif
788  }
789 
790  this_pe = CkMyPe();
791 
792  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
793  end_forces = 0;
794  check_charges_count = 0;
795  check_forces_count = 0;
796  chargeGridSubmittedCount = 0;
797 
798  cuda_atoms_count = 0;
799  cuda_atoms_alloc = 0;
800 
801  f_data_mgr_alloc = 0;
802  f_data_mgr_host = 0;
803  f_data_mgr_dev = 0;
804  afn_host = 0;
805  afn_dev = 0;
806 
807 #define CUDA_EVENT_ID_PME_CHARGES 80
808 #define CUDA_EVENT_ID_PME_FORCES 81
809 #define CUDA_EVENT_ID_PME_TICK 82
810 #define CUDA_EVENT_ID_PME_COPY 83
811 #define CUDA_EVENT_ID_PME_KERNEL 84
812  if ( 0 == CkMyPe() ) {
813  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
814  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
815  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
816  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
817  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
818  }
819 #endif
820  recipEvirCount = 0;
821  recipEvirClients = 0;
822  recipEvirPe = -999;
823 }
824 
825 
826 void ComputePmeMgr::recvArrays(
827  CProxy_PmeXPencil x, CProxy_PmeYPencil y, CProxy_PmeZPencil z) {
828  xPencil = x; yPencil = y; zPencil = z;
829 
830  if(CmiMyRank()==0)
831  {
832  pmeNodeProxy.ckLocalBranch()->xPencil=x;
833  pmeNodeProxy.ckLocalBranch()->yPencil=y;
834  pmeNodeProxy.ckLocalBranch()->zPencil=z;
835  }
836 }
837 
838 #if USE_TOPO_SFC
839  struct Coord
840  {
841  int x, y, z;
842  Coord(): x(0), y(0), z(0) {}
843  Coord(int a, int b, int c): x(a), y(b), z(c) {}
844  };
845  extern void SFC_grid(int xdim, int ydim, int zdim, int xdim1, int ydim1, int zdim1, vector<Coord> &result);
846 
847  void sort_sfc(SortableResizeArray<int> &procs, TopoManager &tmgr, vector<Coord> &result)
848  {
849  SortableResizeArray<int> newprocs(procs.size());
850  int num = 0;
851  for (int i=0; i<result.size(); i++) {
852  Coord &c = result[i];
853  for (int j=0; j<procs.size(); j++) {
854  int pe = procs[j];
855  int x,y,z,t;
856  tmgr.rankToCoordinates(pe, x, y, z, t);
857  if (x==c.x && y==c.y && z==c.z)
858  newprocs[num++] = pe;
859  }
860  }
861  CmiAssert(newprocs.size() == procs.size());
862  procs = newprocs;
863  }
864 
865  int find_level_grid(int x)
866  {
867  int a = sqrt(x);
868  int b;
869  for (; a>0; a--) {
870  if (x%a == 0) break;
871  }
872  if (a==1) a = x;
873  b = x/a;
874  //return a>b?a:b;
875  return b;
876  }
877  CmiNodeLock tmgr_lock;
878 #endif
879 
880 void Pme_init()
881 {
882 #if USE_TOPO_SFC
883  if (CkMyRank() == 0)
884  tmgr_lock = CmiCreateLock();
885 #endif
886 }
887 
888 void ComputePmeMgr::initialize(CkQdMsg *msg) {
889  delete msg;
890 
891  localInfo = new LocalPmeInfo[CkNumPes()];
892  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
893  transNodeInfo = new NodePmeInfo[CkNumNodes()];
894  gridPeMap = new int[CkNumPes()];
895  transPeMap = new int[CkNumPes()];
896  recipPeDest = new int[CkNumPes()];
897  gridPeOrder = new int[CkNumPes()];
898  gridNodeOrder = new int[CkNumNodes()];
899  transNodeOrder = new int[CkNumNodes()];
900 
901  if (CkMyRank() == 0) {
902  pencilPMEProcessors = new char [CkNumPes()];
903  memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
904  }
905 
906  SimParameters *simParams = Node::Object()->simParameters;
907  PatchMap *patchMap = PatchMap::Object();
908 
909  offload = simParams->PMEOffload;
910 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
911  if ( offload && ! deviceCUDA->one_device_per_node() ) {
912  NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
913  }
914  if ( offload ) {
915  int dev;
916  cudaGetDevice(&dev);
917  cuda_errcheck("in cudaGetDevice");
918  if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
919  cudaDeviceProp deviceProp;
920  cudaGetDeviceProperties(&deviceProp, dev);
921  cuda_errcheck("in cudaGetDeviceProperties");
922  if ( deviceProp.major < 2 )
923  NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
924  }
925 #endif
926 
927  alchLambda = -1.; // illegal value to catch if not updated
928  alchLambda2 = -1.;
929  useBarrier = simParams->PMEBarrier;
930 
931  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
932  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
933  else {
934  int nrps = simParams->PMEProcessors;
935  if ( nrps <= 0 ) nrps = CkNumPes();
936  if ( nrps > CkNumPes() ) nrps = CkNumPes();
937  int dimx = simParams->PMEGridSizeX;
938  int dimy = simParams->PMEGridSizeY;
939  int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
940  if ( maxslabs > nrps ) maxslabs = nrps;
941  int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
942  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
943  if ( maxpencils > nrps ) maxpencils = nrps;
944  if ( maxpencils > 3 * maxslabs ) usePencils = 1;
945  else usePencils = 0;
946  }
947 
948  if ( usePencils ) {
949  int nrps = simParams->PMEProcessors;
950  if ( nrps <= 0 ) nrps = CkNumPes();
951  if ( nrps > CkNumPes() ) nrps = CkNumPes();
952  if ( simParams->PMEPencils > 1 &&
953  simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
954  xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
955  } else {
956  int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
957  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
958  if ( nb2 > nrps ) nb2 = nrps;
959  if ( nb2 < 1 ) nb2 = 1;
960  int nb = (int) sqrt((float)nb2);
961  if ( nb < 1 ) nb = 1;
962  xBlocks = zBlocks = nb;
963  yBlocks = nb2 / nb;
964  }
965 
966  if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
967  if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
968  if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
969 
970  int dimx = simParams->PMEGridSizeX;
971  int bx = 1 + ( dimx - 1 ) / xBlocks;
972  xBlocks = 1 + ( dimx - 1 ) / bx;
973 
974  int dimy = simParams->PMEGridSizeY;
975  int by = 1 + ( dimy - 1 ) / yBlocks;
976  yBlocks = 1 + ( dimy - 1 ) / by;
977 
978  int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
979  int bz = 1 + ( dimz - 1 ) / zBlocks;
980  zBlocks = 1 + ( dimz - 1 ) / bz;
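 // Worked example (hypothetical sizes): for PMEGridSizeZ=108 the complex
 // z dimension is dimz = 108/2+1 = 55; a request of zBlocks=4 gives
 // bz = 1+(55-1)/4 = 14 and then zBlocks = 1+(55-1)/14 = 4, so each pencil
 // holds at most 14 of the 55 complex planes and no pencil is left empty.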
981 
982  if ( xBlocks * yBlocks > CkNumPes() ) {
983  NAMD_die("PME pencils xBlocks * yBlocks > numPes");
984  }
985  if ( xBlocks * zBlocks > CkNumPes() ) {
986  NAMD_die("PME pencils xBlocks * zBlocks > numPes");
987  }
988  if ( yBlocks * zBlocks > CkNumPes() ) {
989  NAMD_die("PME pencils yBlocks * zBlocks > numPes");
990  }
991 
992  if ( ! CkMyPe() ) {
993  iout << iINFO << "PME using " << xBlocks << " x " <<
994  yBlocks << " x " << zBlocks <<
995  " pencil grid for FFT and reciprocal sum.\n" << endi;
996  }
997  } else { // usePencils
998 
999  { // decide how many pes to use for reciprocal sum
1000 
1001  // rules based on work available
1002  int minslices = simParams->PMEMinSlices;
1003  int dimx = simParams->PMEGridSizeX;
1004  int nrpx = ( dimx + minslices - 1 ) / minslices;
1005  int dimy = simParams->PMEGridSizeY;
1006  int nrpy = ( dimy + minslices - 1 ) / minslices;
1007 
1008  // rules based on processors available
1009  int nrpp = CkNumPes();
1010  // if ( nrpp > 32 ) nrpp = 32; // cap to limit messages
1011  if ( nrpp < nrpx ) nrpx = nrpp;
1012  if ( nrpp < nrpy ) nrpy = nrpp;
1013 
1014  // user override
1015  int nrps = simParams->PMEProcessors;
1016  if ( nrps > CkNumPes() ) nrps = CkNumPes();
1017  if ( nrps > 0 ) nrpx = nrps;
1018  if ( nrps > 0 ) nrpy = nrps;
1019 
1020  // make sure there aren't any totally empty processors
1021  int bx = ( dimx + nrpx - 1 ) / nrpx;
1022  nrpx = ( dimx + bx - 1 ) / bx;
1023  int by = ( dimy + nrpy - 1 ) / nrpy;
1024  nrpy = ( dimy + by - 1 ) / by;
1025  if ( bx != ( dimx + nrpx - 1 ) / nrpx )
1026  NAMD_bug("Error in selecting number of PME processors.");
1027  if ( by != ( dimy + nrpy - 1 ) / nrpy )
1028  NAMD_bug("Error in selecting number of PME processors.");
1029 
1030  numGridPes = nrpx;
1031  numTransPes = nrpy;
1032  }
1033  if ( ! CkMyPe() ) {
1034  iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
1035  " processors for FFT and reciprocal sum.\n" << endi;
1036  }
1037 
1038  int sum_npes = numTransPes + numGridPes;
1039  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
1040 
1041 #if 0 // USE_TOPOMAP
1042  /* This code is being disabled permanently for slab PME on Blue Gene machines */
1043  PatchMap * pmap = PatchMap::Object();
1044 
1045  int patch_pes = pmap->numNodesWithPatches();
1046  TopoManager tmgr;
1047  if(tmgr.hasMultipleProcsPerNode())
1048  patch_pes *= 2;
1049 
1050  bool done = false;
1051  if(CkNumPes() > 2*sum_npes + patch_pes) {
1052  done = generateBGLORBPmePeList(transPeMap, numTransPes);
1053  done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
1054  }
1055  else
1056  if(CkNumPes() > 2 *max_npes + patch_pes) {
1057  done = generateBGLORBPmePeList(transPeMap, max_npes);
1058  gridPeMap = transPeMap;
1059  }
1060 
1061  if (!done)
1062 #endif
1063  {
1064  //generatePmePeList(transPeMap, max_npes);
1065  //gridPeMap = transPeMap;
1066  generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
1067  }
1068 
1069  if ( ! CkMyPe() ) {
1070  iout << iINFO << "PME GRID LOCATIONS:";
1071  int i;
1072  for ( i=0; i<numGridPes && i<10; ++i ) {
1073  iout << " " << gridPeMap[i];
1074  }
1075  if ( i < numGridPes ) iout << " ...";
1076  iout << "\n" << endi;
1077  iout << iINFO << "PME TRANS LOCATIONS:";
1078  for ( i=0; i<numTransPes && i<10; ++i ) {
1079  iout << " " << transPeMap[i];
1080  }
1081  if ( i < numTransPes ) iout << " ...";
1082  iout << "\n" << endi;
1083  }
1084 
1085  // sort based on nodes and physical nodes
1086  std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
1087 
1088  myGridPe = -1;
1089  myGridNode = -1;
1090  int i = 0;
1091  int node = -1;
1092  int real_node = -1;
1093  for ( i=0; i<numGridPes; ++i ) {
1094  if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
1095  if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
1096  int real_node_i = CkNodeOf(gridPeMap[i]);
1097  if ( real_node_i == real_node ) {
1098  gridNodeInfo[node].npe += 1;
1099  } else {
1100  real_node = real_node_i;
1101  ++node;
1102  gridNodeInfo[node].real_node = real_node;
1103  gridNodeInfo[node].pe_start = i;
1104  gridNodeInfo[node].npe = 1;
1105  }
1106  if ( CkMyNode() == real_node_i ) myGridNode = node;
1107  }
1108  numGridNodes = node + 1;
1109  myTransPe = -1;
1110  myTransNode = -1;
1111  node = -1;
1112  real_node = -1;
1113  for ( i=0; i<numTransPes; ++i ) {
1114  if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
1115  if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
1116  int real_node_i = CkNodeOf(transPeMap[i]);
1117  if ( real_node_i == real_node ) {
1118  transNodeInfo[node].npe += 1;
1119  } else {
1120  real_node = real_node_i;
1121  ++node;
1122  transNodeInfo[node].real_node = real_node;
1123  transNodeInfo[node].pe_start = i;
1124  transNodeInfo[node].npe = 1;
1125  }
1126  if ( CkMyNode() == real_node_i ) myTransNode = node;
1127  }
1128  numTransNodes = node + 1;
1129 
1130  if ( ! CkMyPe() ) {
1131  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
1132  << numTransNodes << " TRANS NODES\n" << endi;
1133  }
1134 
1135  { // generate random orderings for grid and trans messages
1136  int i;
1137  for ( i = 0; i < numGridPes; ++i ) {
1138  gridPeOrder[i] = i;
1139  }
1140  Random rand(CkMyPe());
1141  if ( myGridPe < 0 ) {
1142  rand.reorder(gridPeOrder,numGridPes);
1143  } else { // self last
1144  gridPeOrder[myGridPe] = numGridPes-1;
1145  gridPeOrder[numGridPes-1] = myGridPe;
1146  rand.reorder(gridPeOrder,numGridPes-1);
1147  }
1148  for ( i = 0; i < numGridNodes; ++i ) {
1149  gridNodeOrder[i] = i;
1150  }
1151  if ( myGridNode < 0 ) {
1152  rand.reorder(gridNodeOrder,numGridNodes);
1153  } else { // self last
1154  gridNodeOrder[myGridNode] = numGridNodes-1;
1155  gridNodeOrder[numGridNodes-1] = myGridNode;
1156  rand.reorder(gridNodeOrder,numGridNodes-1);
1157  }
1158  for ( i = 0; i < numTransNodes; ++i ) {
1159  transNodeOrder[i] = i;
1160  }
1161  if ( myTransNode < 0 ) {
1162  rand.reorder(transNodeOrder,numTransNodes);
1163  } else { // self last
1164  transNodeOrder[myTransNode] = numTransNodes-1;
1165  transNodeOrder[numTransNodes-1] = myTransNode;
1166  rand.reorder(transNodeOrder,numTransNodes-1);
1167  }
1168  }
1169 
1170  } // ! usePencils
1171 
1172  myGrid.K1 = simParams->PMEGridSizeX;
1173  myGrid.K2 = simParams->PMEGridSizeY;
1174  myGrid.K3 = simParams->PMEGridSizeZ;
1175  myGrid.order = simParams->PMEInterpOrder;
1176  myGrid.dim2 = myGrid.K2;
1177  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
1178 
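 // dim3 is the padded real-space z stride required for an in-place
 // real-to-complex FFT: K3 reals transform to K3/2+1 complex values, which
 // occupy 2*(K3/2+1) floats. E.g. K3=64 stores 66 floats per (x,y) column,
 // of which only the first 64 are charge data on input.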
1179  if ( ! usePencils ) {
1180  myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
1181  myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
1182  myGrid.block3 = myGrid.dim3 / 2; // complex
1183  }
1184 
1185  if ( usePencils ) {
1186  myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
1187  myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
1188  myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks; // complex
1189 
1190 
1191  int pe = 0;
1192  int x,y,z;
1193 
1194  SortableResizeArray<int> zprocs(xBlocks*yBlocks);
1195  SortableResizeArray<int> yprocs(xBlocks*zBlocks);
1196  SortableResizeArray<int> xprocs(yBlocks*zBlocks);
1197 
1198  // decide which pes to use by bit reversal and patch use
1199  int i;
1200  int ncpus = CkNumPes();
1201  SortableResizeArray<int> patches, nopatches, pmeprocs;
1202  PatchMap *pmap = PatchMap::Object();
1203  for ( int icpu=0; icpu<ncpus; ++icpu ) {
1204  int ri = WorkDistrib::peDiffuseOrdering[icpu];
1205  if ( ri ) { // keep 0 for special case
1206  // pretend pe 1 has patches to avoid placing extra PME load on node
1207  if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
1208  else nopatches.add(ri);
1209  }
1210  }
1211 
1212 #if USE_RANDOM_TOPO
1213  Random rand(CkMyPe());
1214  int *tmp = new int[patches.size()];
1215  int nn = patches.size();
1216  for (i=0;i<nn;i++) tmp[i] = patches[i];
1217  rand.reorder(tmp, nn);
1218  patches.resize(0);
1219  for (i=0;i<nn;i++) patches.add(tmp[i]);
1220  delete [] tmp;
1221  tmp = new int[nopatches.size()];
1222  nn = nopatches.size();
1223  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
1224  rand.reorder(tmp, nn);
1225  nopatches.resize(0);
1226  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
1227  delete [] tmp;
1228 #endif
1229 
1230  // only use zero if it eliminates overloading or has patches
1231  int useZero = 0;
1232  int npens = xBlocks*yBlocks;
1233  if ( npens % ncpus == 0 ) useZero = 1;
1234  if ( npens == nopatches.size() + 1 ) useZero = 1;
1235  npens += xBlocks*zBlocks;
1236  if ( npens % ncpus == 0 ) useZero = 1;
1237  if ( npens == nopatches.size() + 1 ) useZero = 1;
1238  npens += yBlocks*zBlocks;
1239  if ( npens % ncpus == 0 ) useZero = 1;
1240  if ( npens == nopatches.size() + 1 ) useZero = 1;
1241 
1242  // add nopatches then patches in reversed order
1243  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
1244  if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1245  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
1246  if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1247 
1248  int npes = pmeprocs.size();
1249  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
1250  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
1251 #if !USE_RANDOM_TOPO
1252  zprocs.sort();
1253 #endif
1254  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
1255  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
1256 #if !USE_RANDOM_TOPO
1257  yprocs.sort();
1258 #endif
1259  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
1260  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
1261 #if !USE_RANDOM_TOPO
1262  xprocs.sort();
1263 #endif
1264 
1265 #if USE_TOPO_SFC
1266  CmiLock(tmgr_lock);
1267  //{
1268  TopoManager tmgr;
1269  int xdim = tmgr.getDimNX();
1270  int ydim = tmgr.getDimNY();
1271  int zdim = tmgr.getDimNZ();
1272  int xdim1 = find_level_grid(xdim);
1273  int ydim1 = find_level_grid(ydim);
1274  int zdim1 = find_level_grid(zdim);
1275  if(CkMyPe() == 0)
1276  printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
1277 
1278  vector<Coord> result;
1279  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
1280  sort_sfc(xprocs, tmgr, result);
1281  sort_sfc(yprocs, tmgr, result);
1282  sort_sfc(zprocs, tmgr, result);
1283  //}
1284  CmiUnlock(tmgr_lock);
1285 #endif
1286 
1287 
1288  if(CkMyPe() == 0){
1289  iout << iINFO << "PME Z PENCIL LOCATIONS:";
1290  for ( i=0; i<zprocs.size() && i<10; ++i ) {
1291 #if USE_TOPO_SFC
1292  int x,y,z,t;
1293  tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
1294  iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
1295 #else
1296  iout << " " << zprocs[i];
1297 #endif
1298  }
1299  if ( i < zprocs.size() ) iout << " ...";
1300  iout << "\n" << endi;
1301  }
1302 
1303  if (CkMyRank() == 0) {
1304  for (pe=0, x = 0; x < xBlocks; ++x)
1305  for (y = 0; y < yBlocks; ++y, ++pe ) {
1306  pencilPMEProcessors[zprocs[pe]] = 1;
1307  }
1308  }
1309 
1310  if(CkMyPe() == 0){
1311  iout << iINFO << "PME Y PENCIL LOCATIONS:";
1312  for ( i=0; i<yprocs.size() && i<10; ++i ) {
1313 #if USE_TOPO_SFC
1314  int x,y,z,t;
1315  tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
1316  iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
1317 #else
1318  iout << " " << yprocs[i];
1319 #endif
1320  }
1321  if ( i < yprocs.size() ) iout << " ...";
1322  iout << "\n" << endi;
1323  }
1324 
1325  if (CkMyRank() == 0) {
1326  for (pe=0, z = 0; z < zBlocks; ++z )
1327  for (x = 0; x < xBlocks; ++x, ++pe ) {
1328  pencilPMEProcessors[yprocs[pe]] = 1;
1329  }
1330  }
1331 
1332  if(CkMyPe() == 0){
1333  iout << iINFO << "PME X PENCIL LOCATIONS:";
1334  for ( i=0; i<xprocs.size() && i<10; ++i ) {
1335 #if USE_TOPO_SFC
1336  int x,y,z,t;
1337  tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
1338  iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
1339 #else
1340  iout << " " << xprocs[i];
1341 #endif
1342  }
1343  if ( i < xprocs.size() ) iout << " ...";
1344  iout << "\n" << endi;
1345  }
1346 
1347  if (CkMyRank() == 0) {
1348  for (pe=0, y = 0; y < yBlocks; ++y )
1349  for (z = 0; z < zBlocks; ++z, ++pe ) {
1350  pencilPMEProcessors[xprocs[pe]] = 1;
1351  }
1352  }
1353 
1354 
1355  // creating the pencil arrays
1356  if ( CkMyPe() == 0 ){
1357 #if !USE_RANDOM_TOPO
1358  // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
1359  WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
1360  std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
1361  std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
1362 #endif
1363 #if 1
1364  CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
1365  CProxy_PmePencilMap ym;
1366  if ( simParams->PMEPencilsYLayout )
1367  ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
1368  else
1369  ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
1370  CProxy_PmePencilMap xm;
1371  if ( simParams->PMEPencilsXLayout )
1372  xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
1373  else
1374  xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
1375  pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
1376  CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
1377  CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
1378  CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
1379  zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
1380  yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
1381  xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
1382  zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
1383  yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
1384  xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
1385 #else
1386  zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
1387  yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
1388  xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);
1389 
1390  for (pe=0, x = 0; x < xBlocks; ++x)
1391  for (y = 0; y < yBlocks; ++y, ++pe ) {
1392  zPencil(x,y,0).insert(zprocs[pe]);
1393  }
1394  zPencil.doneInserting();
1395 
1396  for (pe=0, x = 0; x < xBlocks; ++x)
1397  for (z = 0; z < zBlocks; ++z, ++pe ) {
1398  yPencil(x,0,z).insert(yprocs[pe]);
1399  }
1400  yPencil.doneInserting();
1401 
1402 
1403  for (pe=0, y = 0; y < yBlocks; ++y )
1404  for (z = 0; z < zBlocks; ++z, ++pe ) {
1405  xPencil(0,y,z).insert(xprocs[pe]);
1406  }
1407  xPencil.doneInserting();
1408 #endif
1409 
1410  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
1411  PmePencilInitMsgData msgdata;
1412  msgdata.grid = myGrid;
1413  msgdata.xBlocks = xBlocks;
1414  msgdata.yBlocks = yBlocks;
1415  msgdata.zBlocks = zBlocks;
1416  msgdata.xPencil = xPencil;
1417  msgdata.yPencil = yPencil;
1418  msgdata.zPencil = zPencil;
1419  msgdata.pmeProxy = pmeProxyDir;
1420  msgdata.pmeNodeProxy = pmeNodeProxy;
1421  msgdata.xm = xm;
1422  msgdata.ym = ym;
1423  msgdata.zm = zm;
1424  xPencil.init(new PmePencilInitMsg(msgdata));
1425  yPencil.init(new PmePencilInitMsg(msgdata));
1426  zPencil.init(new PmePencilInitMsg(msgdata));
1427  }
1428 
1429  return; // continue in initialize_pencils() at next startup stage
1430  }
1431 
1432 
1433  int pe;
1434  int nx = 0;
1435  for ( pe = 0; pe < numGridPes; ++pe ) {
1436  localInfo[pe].x_start = nx;
1437  nx += myGrid.block1;
1438  if ( nx > myGrid.K1 ) nx = myGrid.K1;
1439  localInfo[pe].nx = nx - localInfo[pe].x_start;
1440  }
1441  int ny = 0;
1442  for ( pe = 0; pe < numTransPes; ++pe ) {
1443  localInfo[pe].y_start_after_transpose = ny;
1444  ny += myGrid.block2;
1445  if ( ny > myGrid.K2 ) ny = myGrid.K2;
1446  localInfo[pe].ny_after_transpose =
1447  ny - localInfo[pe].y_start_after_transpose;
1448  }
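 // Worked example (hypothetical sizes): K1=100 on numGridPes=3 gives
 // block1 = (100+3-1)/3 = 34, so the slabs are x = [0,34), [34,68), [68,100)
 // with nx = 34, 34, 32; the clamp to K1 above is what shortens the last slab.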
1449 
1450  { // decide how many pes this node exchanges charges with
1451 
1452  PatchMap *patchMap = PatchMap::Object();
1453  Lattice lattice = simParams->lattice;
1454  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1455  BigReal cutoff = simParams->cutoff;
1456  BigReal patchdim = simParams->patchDimension;
1457  int numPatches = patchMap->numPatches();
1458  int numNodes = CkNumPes();
1459  int *source_flags = new int[numNodes];
1460  int node;
1461  for ( node=0; node<numNodes; ++node ) {
1462  source_flags[node] = 0;
1463  recipPeDest[node] = 0;
1464  }
1465 
1466  // // make sure that we don't get ahead of ourselves on this node
1467  // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
1468  // source_flags[CkMyPe()] = 1;
1469  // recipPeDest[myRecipPe] = 1;
1470  // }
1471 
1472  for ( int pid=0; pid < numPatches; ++pid ) {
1473  int pnode = patchMap->node(pid);
1474 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1475  if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
1476 #endif
1477  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1478  BigReal minx = patchMap->min_a(pid);
1479  BigReal maxx = patchMap->max_a(pid);
1480  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1481  // min1 (max1) is smallest (largest) grid line for this patch
1482  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1483  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1484  for ( int i=min1; i<=max1; ++i ) {
1485  int ix = i;
1486  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1487  while ( ix < 0 ) ix += myGrid.K1;
1488  // set source_flags[pnode] if this patch sends to our node
1489  if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
1490  ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
1491  source_flags[pnode] = 1;
1492  }
1493  // set dest_flags[] for node that our patch sends to
1494 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1495  if ( offload ) {
1496  if ( pnode == CkNodeFirst(CkMyNode()) ) {
1497  recipPeDest[ix / myGrid.block1] = 1;
1498  }
1499  } else
1500 #endif
1501  if ( pnode == CkMyPe() ) {
1502  recipPeDest[ix / myGrid.block1] = 1;
1503  }
1504  }
1505  }
1506 
1507  int numSourcesSamePhysicalNode = 0;
1508  numSources = 0;
1509  numDestRecipPes = 0;
1510  for ( node=0; node<numNodes; ++node ) {
1511  if ( source_flags[node] ) ++numSources;
1512  if ( recipPeDest[node] ) ++numDestRecipPes;
1513  if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
1514  }
1515 
1516 #if 0
1517  if ( numSources ) {
1518  CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
1519  CkMyPe(), numSourcesSamePhysicalNode, numSources);
1520  iout << iINFO << "PME " << CkMyPe() << " sources:";
1521  for ( node=0; node<numNodes; ++node ) {
1522  if ( source_flags[node] ) iout << " " << node;
1523  }
1524  iout << "\n" << endi;
1525  }
1526 #endif
1527 
1528  delete [] source_flags;
1529 
1530  // CkPrintf("PME on node %d has %d sources and %d destinations\n",
1531  // CkMyPe(), numSources, numDestRecipPes);
1532 
1533  } // decide how many pes this node exchanges charges with (end)
1534 
1535  ungrid_count = numDestRecipPes;
1536 
1537  sendTransBarrier_received = 0;
1538 
1539  if ( myGridPe < 0 && myTransPe < 0 ) return;
1540  // the following only for nodes doing reciprocal sum
1541 
1542  if ( myTransPe >= 0 ) {
1543  recipEvirPe = findRecipEvirPe();
1544  pmeProxy[recipEvirPe].addRecipEvirClient();
1545  }
1546 
1547  if ( myTransPe >= 0 ) {
1548  int k2_start = localInfo[myTransPe].y_start_after_transpose;
1549  int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
1550  #ifdef OPENATOM_VERSION
1551  if ( simParams->openatomOn ) {
1552  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
1553  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
1554  } else {
1555  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1556  }
1557  #else // OPENATOM_VERSION
1558  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1559  #endif // OPENATOM_VERSION
1560  }
1561 
1562  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
1563  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
1564  if ( local_size < local_size_2 ) local_size = local_size_2;
1565  qgrid = new float[local_size*numGrids];
1566  if ( numGridPes > 1 || numTransPes > 1 ) {
1567  kgrid = new float[local_size*numGrids];
1568  } else {
1569  kgrid = qgrid;
1570  }
1571  qgrid_size = local_size;
1572 
1573  if ( myGridPe >= 0 ) {
1574  qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
1575  qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
1576  fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
1577  fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
1578  }
1579 
1580  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
1581 #ifdef NAMD_FFTW
1582  CmiLock(fftw_plan_lock);
1583 #ifdef NAMD_FFTW_3
1584  work = new fftwf_complex[n[0]];
1585  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
1586  if ( myGridPe >= 0 ) {
1587  forward_plan_yz=new fftwf_plan[numGrids];
1588  backward_plan_yz=new fftwf_plan[numGrids];
1589  }
1590  if ( myTransPe >= 0 ) {
1591  forward_plan_x=new fftwf_plan[numGrids];
1592  backward_plan_x=new fftwf_plan[numGrids];
1593  }
1594  /* need one plan per grid */
1595  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1596  if ( myGridPe >= 0 ) {
1597  for( int g=0; g<numGrids; g++)
1598  {
1599  forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
1600  localInfo[myGridPe].nx,
1601  qgrid + qgrid_size * g,
1602  NULL,
1603  1,
1604  myGrid.dim2 * myGrid.dim3,
1605  (fftwf_complex *)
1606  (qgrid + qgrid_size * g),
1607  NULL,
1608  1,
1609  myGrid.dim2 * (myGrid.dim3/2),
1610  fftwFlags);
1611  }
1612  }
1613  int zdim = myGrid.dim3;
1614  int xStride = myTransPe >= 0 ? localInfo[myTransPe].ny_after_transpose * ( myGrid.dim3 / 2 ) : 0;  // guard: localInfo[-1] on grid-only PEs
1615  if ( ! CkMyPe() ) iout << " 2..." << endi;
1616  if ( myTransPe >= 0 ) {
1617  for( int g=0; g<numGrids; g++)
1618  {
1619 
1620  forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1621  (fftwf_complex *)
1622  (kgrid+qgrid_size*g),
1623  NULL,
1624  xStride,
1625  1,
1626  (fftwf_complex *)
1627  (kgrid+qgrid_size*g),
1628  NULL,
1629  xStride,
1630  1,
1631  FFTW_FORWARD,fftwFlags);
1632 
1633  }
1634  }
1635  if ( ! CkMyPe() ) iout << " 3..." << endi;
1636  if ( myTransPe >= 0 ) {
1637  for( int g=0; g<numGrids; g++)
1638  {
1639  backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1640  (fftwf_complex *)
1641  (kgrid+qgrid_size*g),
1642  NULL,
1643  xStride,
1644  1,
1645  (fftwf_complex *)
1646  (kgrid+qgrid_size*g),
1647  NULL,
1648  xStride,
1649  1,
1650  FFTW_BACKWARD, fftwFlags);
1651 
1652  }
1653  }
1654  if ( ! CkMyPe() ) iout << " 4..." << endi;
1655  if ( myGridPe >= 0 ) {
1656  for( int g=0; g<numGrids; g++)
1657  {
1658  backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
1659  localInfo[myGridPe].nx,
1660  (fftwf_complex *)
1661  (qgrid + qgrid_size * g),
1662  NULL,
1663  1,
1664  myGrid.dim2*(myGrid.dim3/2),
1665  qgrid + qgrid_size * g,
1666  NULL,
1667  1,
1668  myGrid.dim2 * myGrid.dim3,
1669  fftwFlags);
1670  }
1671  }
1672  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1673 
1674 #else
1675  work = new fftw_complex[n[0]];
1676 
1677  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1678  if ( myGridPe >= 0 ) {
1679  forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
1680  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1681  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1682  }
1683  if ( ! CkMyPe() ) iout << " 2..." << endi;
1684  if ( myTransPe >= 0 ) {
1685  forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
1686  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1687  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1688  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1689  }
1690  if ( ! CkMyPe() ) iout << " 3..." << endi;
1691  if ( myTransPe >= 0 ) {
1692  backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
1693  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1694  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1695  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1696  }
1697  if ( ! CkMyPe() ) iout << " 4..." << endi;
1698  if ( myGridPe >= 0 ) {
1699  backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
1700  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1701  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1702  }
1703  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1704 #endif
1705  CmiUnlock(fftw_plan_lock);
1706 #else
1707  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
1708 #endif
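// The plans created above implement the slab decomposition of the 3-D FFT:
// grid PEs run 2-D real-to-complex FFTs over (y,z) for their nx slabs
// (forward_plan_yz), the data are transposed so trans PEs own contiguous y
// ranges, and each trans PE runs strided 1-D complex FFTs along x
// (forward_plan_x); the backward plans invert the same path. A minimal
// standalone sketch of the same fftwf_plan_many_dft_r2c call, with made-up
// sizes and buffer names, is:
//   int n[2] = {K2, K3};
//   fftwf_plan p = fftwf_plan_many_dft_r2c(2, n, nx,
//       real_in,  NULL, 1, K2 * 2*(K3/2+1),
//       cplx_out, NULL, 1, K2 * (K3/2+1), FFTW_ESTIMATE);
// i.e. nx independent K2 x K3 transforms, each spaced one padded slab apart.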
1709 
1710  if ( myGridPe >= 0 && numSources == 0 )
1711  NAMD_bug("PME grid elements exist without sources.");
1712  grid_count = numSources;
1713  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
1714  trans_count = numGridPes;
1715 }
1716 
1717 
1718 
1719 void ComputePmeMgr::initialize_pencils(CkQdMsg *msg) {
1720  delete msg;
1721  if ( ! usePencils ) return;
1722 
1723  SimParameters *simParams = Node::Object()->simParameters;
1724 
1725  PatchMap *patchMap = PatchMap::Object();
1726  Lattice lattice = simParams->lattice;
1727  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1728  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1729  BigReal cutoff = simParams->cutoff;
1730  BigReal patchdim = simParams->patchDimension;
1731  int numPatches = patchMap->numPatches();
1732 
1733  pencilActive = new char[xBlocks*yBlocks];
1734  for ( int i=0; i<xBlocks; ++i ) {
1735  for ( int j=0; j<yBlocks; ++j ) {
1736  pencilActive[i*yBlocks+j] = 0;
1737  }
1738  }
1739 
1740  for ( int pid=0; pid < numPatches; ++pid ) {
1741  int pnode = patchMap->node(pid);
1742 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1743  if ( offload ) {
1744  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1745  } else
1746 #endif
1747  if ( pnode != CkMyPe() ) continue;
1748 
1749  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1750  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1751 
1752  BigReal minx = patchMap->min_a(pid);
1753  BigReal maxx = patchMap->max_a(pid);
1754  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1755  // min1 (max1) is smallest (largest) grid line for this patch
1756  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1757  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1758 
1759  BigReal miny = patchMap->min_b(pid);
1760  BigReal maxy = patchMap->max_b(pid);
1761  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1762  // min2 (max2) is smallest (largest) grid line for this patch
1763  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1764  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1765 
1766  for ( int i=min1; i<=max1; ++i ) {
1767  int ix = i;
1768  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1769  while ( ix < 0 ) ix += myGrid.K1;
1770  for ( int j=min2; j<=max2; ++j ) {
1771  int jy = j;
1772  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1773  while ( jy < 0 ) jy += myGrid.K2;
1774  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1775  }
1776  }
1777  }
1778 
1779  numPencilsActive = 0;
1780  for ( int i=0; i<xBlocks; ++i ) {
1781  for ( int j=0; j<yBlocks; ++j ) {
1782  if ( pencilActive[i*yBlocks+j] ) {
1783  ++numPencilsActive;
1784 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1785  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1786 #endif
1787  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1788  }
1789  }
1790  }
1791  activePencils = new ijpair[numPencilsActive];
1792  numPencilsActive = 0;
1793  for ( int i=0; i<xBlocks; ++i ) {
1794  for ( int j=0; j<yBlocks; ++j ) {
1795  if ( pencilActive[i*yBlocks+j] ) {
1796  activePencils[numPencilsActive++] = ijpair(i,j);
1797  }
1798  }
1799  }
1800  if ( simParams->PMESendOrder ) {
1801  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1802  } else {
1803  Random rand(CkMyPe());
1804  rand.reorder(activePencils,numPencilsActive);
1805  }
1806  //if ( numPencilsActive ) {
1807  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1808  //}
1809 
1810  ungrid_count = numPencilsActive;
1811 }
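// Illustration (hypothetical numbers): a patch whose x extent maps to grid
// lines 60..75 with K1=64 wraps to ix = 60..63 then 0..11, so with block1=16
// it marks pencils in x-blocks 3 and 0 as active; the modular reduction in
// the loops above is what handles patches straddling the periodic boundary.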
1812 
1813 
1814 void ComputePmeMgr::activate_pencils(CkQdMsg *msg) {
1815  if ( ! usePencils ) return;
1816  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1817 }
1818 
1819 
1820 ComputePmeMgr::~ComputePmeMgr() {
1821 
1822  if ( CmiMyRank() == 0 ) {
1823  CmiDestroyLock(fftw_plan_lock);
1824  }
1825  CmiDestroyLock(pmemgr_lock);
1826 
1827  delete myKSpace;
1828  delete [] localInfo;
1829  delete [] gridNodeInfo;
1830  delete [] transNodeInfo;
1831  delete [] gridPeMap;
1832  delete [] transPeMap;
1833  delete [] recipPeDest;
1834  delete [] gridPeOrder;
1835  delete [] gridNodeOrder;
1836  delete [] transNodeOrder;
1837  delete [] qgrid;
1838  if ( kgrid != qgrid ) delete [] kgrid;
1839  delete [] work;
1840  delete [] gridmsg_reuse;
1841 
1842  if ( ! offload ) {
1843  for (int i=0; i<q_count; ++i) {
1844  delete [] q_list[i];
1845  }
1846  delete [] q_list;
1847  delete [] fz_arr;
1848  }
1849  delete [] f_arr;
1850  delete [] q_arr;
1851 }
1852 
1853 void ComputePmeMgr::recvGrid(PmeGridMsg *msg) {
1854  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1855  if ( grid_count == 0 ) {
1856  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1857  }
1858  if ( grid_count == numSources ) {
1859  lattice = msg->lattice;
1860  grid_sequence = msg->sequence;
1861  }
1862 
1863  int zdim = myGrid.dim3;
1864  int zlistlen = msg->zlistlen;
1865  int *zlist = msg->zlist;
1866  float *qmsg = msg->qgrid;
1867  for ( int g=0; g<numGrids; ++g ) {
1868  char *f = msg->fgrid + fgrid_len * g;
1869  float *q = qgrid + qgrid_size * g;
1870  for ( int i=0; i<fgrid_len; ++i ) {
1871  if ( f[i] ) {
1872  for ( int k=0; k<zlistlen; ++k ) {
1873  q[zlist[k]] += *(qmsg++);
1874  }
1875  }
1876  q += zdim;
1877  }
1878  }
1879 
1880  gridmsg_reuse[numSources-grid_count] = msg;
1881  --grid_count;
1882 
1883  if ( grid_count == 0 ) {
1884  pmeProxyDir[CkMyPe()].gridCalc1();
1885  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1886  }
1887 }
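// The PmeGridMsg payload is sparse: fgrid holds one flag per (x,y) column of
// the sender's slab and zlist the z indices occupied by at least one charge,
// so qgrid carries zlistlen floats for each flagged column only. The loop
// above scatters those floats back into the dense K2 x dim3 slab, which is
// why q advances by zdim per column while qmsg advances only when f[i] is set.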
1888 #ifdef MANUAL_DEBUG_FFTW3
1889 
1890 /* utility functions for manual debugging */
1891 void dumpMatrixFloat(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int pe)
1892 {
1893 
1894  char fmt[1000];
1895  char filename[1000];
1896  strncpy(fmt,infilename,999);
1897  strncat(fmt,"_%d.out",999);
1898  sprintf(filename,fmt, pe);
1899  FILE *loutfile = fopen(filename, "w");
1900 #ifdef PAIRCALC_TEST_DUMP
1901  fprintf(loutfile,"%d\n",ydim);
1902 #endif
1903  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
1904  for(int i=0;i<xdim;i++)
1905  for(int j=0;j<ydim;j++)
1906  for(int k=0;k<zdim;k++)
1907  fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
1908  fclose(loutfile);
1909 
1910 }
1911 
1912 void dumpMatrixFloat3(const char *infilename, float *matrix, int xdim, int ydim, int zdim,int x, int y, int z)
1913 {
1914  char fmt[1000];
1915  char filename[1000];
1916  strncpy(fmt,infilename,999);
1917  strncat(fmt,"_%d_%d_%d.out",999);
1918  sprintf(filename,fmt, x,y,z);
1919  FILE *loutfile = fopen(filename, "w");
1920  CkAssert(loutfile!=NULL);
1921  CkPrintf("opened %s for dump\n",filename);
1922  fprintf(loutfile,"%d %d %d\n",xdim,ydim, zdim);
1923  for(int i=0;i<xdim;i++)
1924  for(int j=0;j<ydim;j++)
1925  for(int k=0;k<zdim;k++)
1926  fprintf(loutfile,"%d %d %d %.8f\n",i,j,k,matrix[i*zdim*ydim+j*zdim +k]);
1927  fclose(loutfile);
1928 }
1929 
1930 #endif
1931 
1932 void ComputePmeMgr::gridCalc1(void) {
1933  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
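 // Forward real-to-complex FFT over the y and z dimensions of this PE's
 // x slab; the x dimension is transformed after the transpose, in gridCalc2.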
1934 
1935 #ifdef NAMD_FFTW
1936  for ( int g=0; g<numGrids; ++g ) {
1937 #ifdef NAMD_FFTW_3
1938  fftwf_execute(forward_plan_yz[g]);
1939 #else
1940  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1941  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1942 #endif
1943 
1944  }
1945 #endif
1946 
1947  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1948 }
1949 
1950 void ComputePmeMgr::sendTransBarrier(void) {
1951  sendTransBarrier_received += 1;
1952  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1953  if ( sendTransBarrier_received < numGridPes ) return;
1954  sendTransBarrier_received = 0;
1955  for ( int i=0; i<numGridPes; ++i ) {
1956  pmeProxyDir[gridPeMap[i]].sendTrans();
1957  }
1958 }
1959 
1960 static inline void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param) {
1961  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
1962  mgr->sendTransSubset(first, last);
1963 }
1964 
1965 void ComputePmeMgr::sendTrans(void) {
1966 
1967  untrans_count = numTransPes;
1968 
1969 #if CMK_SMP && USE_CKLOOP
1970  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1971  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1972  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1973  } else
1974 #endif
1975  {
1976  sendTransSubset(0, numTransNodes-1);
1977  }
1978 
1979 }
1980 
1981 void ComputePmeMgr::sendTransSubset(int first, int last) {
1982  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1983 
1984  // send data for transpose
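 // For each transpose node, pack the strips of qgrid owned by that node's
 // PEs into one message; strips that stay on this node are copied straight
 // into the destination manager's kgrid and sent with nx == 0 instead.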
1985  int zdim = myGrid.dim3;
1986  int nx = localInfo[myGridPe].nx;
1987  int x_start = localInfo[myGridPe].x_start;
1988  int slicelen = myGrid.K2 * zdim;
1989 
1990  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1991 
1992 #if CMK_BLUEGENEL
1993  CmiNetworkProgressAfter (0);
1994 #endif
1995 
1996  for (int j=first; j<=last; j++) {
1997  int node = transNodeOrder[j]; // different order on each node
1998  int pe = transNodeInfo[node].pe_start;
1999  int npe = transNodeInfo[node].npe;
2000  int totlen = 0;
2001  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
2002  LocalPmeInfo &li = localInfo[pe];
2003  int cpylen = li.ny_after_transpose * zdim;
2004  totlen += cpylen;
2005  }
2006  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
2007  PRIORITY_SIZE) PmeTransMsg;
2008  newmsg->sourceNode = myGridPe;
2009  newmsg->lattice = lattice;
2010  newmsg->x_start = x_start;
2011  newmsg->nx = nx;
2012  for ( int g=0; g<numGrids; ++g ) {
2013  float *qmsg = newmsg->qgrid + nx * totlen * g;
2014  pe = transNodeInfo[node].pe_start;
2015  for (int i=0; i<npe; ++i, ++pe) {
2016  LocalPmeInfo &li = localInfo[pe];
2017  int cpylen = li.ny_after_transpose * zdim;
2018  if ( node == myTransNode ) {
2019  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2020  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2021  }
2022  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2023  for ( int x = 0; x < nx; ++x ) {
2024  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2025  q += slicelen;
2026  qmsg += cpylen;
2027  }
2028  }
2029  }
2030  newmsg->sequence = grid_sequence;
2031  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2032  if ( node == myTransNode ) newmsg->nx = 0;
2033  if ( npe > 1 ) {
2034  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2035  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2036  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2037  }
2038 }
2039 
2040 void ComputePmeMgr::fwdSharedTrans(PmeTransMsg *msg) {
2041  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2042  int pe = transNodeInfo[myTransNode].pe_start;
2043  int npe = transNodeInfo[myTransNode].npe;
2044  CmiNodeLock lock = CmiCreateLock();
2045  int *count = new int; *count = npe;
2046  for (int i=0; i<npe; ++i, ++pe) {
2047  PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
2048  SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
2049  shmsg->msg = msg;
2050  shmsg->count = count;
2051  shmsg->lock = lock;
2052  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2053  }
2054 }
2055 
2056 void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *msg) {
2057  procTrans(msg->msg);
2058  CmiLock(msg->lock);
2059  int count = --(*msg->count);
2060  CmiUnlock(msg->lock);
2061  if ( count == 0 ) {
2062  CmiDestroyLock(msg->lock);
2063  delete msg->count;
2064  delete msg->msg;
2065  }
2066  delete msg;
2067 }
2068 
2069 void ComputePmeMgr::recvTrans(PmeTransMsg *msg) {
2070  procTrans(msg);
2071  delete msg;
2072 }
2073 
2074 void ComputePmeMgr::procTrans(PmeTransMsg *msg) {
2075  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
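 // Copy the received x strip into kgrid at this PE's y offset; once strips
 // from all numGridPes grid PEs have arrived, continue with gridCalc2.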
2076  if ( trans_count == numGridPes ) {
2077  lattice = msg->lattice;
2078  grid_sequence = msg->sequence;
2079  }
2080 
2081  if ( msg->nx ) {
2082  int zdim = myGrid.dim3;
2083  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2084  int first_pe = nodeInfo.pe_start;
2085  int last_pe = first_pe+nodeInfo.npe-1;
2086  int y_skip = localInfo[myTransPe].y_start_after_transpose
2087  - localInfo[first_pe].y_start_after_transpose;
2088  int ny_msg = localInfo[last_pe].y_start_after_transpose
2089  + localInfo[last_pe].ny_after_transpose
2090  - localInfo[first_pe].y_start_after_transpose;
2091  int ny = localInfo[myTransPe].ny_after_transpose;
2092  int x_start = msg->x_start;
2093  int nx = msg->nx;
2094  for ( int g=0; g<numGrids; ++g ) {
2095  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2096  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2097  nx*ny*zdim*sizeof(float));
2098  }
2099  }
2100 
2101  --trans_count;
2102 
2103  if ( trans_count == 0 ) {
2104  pmeProxyDir[CkMyPe()].gridCalc2();
2105  }
2106 }
2107 
2108 void ComputePmeMgr::gridCalc2(void) {
2109  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2110 
2111 #if CMK_BLUEGENEL
2112  CmiNetworkProgressAfter (0);
2113 #endif
2114 
2115  int zdim = myGrid.dim3;
2116  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2117  int ny = localInfo[myTransPe].ny_after_transpose;
2118 
2119  for ( int g=0; g<numGrids; ++g ) {
2120  // finish forward FFT (x dimension)
2121 #ifdef NAMD_FFTW
2122 #ifdef NAMD_FFTW_3
2123  fftwf_execute(forward_plan_x[g]);
2124 #else
2125  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2126  ny * zdim / 2, 1, work, 1, 0);
2127 #endif
2128 #endif
2129  }
2130 
2131 #ifdef OPENATOM_VERSION
2132  if ( ! simParams -> openatomOn ) {
2133 #endif // OPENATOM_VERSION
2134  gridCalc2R();
2135 #ifdef OPENATOM_VERSION
2136  } else {
2137  gridCalc2Moa();
2138  }
2139 #endif // OPENATOM_VERSION
2140 }
2141 
2142 #ifdef OPENATOM_VERSION
2143 void ComputePmeMgr::gridCalc2Moa(void) {
2144 
2145  int zdim = myGrid.dim3;
2146  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2147  int ny = localInfo[myTransPe].ny_after_transpose;
2148 
2150 
2151  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
2152 
2153  for ( int g=0; g<numGrids; ++g ) {
2154  #ifdef OPENATOM_VERSION_DEBUG
2155  CkPrintf("Sending recQ on processor %d \n", CkMyPe());
2156  for ( int i=0; i<=(ny * zdim / 2); ++i)
2157  {
2158  CkPrintf("PE, g,fftw_q,k*q*g, kgrid, qgrid_size value %d pre-send = %d, %d, %f %f, %d, \n", i, CkMyPe(), g, (kgrid+qgrid_size*g)[i], kgrid[i], qgrid_size);
2159  }
2160  #endif // OPENATOM_VERSION_DEBUG
2161 // mqcpProxy[CkMyPe()].recvQ((ny * zdim / 2),((fftw_complex *)(kgrid+qgrid_size*g)));
2162  CkCallback resumePme(CkIndex_ComputePmeMgr::gridCalc2R(), thishandle);
2163  moaProxy[CkMyPe()].recvQ(g,numGrids,(ny * zdim / 2),(kgrid+qgrid_size*g), resumePme);
2164  }
2165 }
2166 #endif // OPENATOM_VERSION
2167 
2168 void ComputePmeMgr::gridCalc2R(void) {
2169 
2170  int useCkLoop = 0;
2171 #if CMK_SMP && USE_CKLOOP
2172  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2173  && CkNumPes() >= 2 * numTransPes ) {
2174  useCkLoop = 1;
2175  }
2176 #endif
2177 
2178  int zdim = myGrid.dim3;
2179  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2180  int ny = localInfo[myTransPe].ny_after_transpose;
2181 
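 // For each grid, compute_energy applies the reciprocal-space convolution
 // to kgrid in place and accumulates energy and virial into recip_evir2[g];
 // the backward FFT along x is then started on the convolved data.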
2182  for ( int g=0; g<numGrids; ++g ) {
2183  // reciprocal space portion of PME
2184  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
2185  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2186  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2187  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2188 
2189  // start backward FFT (x dimension)
2190 
2191 #ifdef NAMD_FFTW
2192 #ifdef NAMD_FFTW_3
2193  fftwf_execute(backward_plan_x[g]);
2194 #else
2195  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2196  ny * zdim / 2, 1, work, 1, 0);
2197 #endif
2198 #endif
2199  }
2200 
2201  pmeProxyDir[CkMyPe()].sendUntrans();
2202 }
2203 
2204 static inline void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param) {
2205  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2206  mgr->sendUntransSubset(first, last);
2207 }
2208 
2209 void ComputePmeMgr::sendUntrans(void) {
2210 
2211  trans_count = numGridPes;
2212 
2213  { // send energy and virial
2214  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2215  for ( int g=0; g<numGrids; ++g ) {
2216  newmsg->evir[g] = recip_evir2[g];
2217  }
2218  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2219  CmiEnableUrgentSend(1);
2220  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2221  CmiEnableUrgentSend(0);
2222  }
2223 
2224 #if CMK_SMP && USE_CKLOOP
2225  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2226  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2227  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2228  } else
2229 #endif
2230  {
2231  sendUntransSubset(0, numGridNodes-1);
2232  }
2233 
2234 }
2235 
2236 void ComputePmeMgr::sendUntransSubset(int first, int last) {
2237 
2238  int zdim = myGrid.dim3;
2239  int y_start = localInfo[myTransPe].y_start_after_transpose;
2240  int ny = localInfo[myTransPe].ny_after_transpose;
2241  int slicelen = myGrid.K2 * zdim;
2242 
2243  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2244 
2245 #if CMK_BLUEGENEL
2246  CmiNetworkProgressAfter (0);
2247 #endif
2248 
2249  // send data for reverse transpose
2250  for (int j=first; j<=last; j++) {
2251  int node = gridNodeOrder[j]; // different order on each node
2252  int pe = gridNodeInfo[node].pe_start;
2253  int npe = gridNodeInfo[node].npe;
2254  int totlen = 0;
2255  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2256  LocalPmeInfo &li = localInfo[pe];
2257  int cpylen = li.nx * zdim;
2258  totlen += cpylen;
2259  }
2260  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2261  newmsg->sourceNode = myTransPe;
2262  newmsg->y_start = y_start;
2263  newmsg->ny = ny;
2264  for ( int g=0; g<numGrids; ++g ) {
2265  float *qmsg = newmsg->qgrid + ny * totlen * g;
2266  pe = gridNodeInfo[node].pe_start;
2267  for (int i=0; i<npe; ++i, ++pe) {
2268  LocalPmeInfo &li = localInfo[pe];
2269  if ( node == myGridNode ) {
2270  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2271  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2272  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2273  int cpylen = ny * zdim;
2274  for ( int x = 0; x < li.nx; ++x ) {
2275  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2276  q += cpylen;
2277  qmsg += slicelen;
2278  }
2279  } else {
2280  CmiMemcpy((void*)qmsg,
2281  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2282  li.nx*ny*zdim*sizeof(float));
2283  qmsg += li.nx*ny*zdim;
2284  }
2285  }
2286  }
2287  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2288  if ( node == myGridNode ) newmsg->ny = 0;
2289  if ( npe > 1 ) {
2290  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2291  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2292  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2293  }
2294 }
2295 
2296 void ComputePmeMgr::fwdSharedUntrans(PmeUntransMsg *msg) {
2297  int pe = gridNodeInfo[myGridNode].pe_start;
2298  int npe = gridNodeInfo[myGridNode].npe;
2299  CmiNodeLock lock = CmiCreateLock();
2300  int *count = new int; *count = npe;
2301  for (int i=0; i<npe; ++i, ++pe) {
2302  PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
2303  shmsg->msg = msg;
2304  shmsg->count = count;
2305  shmsg->lock = lock;
2306  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2307  }
2308 }
2309 
2310 void ComputePmeMgr::recvSharedUntrans(PmeSharedUntransMsg *msg) {
2311  procUntrans(msg->msg);
2312  CmiLock(msg->lock);
2313  int count = --(*msg->count);
2314  CmiUnlock(msg->lock);
2315  if ( count == 0 ) {
2316  CmiDestroyLock(msg->lock);
2317  delete msg->count;
2318  delete msg->msg;
2319  }
2320  delete msg;
2321 }
2322 
2323 void ComputePmeMgr::recvUntrans(PmeUntransMsg *msg) {
2324  procUntrans(msg);
2325  delete msg;
2326 }
2327 
2328 void ComputePmeMgr::procUntrans(PmeUntransMsg *msg) {
2329  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
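 // Reverse of procTrans: copy the returned y strip back into this grid
 // PE's qgrid slab; when all numTransPes strips are back, gridCalc3 runs.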
2330 
2331 #if CMK_BLUEGENEL
2332  CmiNetworkProgressAfter (0);
2333 #endif
2334 
2335  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2336  int first_pe = nodeInfo.pe_start;
2337  int g;
2338 
2339  if ( msg->ny ) {
2340  int zdim = myGrid.dim3;
2341  int last_pe = first_pe+nodeInfo.npe-1;
2342  int x_skip = localInfo[myGridPe].x_start
2343  - localInfo[first_pe].x_start;
2344  int nx_msg = localInfo[last_pe].x_start
2345  + localInfo[last_pe].nx
2346  - localInfo[first_pe].x_start;
2347  int nx = localInfo[myGridPe].nx;
2348  int y_start = msg->y_start;
2349  int ny = msg->ny;
2350  int slicelen = myGrid.K2 * zdim;
2351  int cpylen = ny * zdim;
2352  for ( g=0; g<numGrids; ++g ) {
2353  float *q = qgrid + qgrid_size * g + y_start * zdim;
2354  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2355  for ( int x = 0; x < nx; ++x ) {
2356  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2357  q += slicelen;
2358  qmsg += cpylen;
2359  }
2360  }
2361  }
2362 
2363  --untrans_count;
2364 
2365  if ( untrans_count == 0 ) {
2366  pmeProxyDir[CkMyPe()].gridCalc3();
2367  }
2368 }
2369 
2370 void ComputePmeMgr::gridCalc3(void) {
2371  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2372 
2373  // finish backward FFT
2374 #ifdef NAMD_FFTW
2375  for ( int g=0; g<numGrids; ++g ) {
2376 #ifdef NAMD_FFTW_3
2377  fftwf_execute(backward_plan_yz[g]);
2378 #else
2379  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2380  (fftw_complex *) (qgrid + qgrid_size * g),
2381  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2382 #endif
2383  }
2384 
2385 #endif
2386 
2387  pmeProxyDir[CkMyPe()].sendUngrid();
2388 }
2389 
2390 static inline void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param) {
2391  ComputePmeMgr *mgr = (ComputePmeMgr *)param;
2392  mgr->sendUngridSubset(first, last);
2393 }
2394 
2395 void ComputePmeMgr::sendUngrid(void) {
2396 
2397 #if CMK_SMP && USE_CKLOOP
2398  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2399  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2400  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2401  } else
2402 #endif
2403  {
2404  sendUngridSubset(0, numSources-1);
2405  }
2406 
2407  grid_count = numSources;
2408  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2409 }
2410 
2411 void ComputePmeMgr::sendUngridSubset(int first, int last) {
2412 
2413 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2414  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2415 #else
2416  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2417 #endif
2418 
2419  for ( int j=first; j<=last; ++j ) {
2420  // int msglen = qgrid_len;
2421  PmeGridMsg *newmsg = gridmsg_reuse[j];
2422  int pe = newmsg->sourceNode;
2423  int zdim = myGrid.dim3;
2424  int flen = newmsg->len;
2425  int fstart = newmsg->start;
2426  int zlistlen = newmsg->zlistlen;
2427  int *zlist = newmsg->zlist;
2428  float *qmsg = newmsg->qgrid;
2429  for ( int g=0; g<numGrids; ++g ) {
2430  char *f = newmsg->fgrid + fgrid_len * g;
2431  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2432  for ( int i=0; i<flen; ++i ) {
2433  if ( f[i] ) {
2434  for ( int k=0; k<zlistlen; ++k ) {
2435  *(qmsg++) = q[zlist[k]];
2436  }
2437  }
2438  q += zdim;
2439  }
2440  }
2441  newmsg->sourceNode = myGridPe;
2442 
2443  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2444  CmiEnableUrgentSend(1);
2445 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2446  if ( offload ) {
2447  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2448  } else
2449 #endif
2450  pmeProxyDir[pe].recvUngrid(newmsg);
2451  CmiEnableUrgentSend(0);
2452  }
2453 }
2454 
2455 void ComputePmeMgr::recvUngrid(PmeGridMsg *msg) {
2456  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
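 // The returned grid message carries interpolated potentials; copy them out
 // (per pencil or per slab) and acknowledge so ungridCalc fires once every
 // expected message has arrived.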
2457 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2458  if ( ! offload ) // would need lock
2459 #endif
2460  if ( ungrid_count == 0 ) {
2461  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2462  }
2463 
2464  if ( usePencils ) copyPencils(msg);
2465  else copyResults(msg);
2466  delete msg;
2467  recvAck(0);
2468 }
2469 
2470 void ComputePmeMgr::recvAck(PmeAckMsg *msg) {
2471  if ( msg ) delete msg;
2472 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2473  if ( offload ) {
2474  CmiLock(cuda_lock);
2475  if ( ungrid_count == 0 ) {
2476  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2477  }
2478  int uc = --ungrid_count;
2479  CmiUnlock(cuda_lock);
2480 
2481  if ( uc == 0 ) {
2482  pmeProxyDir[master_pe].ungridCalc();
2483  }
2484  return;
2485  }
2486 #endif
2487  --ungrid_count;
2488 
2489  if ( ungrid_count == 0 ) {
2490  pmeProxyDir[CkMyPe()].ungridCalc();
2491  }
2492 }
2493 
2494 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2495 #define count_limit 1000000
2496 #define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
2497 #define EVENT_STRIDE 10
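// GPU completion is tracked with one cudaEvent_t per EVENT_STRIDE force
// computes; CUDA_POLL re-registers the check function with the Converse
// scheduler so an unfinished event can be re-tested without blocking, and
// count_limit bounds the number of polls before the run is aborted.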
2498 
2499 extern "C" void CcdCallBacksReset(void *ignored,double curWallTime); // fix Charm++
2500 
2501 //void cudaDie(const char *msg, cudaError_t err=cudaSuccess);
2502 
2503 void cuda_check_pme_forces(void *arg, double walltime) {
2504  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
2505 
2506  while ( 1 ) { // process multiple events per call
2507  cudaError_t err = cudaEventQuery(argp->end_forces[argp->forces_done_count/EVENT_STRIDE]);
2508  if ( err == cudaSuccess ) {
2509  argp->check_forces_count = 0;
2510  for ( int i=0; i<EVENT_STRIDE; ++i ) {
2511  WorkDistrib::messageEnqueueWork(argp->pmeComputes[argp->forces_done_count]);
2512  if ( ++(argp->forces_done_count) == argp->forces_count ) break;
2513  }
2514  if ( argp->forces_done_count == argp->forces_count ) { // last event
2515  traceUserBracketEvent(CUDA_EVENT_ID_PME_FORCES,argp->forces_time,walltime);
2516  argp->forces_time = walltime - argp->forces_time;
2517  //CkPrintf("cuda_check_pme_forces forces_time == %f\n", argp->forces_time);
2518  return;
2519  } else { // more events
2520  continue; // check next event
2521  }
2522  } else if ( err != cudaErrorNotReady ) {
2523  char errmsg[256];
2524  sprintf(errmsg,"in cuda_check_pme_forces for event %d after polling %d times over %f s on seq %d",
2525  argp->forces_done_count/EVENT_STRIDE,
2526  argp->check_forces_count, walltime - argp->forces_time,
2527  argp->saved_sequence);
2528  cudaDie(errmsg,err);
2529  } else if ( ++(argp->check_forces_count) >= count_limit ) {
2530  char errmsg[256];
2531  sprintf(errmsg,"cuda_check_pme_forces for event %d polled %d times over %f s on seq %d",
2532  argp->forces_done_count/EVENT_STRIDE,
2533  argp->check_forces_count, walltime - argp->forces_time,
2534  argp->saved_sequence);
2535  cudaDie(errmsg,err);
2536  } else {
2537  break; // call again
2538  }
2539  } // while ( 1 )
2540  CcdCallBacksReset(0,walltime); // fix Charm++
2541  CUDA_POLL(cuda_check_pme_forces, arg);
2542 }
2543 #endif // NAMD_CUDA
2544 
2545 void ComputePmeMgr::ungridCalc(void) {
2546  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
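 // Interpolate per-atom forces from the potential grid. On the offload
 // path the master PE uploads the potential once, wakes the other ranks'
 // managers, and batches of EVENT_STRIDE computes are launched through
 // cuda_pme_forces with completion polled via CUDA events.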
2547 
2548  ungridForcesCount = pmeComputes.size();
2549 
2550 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2551  if ( offload ) {
2552  //CmiLock(cuda_lock);
2553  cudaSetDevice(deviceCUDA->getDeviceID());
2554 
2555  if ( this == masterPmeMgr ) {
2556  double before = CmiWallTimer();
2557  // XXX prevents something from breaking???
2558  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2559  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2560  // try to make the unspecified launch failures go away
2561  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2562  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2563  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2564 
2565  const int myrank = CkMyRank();
2566  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2567  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2568  nodePmeMgr->mgrObjects[i]->ungridCalc();
2569  }
2570  }
2571  if ( ! pmeComputes.size() ) return;
2572  }
2573 
2574  if ( ! end_forces ) {
2575  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2576  end_forces = new cudaEvent_t[n];
2577  for ( int i=0; i<n; ++i ) {
2578  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2579  }
2580  }
2581 
2582  const int pcsz = pmeComputes.size();
2583  if ( ! afn_host ) {
2584  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2585  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2586  cuda_errcheck("malloc params for pme");
2587  }
2588  int totn = 0;
2589  for ( int i=0; i<pcsz; ++i ) {
2590  int n = pmeComputes[i]->numGridAtoms[0];
2591  totn += n;
2592  }
2593  if ( totn > f_data_mgr_alloc ) {
2594  if ( f_data_mgr_alloc ) {
2595  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2596  cudaFree(f_data_mgr_dev);
2597  cudaFreeHost(f_data_mgr_host);
2598  }
2599  f_data_mgr_alloc = 1.2 * (totn + 100);
2600  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2601  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2602  cuda_errcheck("malloc forces for pme");
2603  }
2604  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2605  float *f_dev = f_data_mgr_dev;
2606  float *f_host = f_data_mgr_host;
2607  for ( int i=0; i<pcsz; ++i ) {
2608  int n = pmeComputes[i]->numGridAtoms[0];
2609  pmeComputes[i]->f_data_dev = f_dev;
2610  pmeComputes[i]->f_data_host = f_host;
2611  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2612  afn_host[3*i+1] = f_dev;
2613  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2614  f_dev += 3*n;
2615  f_host += 3*n;
2616  }
2617  //CmiLock(cuda_lock);
2618  double before = CmiWallTimer();
2619  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2620  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2621  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2622  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2623  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2624  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2625 
2626  for ( int i=0; i<pcsz; ++i ) {
2627  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2628  if ( i%EVENT_STRIDE == 0 ) {
2629  int dimy = pcsz - i;
2630  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2631  int maxn = 0;
2632  int subtotn = 0;
2633  for ( int j=0; j<dimy; ++j ) {
2634  int n = pmeComputes[i+j]->numGridAtoms[0];
2635  subtotn += n;
2636  if ( n > maxn ) maxn = n;
2637  }
2638  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2639  before = CmiWallTimer();
2640  cuda_pme_forces(
2641  bspline_coeffs_dev,
2642  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2643  pmeComputes[i]->a_data_dev,
2644  pmeComputes[i]->f_data_dev,
2645  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2646  streams[stream]);
2647  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2648  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2649  before = CmiWallTimer();
2650  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2651  cudaMemcpyDeviceToHost, streams[stream]);
2652 #if 0
2653  cudaDeviceSynchronize();
2654  fprintf(stderr, "i = %d\n", i);
2655  for(int k=0; k < subtotn*3; k++)
2656  {
2657  fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
2658  pmeComputes[i]->f_data_host[k]);
2659  }
2660 #endif
2661  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2662  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2663  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2664  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2665  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2666  }
2667  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2668  }
2669  //CmiUnlock(cuda_lock);
2670  } else
2671 #endif // NAMD_CUDA
2672  {
2673  for ( int i=0; i<pmeComputes.size(); ++i ) {
2674  WorkDistrib::messageEnqueueWork(pmeComputes[i]);
2675  // pmeComputes[i]->ungridForces();
2676  }
2677  }
2678  // submitReductions(); // must follow all ungridForces()
2679 
2680 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2681  if ( offload ) {
2682  forces_time = CmiWallTimer();
2683  forces_count = ungridForcesCount;
2684  forces_done_count = 0;
2685  pmeProxy[this_pe].pollForcesReady();
2686  }
2687 #endif
2688 
2689  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2690 }
2691 
2692 void ComputePmeMgr::pollForcesReady() {
2693 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2694  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2695  CUDA_POLL(cuda_check_pme_forces,this);
2696 #else
2697  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2698 #endif
2699 }
2700 
2701 void ComputePme::atomUpdate() { atomsChanged = 1; }
2702 
2703 ComputePme::ComputePme(ComputeID c, PatchID pid) : Compute(c), patchID(pid)
2704 {
2705  DebugM(4,"ComputePme created.\n");
2706  basePriority = PME_PRIORITY;
2707  setNumPatches(1);
2708 
2709  CProxy_ComputePmeMgr::ckLocalBranch(
2710  CkpvAccess(BOCclass_group).computePmeMgr)->addCompute(this);
2711 
2712  SimParameters *simParams = Node::Object()->simParameters;
2713 
2714  qmForcesOn = simParams->qmForcesOn;
2715  offload = simParams->PMEOffload;
2716 
2717  numGridsMax = numGrids;
2718 
2719  myGrid.K1 = simParams->PMEGridSizeX;
2720  myGrid.K2 = simParams->PMEGridSizeY;
2721  myGrid.K3 = simParams->PMEGridSizeZ;
2722  myGrid.order = simParams->PMEInterpOrder;
2723  myGrid.dim2 = myGrid.K2;
2724  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
2725 
2726 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2727  cuda_atoms_offset = 0;
2728  f_data_host = 0;
2729  f_data_dev = 0;
2730  if ( ! offload )
2731 #endif
2732  {
2733  for ( int g=0; g<numGrids; ++g ) myRealSpace[g] = new PmeRealSpace(myGrid);
2734  }
2735 
2736  atomsChanged = 0;
2737 
2738  qmLoclIndx = 0;
2739  qmLocalCharges = 0;
2740 }
2741 
2742 void ComputePme::initialize() {
2743  if (!(patch = PatchMap::Object()->patch(patchID))) {
2744  NAMD_bug("ComputePme used with unknown patch.");
2745  }
2746  positionBox = patch->registerPositionPickup(this);
2747  avgPositionBox = patch->registerAvgPositionPickup(this);
2748  forceBox = patch->registerForceDeposit(this);
2749 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2750  if ( offload ) {
2751  myMgr->cuda_atoms_count += patch->getNumAtoms();
2752  }
2753 #endif
2754 }
2755 
2756 void ComputePmeMgr::initialize_computes() {
2757 
2758  noWorkCount = 0;
2759  doWorkCount = 0;
2760  ungridForcesCount = 0;
2761 
2762  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
2763 
2764  SimParameters *simParams = Node::Object()->simParameters;
2765 
2766  strayChargeErrors = 0;
2767 
2768 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2769  PatchMap *patchMap = PatchMap::Object();
2770  int pe = master_pe = CkNodeFirst(CkMyNode());
2771  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2772  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2773  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2774  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2775  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2776  if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
2777  && pe != deviceCUDA->getMasterPe() ) {
2778  master_pe = pe;
2779  }
2780  }
2781  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2782  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2783  }
2784 
2785  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2786  bool cudaFirst = 1;
2787  if ( offload ) {
2788  CmiLock(cuda_lock);
2789  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2790  }
2791 
2792  if ( cudaFirst ) {
2793  nodePmeMgr->master_pe = master_pe;
2794  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2795  }
2796 #endif
2797 
2798  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2799  fsize = myGrid.K1 * myGrid.dim2;
2800  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2801 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2802  if ( ! offload )
2803 #endif
2804  {
2805  q_arr = new float*[fsize*numGrids];
2806  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2807  q_list = new float*[fsize*numGrids];
2808  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2809  q_count = 0;
2810  }
2811 
2812 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2813  if ( cudaFirst || ! offload ) {
2814 #endif
2815  f_arr = new char[fsize*numGrids];
2816  // memset to non-zero value has race condition on BlueGene/Q
2817  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2818  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2819 
2820  for ( int g=0; g<numGrids; ++g ) {
2821  char *f = f_arr + g*fsize;
2822  if ( usePencils ) {
2823  int K1 = myGrid.K1;
2824  int K2 = myGrid.K2;
2825  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2826  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2827  int dim2 = myGrid.dim2;
2828  for (int ap=0; ap<numPencilsActive; ++ap) {
2829  int ib = activePencils[ap].i;
2830  int jb = activePencils[ap].j;
2831  int ibegin = ib*block1;
2832  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2833  int jbegin = jb*block2;
2834  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2835  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2836  for ( int i=ibegin; i<iend; ++i ) {
2837  for ( int j=jbegin; j<jend; ++j ) {
2838  f[i*dim2+j] = 0;
2839  }
2840  }
2841  }
2842  } else {
2843  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2844  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2845  for (int pe=0; pe<numGridPes; pe++) {
2846  if ( ! recipPeDest[pe] ) continue;
2847  int start = pe * bsize;
2848  int len = bsize;
2849  if ( start >= qsize ) { start = 0; len = 0; }
2850  if ( start + len > qsize ) { len = qsize - start; }
2851  int zdim = myGrid.dim3;
2852  int fstart = start / zdim;
2853  int flen = len / zdim;
2854  memset(f + fstart, 0, flen*sizeof(char));
2855  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2856  }
2857  }
2858  }
2859 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2860  }
2861  if ( offload ) {
2862  cudaSetDevice(deviceCUDA->getDeviceID());
2863  if ( cudaFirst ) {
2864 
2865  int f_alloc_count = 0;
2866  for ( int n=fsize, i=0; i<n; ++i ) {
2867  if ( f_arr[i] == 0 ) {
2868  ++f_alloc_count;
2869  }
2870  }
2871  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2872 
2873  q_arr = new float*[fsize*numGrids];
2874  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2875 
2876  float **q_arr_dev_host = new float*[fsize];
2877  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2878 
2879  float **v_arr_dev_host = new float*[fsize];
2880  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2881 
2882  int q_stride = myGrid.K3+myGrid.order-1;
2883  q_data_size = f_alloc_count * q_stride * sizeof(float);
2884  ffz_size = (fsize + q_stride) * sizeof(int);
2885 
2886  // tack ffz onto end of q_data to allow merged transfer
2887  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2888  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2889  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2890  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2891  cudaMalloc((void**) &v_data_dev, q_data_size);
2892  cuda_errcheck("malloc grid data for pme");
2893  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2894  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2895  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2896  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2897  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2898 
2899  f_alloc_count = 0;
2900  for ( int n=fsize, i=0; i<n; ++i ) {
2901  if ( f_arr[i] == 0 ) {
2902  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2903  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2904  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2905  ++f_alloc_count;
2906  } else {
2907  q_arr[i] = 0;
2908  q_arr_dev_host[i] = 0;
2909  v_arr_dev_host[i] = 0;
2910  }
2911  }
2912 
2913  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2914  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2915  delete [] q_arr_dev_host;
2916  delete [] v_arr_dev_host;
2917  delete [] f_arr;
2918  f_arr = new char[fsize + q_stride];
2919  fz_arr = f_arr + fsize;
2920  memset(f_arr, 0, fsize + q_stride);
2921  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2922 
2923  cuda_errcheck("initialize grid data for pme");
2924 
2925  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2926  cuda_errcheck("initialize bspline coefficients for pme");
2927 
2928 #define XCOPY(X) masterPmeMgr->X = X;
2929  XCOPY(bspline_coeffs_dev)
2930  XCOPY(bspline_dcoeffs_dev)
2931  XCOPY(q_arr)
2932  XCOPY(q_arr_dev)
2933  XCOPY(v_arr_dev)
2934  XCOPY(q_data_size)
2935  XCOPY(q_data_host)
2936  XCOPY(q_data_dev)
2937  XCOPY(v_data_dev)
2938  XCOPY(ffz_size)
2939  XCOPY(ffz_host)
2940  XCOPY(ffz_dev)
2941  XCOPY(f_arr)
2942  XCOPY(fz_arr)
2943 #undef XCOPY
2944  //CkPrintf("pe %d init first\n", CkMyPe());
2945  } else { // cudaFirst
2946  //CkPrintf("pe %d init later\n", CkMyPe());
2947 #define XCOPY(X) X = masterPmeMgr->X;
2948  XCOPY(bspline_coeffs_dev)
2949  XCOPY(bspline_dcoeffs_dev)
2950  XCOPY(q_arr)
2951  XCOPY(q_arr_dev)
2952  XCOPY(v_arr_dev)
2953  XCOPY(q_data_size)
2954  XCOPY(q_data_host)
2955  XCOPY(q_data_dev)
2956  XCOPY(v_data_dev)
2957  XCOPY(ffz_size)
2958  XCOPY(ffz_host)
2959  XCOPY(ffz_dev)
2960  XCOPY(f_arr)
2961  XCOPY(fz_arr)
2962 #undef XCOPY
2963  } // cudaFirst
2964  CmiUnlock(cuda_lock);
2965  } else // offload
2966 #endif // NAMD_CUDA
2967  {
2968  fz_arr = new char[myGrid.K3+myGrid.order-1];
2969  }
2970 
2971 #if 0 && USE_PERSISTENT
2972  recvGrid_handle = NULL;
2973 #endif
2974 }
2975 
2976 ComputePme::~ComputePme()
2977 {
2978 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2979  if ( ! offload )
2980 #endif
2981  {
2982  for ( int g=0; g<numGridsMax; ++g ) delete myRealSpace[g];
2983  }
2984 }
2985 
2986 #if 0 && USE_PERSISTENT
2987 void ComputePmeMgr::setup_recvgrid_persistent()
2988 {
2989  int K1 = myGrid.K1;
2990  int K2 = myGrid.K2;
2991  int dim2 = myGrid.dim2;
2992  int dim3 = myGrid.dim3;
2993  int block1 = myGrid.block1;
2994  int block2 = myGrid.block2;
2995 
2996  CkArray *zPencil_local = zPencil.ckLocalBranch();
2997  recvGrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * numPencilsActive);
2998  for (int ap=0; ap<numPencilsActive; ++ap) {
2999  int ib = activePencils[ap].i;
3000  int jb = activePencils[ap].j;
3001  int ibegin = ib*block1;
3002  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3003  int jbegin = jb*block2;
3004  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3005  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3006  // f is changing
3007  int fcount = 0;
3008  for ( int g=0; g<numGrids; ++g ) {
3009  char *f = f_arr + g*fsize;
3010  for ( int i=ibegin; i<iend; ++i ) {
3011  for ( int j=jbegin; j<jend; ++j ) {
3012  fcount += f[i*dim2+j];
3013  }
3014  }
3015  }
3016  int zlistlen = 0;
3017  for ( int i=0; i<myGrid.K3; ++i ) {
3018  if ( fz_arr[i] ) ++zlistlen;
3019  }
3020  int hd = ( fcount? 1 : 0 ); // has data?
3021  int peer = zPencil_local->homePe(CkArrayIndex3D(ib, jb, 0));
3022  int compress_start = sizeof(PmeGridMsg ) + sizeof(envelope) + sizeof(int)*hd*zlistlen + sizeof(char)*hd*flen +sizeof(PmeReduction)*hd*numGrids ;
3023  int compress_size = sizeof(float)*hd*fcount*zlistlen;
3024  int size = compress_start + compress_size + PRIORITY_SIZE/8+6;
3025  recvGrid_handle[ap] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
3026  }
3027 }
3028 #endif
3029 
3030 int ComputePme::noWork() {
3031 
3032  if ( patch->flags.doFullElectrostatics ) {
3033  // In QM/MM simulations, atom charges from QM regions need special treatment.
3034  if ( qmForcesOn ) {
3035  return 1;
3036  }
3037  if ( ! myMgr->ungridForcesCount && ! myMgr->recipEvirCount ) return 0; // work to do, enqueue as usual
3038  myMgr->heldComputes.add(this);
3039  return 1; // don't enqueue yet
3040  }
3041 
3042  positionBox->skip();
3043  forceBox->skip();
3044 
3045  if ( ++(myMgr->noWorkCount) == myMgr->pmeComputes.size() ) {
3046  myMgr->noWorkCount = 0;
3047  myMgr->reduction->submit();
3048  }
3049 
3050  atomsChanged = 0;
3051 
3052  return 1; // no work for this step
3053 }
3054 
3055 void ComputePmeMgr::addRecipEvirClient(void) {
3056  ++recipEvirClients;
3057 }
3058 
3059 void ComputePmeMgr::recvRecipEvir(PmeEvirMsg *msg) {
3060  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3061  for ( int g=0; g<numGrids; ++g ) {
3062  evir[g] += msg->evir[g];
3063  }
3064  delete msg;
3065  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3066  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3067 }
3068 
3069 void ComputePme::doQMWork() {
3070 
3071 // iout << CkMyPe() << ") ----> PME doQMWork.\n" << endi ;
3072 
3073 
3074  int numQMAtms = Node::Object()->molecule->get_numQMAtoms();
3075  const Real *qmAtmChrg = Node::Object()->molecule->get_qmAtmChrg() ;
3076  const int *qmAtmIndx = Node::Object()->molecule->get_qmAtmIndx() ;
3077  const Real *qmAtomGroup = Node::Object()->molecule->get_qmAtomGroup() ;
3078 
3079  const CompAtomExt *xExt = patch->getCompAtomExtInfo();
3080 
3081  // Determine number of qm atoms in this patch for the current step.
3082  numLocalQMAtoms = 0;
3083  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3084  if ( qmAtomGroup[xExt[paIter].id] != 0 ) {
3085  numLocalQMAtoms++;
3086  }
3087  }
3088 
3089  // We prepare a charge vector with QM charges for use in the PME calculation.
3090 
3091  // Clears data from last step, if there is any.
3092  if (qmLoclIndx != 0)
3093  delete [] qmLoclIndx;
3094  if (qmLocalCharges != 0)
3095  delete [] qmLocalCharges;
3096 
3097  qmLoclIndx = new int[numLocalQMAtoms] ;
3098  qmLocalCharges = new Real[numLocalQMAtoms] ;
3099 
3100  // I am assuming there will be (in general) more QM atoms among all QM groups
3101  // than MM atoms in a patch.
3102  int procAtms = 0;
3103 
3104  for (int paIter=0; paIter<patch->getNumAtoms(); paIter++) {
3105 
3106  for (int i=0; i<numQMAtms; i++) {
3107 
3108  if (qmAtmIndx[i] == xExt[paIter].id) {
3109 
3110  qmLoclIndx[procAtms] = paIter ;
3111  qmLocalCharges[procAtms] = qmAtmChrg[i];
3112 
3113  procAtms++;
3114  break;
3115  }
3116 
3117  }
3118 
3119  if (procAtms == numLocalQMAtoms)
3120  break;
3121  }
3122 
3123  doWork();
3124  return ;
3125 }
3126 
3127 void ComputePme::doWork()
3128 {
3129  DebugM(4,"Entering ComputePme::doWork().\n");
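 // doWork() runs twice per full-electrostatics step: the first pass (below)
 // deposits charges on the grid; once the reciprocal sum returns, the
 // compute is re-enqueued and the branch here interpolates forces instead.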
3130 
3131  if ( basePriority >= COMPUTE_HOME_PRIORITY ) {
3132 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3133  basePriority = ( offload ? PME_OFFLOAD_PRIORITY : PME_PRIORITY );
3134 #else
3135  basePriority = PME_PRIORITY;
3136 #endif
3137  ungridForces();
3138  // CkPrintf("doWork 2 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3139  if ( ! --(myMgr->ungridForcesCount) && ! myMgr->recipEvirCount ) myMgr->submitReductions();
3140  return;
3141  }
3142  basePriority = COMPUTE_HOME_PRIORITY + PATCH_PRIORITY(patchID);
3143  // CkPrintf("doWork 1 pe %d %d %d\n", CkMyPe(), myMgr->ungridForcesCount, myMgr->recipEvirCount);
3144 
3145 #ifdef TRACE_COMPUTE_OBJECTS
3146  double traceObjStartTime = CmiWallTimer();
3147 #endif
3148 
3149 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3150  if ( offload ) cudaSetDevice(deviceCUDA->getDeviceID());
3151 #endif
3152 
3153  // allocate storage
3154  numLocalAtoms = patch->getNumAtoms();
3155 
3156  Lattice &lattice = patch->flags.lattice;
3157 
3158  localData_alloc.resize(numLocalAtoms*(numGrids+ ((numGrids>1 || selfOn)?1:0)));
3159  localData = localData_alloc.begin();
3160  localPartition_alloc.resize(numLocalAtoms);
3161  localPartition = localPartition_alloc.begin();
3162 
3163  int g;
3164  for ( g=0; g<numGrids; ++g ) {
3165  localGridData[g] = localData + numLocalAtoms*(g+1);
3166  }
3167 
3168  // get positions and charges
3169  PmeParticle * data_ptr = localData;
3170  unsigned char * part_ptr = localPartition;
3171  const BigReal coulomb_sqrt = sqrt( COULOMB * ComputeNonbondedUtil::scaling
3172  * ComputeNonbondedUtil::dielectric_1 );
3173 
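 // Charges are pre-scaled by sqrt(COULOMB * scaling * dielectric_1) so
 // products of grid charges directly yield energies in NAMD units.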
3174  {
3175  CompAtom *x = positionBox->open();
3176  // CompAtomExt *xExt = patch->getCompAtomExtInfo();
3177  if ( patch->flags.doMolly ) {
3178  positionBox->close(&x);
3179  x = avgPositionBox->open();
3180  }
3181  int numAtoms = patch->getNumAtoms();
3182 
3183  for(int i=0; i<numAtoms; ++i)
3184  {
3185  data_ptr->x = x[i].position.x;
3186  data_ptr->y = x[i].position.y;
3187  data_ptr->z = x[i].position.z;
3188  data_ptr->cg = coulomb_sqrt * x[i].charge;
3189  ++data_ptr;
3190  *part_ptr = x[i].partition;
3191  ++part_ptr;
3192  }
3193 
3194  // QM loop to overwrite charges of QM atoms.
3195  // They are zero for NAMD, but are updated in ComputeQM.
3196  if ( qmForcesOn ) {
3197 
3198  for(int i=0; i<numLocalQMAtoms; ++i)
3199  {
3200  localData[qmLoclIndx[i]].cg = coulomb_sqrt * qmLocalCharges[i];
3201  }
3202 
3203  }
3204 
3205  if ( patch->flags.doMolly ) { avgPositionBox->close(&x); }
3206  else { positionBox->close(&x); }
3207  }
3208 
3209  // copy to other grids if needed
3210  if ( (alchOn && (!alchDecouple)) || lesOn ) {
3211  for ( g=0; g<numGrids; ++g ) {
3212  PmeParticle *lgd = localGridData[g];
3213  if (g < 2) {
3214  int nga = 0;
3215  for(int i=0; i<numLocalAtoms; ++i) {
3216  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3)) {
3217  // for FEP/TI: grid 0 gets non-alch + partition 1 + partition 3;
3218  // grid 1 gets non-alch + partition 2 + partition 4;
3219  lgd[nga++] = localData[i];
3220  }
3221  }
3222  numGridAtoms[g] = nga;
3223  } else {
3224  int nga = 0;
3225  for(int i=0; i<numLocalAtoms; ++i) {
3226  if ( localPartition[i] == 0 ) {
3227  // grid 2 (only if called for with numGrids=3) gets only non-alch
3228  lgd[nga++] = localData[i];
3229  }
3230  }
3231  numGridAtoms[g] = nga;
3232  }
3233  }
3234  } else if ( alchOn && alchDecouple) {
3235  // alchemical decoupling: four grids
3236  // g=0: partition 0 and partition 1
3237  // g=1: partition 0 and partition 2
3238  // g=2: only partition 1 atoms
3239  // g=3: only partition 2 atoms
3240  // plus one grid g=4, only partition 0, if numGrids=5
3241  for ( g=0; g<2; ++g ) { // same as before for first 2
3242  PmeParticle *lgd = localGridData[g];
3243  int nga = 0;
3244  for(int i=0; i<numLocalAtoms; ++i) {
3245  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
3246  lgd[nga++] = localData[i];
3247  }
3248  }
3249  numGridAtoms[g] = nga;
3250  }
3251  for (g=2 ; g<4 ; ++g ) { // only alchemical atoms for these 2
3252  PmeParticle *lgd = localGridData[g];
3253  int nga = 0;
3254  for(int i=0; i<numLocalAtoms; ++i) {
3255  if ( localPartition[i] == (g-1) ) {
3256  lgd[nga++] = localData[i];
3257  }
3258  }
3259  numGridAtoms[g] = nga;
3260  }
3261  for (g=4 ; g<numGrids ; ++g ) { // only non-alchemical atoms
3262  // numGrids=5 only if alchElecLambdaStart > 0
3263  PmeParticle *lgd = localGridData[g];
3264  int nga = 0;
3265  for(int i=0; i<numLocalAtoms; ++i) {
3266  if ( localPartition[i] == 0 ) {
3267  lgd[nga++] = localData[i];
3268  }
3269  }
3270  numGridAtoms[g] = nga;
3271  }
3272  } else if ( selfOn ) {
3273  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 1 failed");
3274  g = 0;
3275  PmeParticle *lgd = localGridData[g];
3276  int nga = 0;
3277  for(int i=0; i<numLocalAtoms; ++i) {
3278  if ( localPartition[i] == 1 ) {
3279  lgd[nga++] = localData[i];
3280  }
3281  }
3282  numGridAtoms[g] = nga;
3283  } else if ( pairOn ) {
3284  if ( numGrids != 3 ) NAMD_bug("ComputePme::doWork assertion 2 failed");
3285  g = 0;
3286  PmeParticle *lgd = localGridData[g];
3287  int nga = 0;
3288  for(int i=0; i<numLocalAtoms; ++i) {
3289  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
3290  lgd[nga++] = localData[i];
3291  }
3292  }
3293  numGridAtoms[g] = nga;
3294  for ( g=1; g<3; ++g ) {
3295  PmeParticle *lgd = localGridData[g];
3296  int nga = 0;
3297  for(int i=0; i<numLocalAtoms; ++i) {
3298  if ( localPartition[i] == g ) {
3299  lgd[nga++] = localData[i];
3300  }
3301  }
3302  numGridAtoms[g] = nga;
3303  }
3304  } else {
3305  if ( numGrids != 1 ) NAMD_bug("ComputePme::doWork assertion 3 failed");
3306  localGridData[0] = localData;
3307  numGridAtoms[0] = numLocalAtoms;
3308  }
3309 
3310  if ( ! myMgr->doWorkCount ) {
3311  myMgr->doWorkCount = myMgr->pmeComputes.size();
3312 
3313 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3314  if ( ! offload )
3315 #endif // NAMD_CUDA
3316  {
3317  memset( (void*) myMgr->fz_arr, 0, (myGrid.K3+myGrid.order-1) * sizeof(char) );
3318 
3319  for (int i=0; i<myMgr->q_count; ++i) {
3320  memset( (void*) (myMgr->q_list[i]), 0, (myGrid.K3+myGrid.order-1) * sizeof(float) );
3321  }
3322  }
3323 
3324  for ( g=0; g<numGrids; ++g ) {
3325  myMgr->evir[g] = 0;
3326  }
3327 
3328  myMgr->strayChargeErrors = 0;
3329 
3330  myMgr->compute_sequence = sequence();
3331  }
3332 
3333  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in doWork()");
3334 
3335  int strayChargeErrors = 0;
3336 
3337  // calculate self energy
3338  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
3339  for ( g=0; g<numGrids; ++g ) {
3340  BigReal selfEnergy = 0;
3341  data_ptr = localGridData[g];
3342  int i;
3343  for(i=0; i<numGridAtoms[g]; ++i)
3344  {
3345  selfEnergy += data_ptr->cg * data_ptr->cg;
3346  ++data_ptr;
3347  }
3348  selfEnergy *= -1. * ewaldcof / SQRT_PI;
3349  myMgr->evir[g][0] += selfEnergy;
3350 
3351  float **q = myMgr->q_arr + g*myMgr->fsize;
3352  char *f = myMgr->f_arr + g*myMgr->fsize;
3353 
3354  scale_coordinates(localGridData[g], numGridAtoms[g], lattice, myGrid);
3355 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3356  if ( offload ) {
3357  if ( myMgr->cuda_atoms_alloc == 0 ) { // first call
3358  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3359  cuda_errcheck("before malloc atom data for pme");
3360  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3361  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3362  cuda_errcheck("malloc atom data for pme");
3363  myMgr->cuda_atoms_count = 0;
3364  }
3365  cuda_atoms_offset = myMgr->cuda_atoms_count;
3366  int n = numGridAtoms[g];
3367  myMgr->cuda_atoms_count += n;
3368  if ( myMgr->cuda_atoms_count > myMgr->cuda_atoms_alloc ) {
3369  CkPrintf("Pe %d expanding CUDA PME atoms allocation because %d > %d\n",
3370  CkMyPe(), myMgr->cuda_atoms_count, myMgr->cuda_atoms_alloc);
3371  cuda_errcheck("before malloc expanded atom data for pme");
3372  int na = myMgr->cuda_atoms_alloc = 1.2 * (myMgr->cuda_atoms_count + 1000);
3373  const float *a_data_host_old = myMgr->a_data_host;
3374  cudaMallocHost((void**) &(myMgr->a_data_host), 7*na*sizeof(float));
3375  cuda_errcheck("malloc expanded host atom data for pme");
3376  memcpy(myMgr->a_data_host, a_data_host_old, 7*cuda_atoms_offset*sizeof(float));
3377  cudaFreeHost((void*) a_data_host_old);
3378  cuda_errcheck("free expanded host atom data for pme");
3379  cudaFree(myMgr->a_data_dev);
3380  cuda_errcheck("free expanded dev atom data for pme");
3381  cudaMalloc((void**) &(myMgr->a_data_dev), 7*na*sizeof(float));
3382  cuda_errcheck("malloc expanded dev atom data for pme");
3383  }
3384  float *a_data_host = myMgr->a_data_host + 7 * cuda_atoms_offset;
3385  data_ptr = localGridData[g];
3386  double order_1 = myGrid.order - 1;
3387  double K1 = myGrid.K1;
3388  double K2 = myGrid.K2;
3389  double K3 = myGrid.K3;
3390  int found_negative = 0;
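 // Pack 7 floats per atom for the GPU kernel: fractional grid offsets in
 // x, y, z (integer part subtracted in double precision), the scaled
 // charge, and the order-shifted integer cell indices.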
3391  for ( int i=0; i<n; ++i ) {
3392  if ( data_ptr[i].x < 0 || data_ptr[i].y < 0 || data_ptr[i].z < 0 ) {
3393  found_negative = 1;
3394  // CkPrintf("low coord: %f %f %f\n", data_ptr[i].x, data_ptr[i].y, data_ptr[i].z);
3395  }
3396  double x_int = (int) data_ptr[i].x;
3397  double y_int = (int) data_ptr[i].y;
3398  double z_int = (int) data_ptr[i].z;
3399  a_data_host[7*i ] = data_ptr[i].x - x_int; // subtract in double precision
3400  a_data_host[7*i+1] = data_ptr[i].y - y_int;
3401  a_data_host[7*i+2] = data_ptr[i].z - z_int;
3402  a_data_host[7*i+3] = data_ptr[i].cg;
3403  x_int -= order_1; if ( x_int < 0 ) x_int += K1;
3404  y_int -= order_1; if ( y_int < 0 ) y_int += K2;
3405  z_int -= order_1; if ( z_int < 0 ) z_int += K3;
3406  a_data_host[7*i+4] = x_int;
3407  a_data_host[7*i+5] = y_int;
3408  a_data_host[7*i+6] = z_int;
3409  }
3410  if ( found_negative ) NAMD_bug("found negative atom coordinate in ComputePme::doWork");
3411  } else
3412 #endif // NAMD_CUDA
3413  {
3414  myRealSpace[g]->set_num_atoms(numGridAtoms[g]);
3415  myRealSpace[g]->fill_charges(q, myMgr->q_list, myMgr->q_count, strayChargeErrors, f, myMgr->fz_arr, localGridData[g]);
3416  }
3417  }
3418  myMgr->strayChargeErrors += strayChargeErrors;
3419 
3420 #ifdef TRACE_COMPUTE_OBJECTS
3421  traceUserBracketEvent(TRACE_COMPOBJ_IDOFFSET+this->cid, traceObjStartTime, CmiWallTimer());
3422 #endif
3423 
3424  if ( --(myMgr->doWorkCount) == 0 ) {
3425 // cudaDeviceSynchronize(); // XXXX
3426 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3427  if ( offload ) {
3428  ComputePmeMgr::cuda_submit_charges_args args;
3429  args.mgr = myMgr;
3430  args.lattice = &lattice;
3431  args.sequence = sequence();
3432  CmiLock(ComputePmeMgr::cuda_lock);
3433  if ( ComputePmeMgr::cuda_busy ) {
3434  ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
3435  } else if ( CkMyPe() == deviceCUDA->getMasterPe() ) {
3436  // avoid adding work to nonbonded data preparation pe
3437  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3438  } else {
3439  ComputePmeMgr::cuda_busy = true;
3440  while ( 1 ) {
3441  CmiUnlock(ComputePmeMgr::cuda_lock);
3442  args.mgr->cuda_submit_charges(*args.lattice, args.sequence);
3443  CmiLock(ComputePmeMgr::cuda_lock);
3444  if ( ComputePmeMgr::cuda_submit_charges_deque.size() ) {
3445  args = ComputePmeMgr::cuda_submit_charges_deque.front();
3446  ComputePmeMgr::cuda_submit_charges_deque.pop_front();
3447  } else {
3448  ComputePmeMgr::cuda_busy = false;
3449  break;
3450  }
3451  }
3452  }
3453  CmiUnlock(ComputePmeMgr::cuda_lock);
3454  } else
3455 #endif // NAMD_CUDA
3456  {
3457  myMgr->chargeGridReady(lattice,sequence());
3458  }
3459  }
3460  atomsChanged = 0;
3461 }
3462 
3463 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3464 
3465 void ComputePmeMgr::cuda_submit_charges(Lattice &lattice, int sequence) {
3466 
3467  int n = cuda_atoms_count;
3468  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3469  cuda_atoms_count = 0;
3470 
3471  const double before = CmiWallTimer();
3472  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3473  cudaMemcpyHostToDevice, streams[stream]);
3474  const double after = CmiWallTimer();
3475 
3476  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3477 
3478  cuda_pme_charges(
3479  bspline_coeffs_dev,
3480  q_arr_dev, ffz_dev, ffz_dev + fsize,
3481  a_data_dev, n,
3482  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3483  streams[stream]);
3484  const double after2 = CmiWallTimer();
3485 
3486  chargeGridSubmitted(lattice,sequence); // must be inside lock
3487 
3488  masterPmeMgr->charges_time = before;
3489  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3490  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3491 }
3492 
3493 void cuda_check_pme_charges(void *arg, double walltime) {
3494  ComputePmeMgr *argp = (ComputePmeMgr *) arg;
3495 
3496  cudaError_t err = cudaEventQuery(argp->end_charges);
3497  if ( err == cudaSuccess ) {
3498  traceUserBracketEvent(CUDA_EVENT_ID_PME_CHARGES,argp->charges_time,walltime);
3499  argp->charges_time = walltime - argp->charges_time;
3500  argp->sendChargeGridReady();
3501  argp->check_charges_count = 0;
3502  } else if ( err != cudaErrorNotReady ) {
3503  char errmsg[256];
3504  sprintf(errmsg,"in cuda_check_pme_charges after polling %d times over %f s on seq %d",
3505  argp->check_charges_count, walltime - argp->charges_time,
3506  argp->saved_sequence);
3507  cudaDie(errmsg,err);
3508  } else if ( ++(argp->check_charges_count) >= count_limit ) {
3509  char errmsg[256];
3510  sprintf(errmsg,"cuda_check_pme_charges polled %d times over %f s on seq %d",
3511  argp->check_charges_count, walltime - argp->charges_time,
3512  argp->saved_sequence);
3513  cudaDie(errmsg,err);
3514  } else {
3515  CcdCallBacksReset(0,walltime); // fix Charm++
3516  CUDA_POLL(cuda_check_pme_charges, arg);
3517  }
3518 }
3519 
3520 void ComputePmeMgr::chargeGridSubmitted(Lattice &lattice, int sequence) {
3521  saved_lattice = &lattice;
3522  saved_sequence = sequence;
3523 
3524  // cudaDeviceSynchronize(); // XXXX TESTING
3525  //int q_stride = myGrid.K3+myGrid.order-1;
3526  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3527  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3528  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3529  // }
3530  //}
3531  //CmiLock(cuda_lock);
3532 
3533  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3534  double before = CmiWallTimer();
3535  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3536  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3537  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3538  cudaMemcpyDeviceToHost, streams[stream]);
3539  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3540  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3541  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3542  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3543  //CmiUnlock(cuda_lock);
3544  // cudaDeviceSynchronize(); // XXXX TESTING
3545  // cuda_errcheck("after memcpy grid to host");
3546 
3548  pmeProxy[master_pe].pollChargeGridReady();
3549  }
3550 }
3551 
3552 void ComputePmeMgr::sendChargeGridReady() {
3553  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3554  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3555  int cs = mgr->pmeComputes.size();
3556  if ( cs ) {
3557  mgr->ungridForcesCount = cs;
3558  mgr->recipEvirCount = mgr->recipEvirClients;
3559  masterPmeMgr->chargeGridSubmittedCount++;
3560  }
3561  }
3562  pmeProxy[master_pe].recvChargeGridReady();
3563 }
3564 #endif // NAMD_CUDA
3565 
3566 void ComputePmeMgr::pollChargeGridReady() {
3567 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3568  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3569  CUDA_POLL(cuda_check_pme_charges,this);
3570 #else
3571  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3572 #endif
3573 }
3574 
3575 void ComputePmeMgr::recvChargeGridReady() {
3576  chargeGridReady(*saved_lattice,saved_sequence);
3577 }
3578 
3579 void ComputePmeMgr::chargeGridReady(Lattice &lattice, int sequence) {
3580 
3581 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3582  if ( offload ) {
3583  int errcount = 0;
3584  int q_stride = myGrid.K3+myGrid.order-1;
3585  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3586  f_arr[j] = ffz_host[j];
3587  if ( ffz_host[j] & ~1 ) ++errcount;
3588  }
3589  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3590  }
3591 #endif
3592  recipEvirCount = recipEvirClients;
3593  ungridForcesCount = pmeComputes.size();
3594 
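 // Fold the periodic wraparound flags (planes K3 .. K3+order-2) back onto
 // the first order-1 z planes before choosing which indices to send.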
3595  for (int j=0; j<myGrid.order-1; ++j) {
3596  fz_arr[j] |= fz_arr[myGrid.K3+j];
3597  }
3598 
3599  if ( usePencils ) {
3600  sendPencils(lattice,sequence);
3601  } else {
3602  sendData(lattice,sequence);
3603  }
3604 }
3605 
3606 
3607 void ComputePmeMgr::sendPencilsPart(int first, int last, Lattice &lattice, int sequence, int sourcepe) {
3608 
3609  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3610 
3611 #if 0 && USE_PERSISTENT
3612  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3613 #endif
3614  int K1 = myGrid.K1;
3615  int K2 = myGrid.K2;
3616  int dim2 = myGrid.dim2;
3617  int dim3 = myGrid.dim3;
3618  int block1 = myGrid.block1;
3619  int block2 = myGrid.block2;
3620 
3621  // int savedMessages = 0;
3622  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3623 
3624  for (int ap=first; ap<=last; ++ap) {
3625  int ib = activePencils[ap].i;
3626  int jb = activePencils[ap].j;
3627  int ibegin = ib*block1;
3628  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3629  int jbegin = jb*block2;
3630  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3631  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3632 
3633  int fcount = 0;
3634  for ( int g=0; g<numGrids; ++g ) {
3635  char *f = f_arr + g*fsize;
3636 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3637  if ( offload ) {
3638  int errcount = 0;
3639  for ( int i=ibegin; i<iend; ++i ) {
3640  for ( int j=jbegin; j<jend; ++j ) {
3641  int k = i*dim2+j;
3642  f[k] = ffz_host[k];
3643  fcount += f[k];
3644  if ( ffz_host[k] & ~1 ) ++errcount;
3645  }
3646  }
3647  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3648  } else
3649 #endif
3650  for ( int i=ibegin; i<iend; ++i ) {
3651  for ( int j=jbegin; j<jend; ++j ) {
3652  fcount += f[i*dim2+j];
3653  }
3654  }
3655  }
3656 
3657 #ifdef NETWORK_PROGRESS
3658  CmiNetworkProgress();
3659 #endif
3660 
3661  if ( ! pencilActive[ib*yBlocks+jb] )
3662  NAMD_bug("PME activePencils list inconsistent");
3663 
3664  int zlistlen = 0;
3665  for ( int i=0; i<myGrid.K3; ++i ) {
3666  if ( fz_arr[i] ) ++zlistlen;
3667  }
3668 
3669  int hd = ( fcount? 1 : 0 ); // has data?
3670  // if ( ! hd ) ++savedMessages;
3671 
3672 
3673  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3674  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3675  msg->sourceNode = sourcepe;
3676  msg->hasData = hd;
3677  msg->lattice = lattice;
3678  if ( hd ) {
3679 #if 0
3680  msg->start = fstart;
3681  msg->len = flen;
3682 #else
3683  msg->start = -1; // obsolete?
3684  msg->len = -1; // obsolete?
3685 #endif
3686  msg->zlistlen = zlistlen;
3687  int *zlist = msg->zlist;
3688  zlistlen = 0;
3689  for ( int i=0; i<myGrid.K3; ++i ) {
3690  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3691  }
3692  char *fmsg = msg->fgrid;
3693  float *qmsg = msg->qgrid;
3694  for ( int g=0; g<numGrids; ++g ) {
3695  char *f = f_arr + g*fsize;
3696  float **q = q_arr + g*fsize;
3697  for ( int i=ibegin; i<iend; ++i ) {
3698  for ( int j=jbegin; j<jend; ++j ) {
3699  *(fmsg++) = f[i*dim2+j];
3700  if( f[i*dim2+j] ) {
3701  for (int h=0; h<myGrid.order-1; ++h) {
3702  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3703  }
3704  for ( int k=0; k<zlistlen; ++k ) {
3705  *(qmsg++) = q[i*dim2+j][zlist[k]];
3706  }
3707  }
3708  }
3709  }
3710  }
3711  }
3712 
3713  msg->sequence = compute_sequence;
3714  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3715  CmiEnableUrgentSend(1);
3716 #if USE_NODE_PAR_RECEIVE
3717  msg->destElem=CkArrayIndex3D(ib,jb,0);
3718  CProxy_PmePencilMap lzm = npMgr->zm;
3719  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3720  int destnode = CmiNodeOf(destproc);
3721 
3722 #if 0
3723  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3724 #endif
3725  pmeNodeProxy[destnode].recvZGrid(msg);
3726 #if 0
3727  CmiUsePersistentHandle(NULL, 0);
3728 #endif
3729 #else
3730 #if 0
3731  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3732 #endif
3733  zPencil(ib,jb,0).recvGrid(msg);
3734 #if 0
3735  CmiUsePersistentHandle(NULL, 0);
3736 #endif
3737 #endif
3738  CmiEnableUrgentSend(0);
3739  }
3740 
3741 
3742  // if ( savedMessages ) {
3743  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3744  // }
3745 
3746 }
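// --- Illustrative sketch (editor's addition, not NAMD source): the sparse
// message layout built above.  f[] flags the (i,j) grid columns that carry
// charge and fz[] flags the occupied z planes; zlist indexes those planes,
// and only flagged columns, restricted to zlist, enter the outgoing buffer,
// which is why the message is allocated as fcount*zlistlen floats.
// Standalone toy version with hypothetical sizes:
#include <cstdio>
#include <vector>

int main() {
  const int ncols = 4, K3 = 6;
  std::vector<char> f = {1, 0, 1, 0};            // two occupied columns
  std::vector<char> fz = {0, 1, 1, 0, 0, 0};     // two occupied z planes
  std::vector<float> q(ncols * K3, 1.f);         // dense charge grid
  std::vector<int> zlist;
  for (int k = 0; k < K3; ++k) if (fz[k]) zlist.push_back(k);
  std::vector<float> payload;                    // qgrid part of the message
  for (int c = 0; c < ncols; ++c)
    if (f[c])
      for (int k : zlist) payload.push_back(q[c * K3 + k]);
  std::printf("packed %zu floats (fcount*zlistlen = 2*2)\n", payload.size());
  return 0;
}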
3747 
3748 
3749 void ComputePmeMgr::sendPencilsHelper(int iter) {
3750  nodePmeMgr->sendPencilsHelper(iter);
3751 }
3752 
3753 void NodePmeMgr::sendPencilsHelper(int iter) {
3754 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3755  ComputePmeMgr *obj = masterPmeMgr;
3756  obj->sendPencilsPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe);
3757 #else
3758  NAMD_bug("NodePmeMgr::sendPencilsHelper called in non-CUDA build");
3759 #endif
3760 }
3761 
3762 void ComputePmeMgr::sendPencils(Lattice &lattice, int sequence) {
3763 
3764  sendDataHelper_lattice = &lattice;
3765  sendDataHelper_sequence = sequence;
3766  sendDataHelper_sourcepe = CkMyPe();
3767 
3768 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3769  if ( offload ) {
3770  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3771 #if CMK_MULTICORE
3772  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3773  int ib = activePencils[ap].i;
3774  int jb = activePencils[ap].j;
3775  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3776  pmeProxy[destproc].sendPencilsHelper(ap);
3777 #else
3778  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3779 #endif
3780  }
3781  } else
3782 #endif
3783  {
3784  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3785  }
3786 
3787  if ( strayChargeErrors ) {
3788  strayChargeErrors = 0;
3789  iout << iERROR << "Stray PME grid charges detected: "
3790  << CkMyPe() << " sending to (x,y)";
3791  int K1 = myGrid.K1;
3792  int K2 = myGrid.K2;
3793  int dim2 = myGrid.dim2;
3794  int block1 = myGrid.block1;
3795  int block2 = myGrid.block2;
3796  for (int ib=0; ib<xBlocks; ++ib) {
3797  for (int jb=0; jb<yBlocks; ++jb) {
3798  int ibegin = ib*block1;
3799  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3800  int jbegin = jb*block2;
3801  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3802  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3803 
3804  for ( int g=0; g<numGrids; ++g ) {
3805  char *f = f_arr + g*fsize;
3806  if ( ! pencilActive[ib*yBlocks+jb] ) {
3807  for ( int i=ibegin; i<iend; ++i ) {
3808  for ( int j=jbegin; j<jend; ++j ) {
3809  if ( f[i*dim2+j] == 3 ) {
3810  f[i*dim2+j] = 2;
3811  iout << " (" << i << "," << j << ")";
3812  }
3813  }
3814  }
3815  }
3816  }
3817  }
3818  }
3819  iout << "\n" << endi;
3820  }
3821 
3822 }
3823 
3824 
3825 void ComputePmeMgr::copyPencils(PmeGridMsg *msg) {
3826 
3827  int K1 = myGrid.K1;
3828  int K2 = myGrid.K2;
3829  int dim2 = myGrid.dim2;
3830  int dim3 = myGrid.dim3;
3831  int block1 = myGrid.block1;
3832  int block2 = myGrid.block2;
3833 
3834  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3835  int ib = msg->sourceNode / yBlocks;
3836  int jb = msg->sourceNode % yBlocks;
3837 
3838  int ibegin = ib*block1;
3839  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3840  int jbegin = jb*block2;
3841  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3842 
3843  int zlistlen = msg->zlistlen;
3844  int *zlist = msg->zlist;
3845  float *qmsg = msg->qgrid;
3846  int g;
3847  for ( g=0; g<numGrids; ++g ) {
3848  char *f = f_arr + g*fsize;
3849  float **q = q_arr + g*fsize;
3850  for ( int i=ibegin; i<iend; ++i ) {
3851  for ( int j=jbegin; j<jend; ++j ) {
3852  if( f[i*dim2+j] ) {
3853  f[i*dim2+j] = 0;
3854  for ( int k=0; k<zlistlen; ++k ) {
3855  q[i*dim2+j][zlist[k]] = *(qmsg++);
3856  }
3857  for (int h=0; h<myGrid.order-1; ++h) {
3858  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3859  }
3860  }
3861  }
3862  }
3863  }
3864 }
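// --- Illustrative sketch (editor's addition, not NAMD source): the inverse
// of the packing above.  Received z values are scattered back through zlist,
// then the first order-1 planes are replicated past K3 so force interpolation
// can read across the periodic boundary without index wrapping.  Toy sizes:
#include <vector>

int main() {
  const int K3 = 8, order = 4;
  std::vector<int> zlist = {0, 5};
  std::vector<float> qmsg = {3.f, 7.f};            // packed payload
  std::vector<float> col(K3 + order - 1, 0.f);     // one grid column
  int m = 0;
  for (int k : zlist) col[k] = qmsg[m++];          // scatter
  for (int h = 0; h < order - 1; ++h) col[K3 + h] = col[h];  // replicate wrap
  return 0;
}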
3865 
3866 
3867 void ComputePmeMgr::sendDataPart(int first, int last, Lattice &lattice, int sequence, int sourcepe, int errors) {
3868 
3869  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3870 
3871  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3872 
3873  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3874  for (int j=first; j<=last; j++) {
3875  int pe = gridPeOrder[j]; // different order
3876  if ( ! recipPeDest[pe] && ! errors ) continue;
3877  int start = pe * bsize;
3878  int len = bsize;
3879  if ( start >= qsize ) { start = 0; len = 0; }
3880  if ( start + len > qsize ) { len = qsize - start; }
3881  int zdim = myGrid.dim3;
3882  int fstart = start / zdim;
3883  int flen = len / zdim;
3884  int fcount = 0;
3885  int i;
3886 
3887  int g;
3888  for ( g=0; g<numGrids; ++g ) {
3889  char *f = f_arr + fstart + g*fsize;
3890 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3891  if ( offload ) {
3892  int errcount = 0;
3893  for ( i=0; i<flen; ++i ) {
3894  f[i] = ffz_host[fstart+i];
3895  fcount += f[i];
3896  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3897  }
3898  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3899  } else
3900 #endif
3901  for ( i=0; i<flen; ++i ) {
3902  fcount += f[i];
3903  }
3904  if ( ! recipPeDest[pe] ) {
3905  int errfound = 0;
3906  for ( i=0; i<flen; ++i ) {
3907  if ( f[i] == 3 ) {
3908  errfound = 1;
3909  break;
3910  }
3911  }
3912  if ( errfound ) {
3913  iout << iERROR << "Stray PME grid charges detected: "
3914  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3915  int iz = -1;
3916  for ( i=0; i<flen; ++i ) {
3917  if ( f[i] == 3 ) {
3918  f[i] = 2;
3919  int jz = (i+fstart)/myGrid.K2;
3920  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3921  }
3922  }
3923  iout << "\n" << endi;
3924  }
3925  }
3926  }
3927 
3928 #ifdef NETWORK_PROGRESS
3929  CmiNetworkProgress();
3930 #endif
3931 
3932  if ( ! recipPeDest[pe] ) continue;
3933 
3934  int zlistlen = 0;
3935  for ( i=0; i<myGrid.K3; ++i ) {
3936  if ( fz_arr[i] ) ++zlistlen;
3937  }
3938 
3939  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3940  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3941 
3942  msg->sourceNode = sourcepe;
3943  msg->lattice = lattice;
3944  msg->start = fstart;
3945  msg->len = flen;
3946  msg->zlistlen = zlistlen;
3947  int *zlist = msg->zlist;
3948  zlistlen = 0;
3949  for ( i=0; i<myGrid.K3; ++i ) {
3950  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3951  }
3952  float *qmsg = msg->qgrid;
3953  for ( g=0; g<numGrids; ++g ) {
3954  char *f = f_arr + fstart + g*fsize;
3955  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3956  float **q = q_arr + fstart + g*fsize;
3957  for ( i=0; i<flen; ++i ) {
3958  if ( f[i] ) {
3959  for (int h=0; h<myGrid.order-1; ++h) {
3960  q[i][h] += q[i][myGrid.K3+h];
3961  }
3962  for ( int k=0; k<zlistlen; ++k ) {
3963  *(qmsg++) = q[i][zlist[k]];
3964  }
3965  }
3966  }
3967  }
3968 
3969  msg->sequence = compute_sequence;
3970  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3971  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3972  }
3973 
3974 }
3975 
3976 void ComputePmeMgr::sendDataHelper(int iter) {
3977  nodePmeMgr->sendDataHelper(iter);
3978 }
3979 
3980 void NodePmeMgr::sendDataHelper(int iter) {
3981 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3982  ComputePmeMgr *obj = masterPmeMgr;
3983  obj->sendDataPart(iter, iter, *obj->sendDataHelper_lattice, obj->sendDataHelper_sequence, obj->sendDataHelper_sourcepe, obj->sendDataHelper_errors);
3984 #else
3985  NAMD_bug("NodePmeMgr::sendDataHelper called in non-CUDA build");
3986 #endif
3987 }
3988 
3989 void ComputePmeMgr::sendData(Lattice &lattice, int sequence) {
3990 
3991  sendDataHelper_lattice = &lattice;
3992  sendDataHelper_sequence = sequence;
3993  sendDataHelper_sourcepe = CkMyPe();
3994  sendDataHelper_errors = strayChargeErrors;
3995  strayChargeErrors = 0;
3996 
3997 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3998  if ( offload ) {
3999  for ( int i=0; i < numGridPes; ++i ) {
4000  int pe = gridPeOrder[i]; // different order
4001  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
4002 #if CMK_MULTICORE
4003  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
4004  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
4005 #else
4006  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
4007 #endif
4008  }
4009  } else
4010 #endif
4011  {
4012  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
4013  }
4014 
4015 }
4016 
4017 void ComputePmeMgr::copyResults(PmeGridMsg *msg) {
4018 
4019  int zdim = myGrid.dim3;
4020  int flen = msg->len;
4021  int fstart = msg->start;
4022  int zlistlen = msg->zlistlen;
4023  int *zlist = msg->zlist;
4024  float *qmsg = msg->qgrid;
4025  int g;
4026  for ( g=0; g<numGrids; ++g ) {
4027  char *f = msg->fgrid + g*flen;
4028  float **q = q_arr + fstart + g*fsize;
4029  for ( int i=0; i<flen; ++i ) {
4030  if ( f[i] ) {
4031  f[i] = 0;
4032  for ( int k=0; k<zlistlen; ++k ) {
4033  q[i][zlist[k]] = *(qmsg++);
4034  }
4035  for (int h=0; h<myGrid.order-1; ++h) {
4036  q[i][myGrid.K3+h] = q[i][h];
4037  }
4038  }
4039  }
4040  }
4041 }
4042 
4043 void ComputePme::ungridForces(void) {
4044 
4045  if ( sequence() != myMgr->compute_sequence ) NAMD_bug("ComputePme sequence mismatch in ungridForces()");
4046 
4047  SimParameters *simParams = Node::Object()->simParameters;
4048 
4049  localResults_alloc.resize(numLocalAtoms* ((numGrids>1 || selfOn)?2:1));
4050  Vector *localResults = localResults_alloc.begin();
4051  Vector *gridResults;
4052 
4053  if ( alchOn || lesOn || selfOn || pairOn ) {
4054  for(int i=0; i<numLocalAtoms; ++i) { localResults[i] = 0.; }
4055  gridResults = localResults + numLocalAtoms;
4056  } else {
4057  gridResults = localResults;
4058  }
4059 
4060  Vector pairForce = 0.;
4061  Lattice &lattice = patch->flags.lattice;
4062  int g = 0;
4063  if(!simParams->commOnly) {
4064  for ( g=0; g<numGrids; ++g ) {
4065 #ifdef NETWORK_PROGRESS
4066  CmiNetworkProgress();
4067 #endif
4068 
4069 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
4070  if ( offload ) {
4071  int errfound = 0;
4072  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4073  // Neither isnan() nor x != x worked when testing on Cray; this does.
4074  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { errfound = 1; } // CUDA NaN
4075  gridResults[i].x = f_data_host[3*i];
4076  gridResults[i].y = f_data_host[3*i+1];
4077  gridResults[i].z = f_data_host[3*i+2];
4078  }
4079  if ( errfound ) {
4080  int errcount = 0;
4081  for ( int n=numGridAtoms[g], i=0; i<n; ++i ) {
4082  float f = f_data_host[3*i];
4083  if ( ((int*)f_data_host)[3*i] == 0x7fffffff ) { // CUDA NaN
4084  ++errcount;
4085  gridResults[i] = 0.;
4086  }
4087  }
4088  iout << iERROR << "Stray PME grid charges detected: "
4089  << errcount << " atoms on pe " << CkMyPe() << "\n" << endi;
4090  }
4091  } else
4092 #endif // NAMD_CUDA
4093  {
4094  myRealSpace[g]->compute_forces(myMgr->q_arr+g*myMgr->fsize, localGridData[g], gridResults);
4095  }
4096  scale_forces(gridResults, numGridAtoms[g], lattice);
4097 
4098  if (alchOn) {
4099  float scale = 1.;
4100  BigReal elecLambdaUp, elecLambdaDown;
4101  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4102  myMgr->alchLambda = alchLambda;
4103  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4104  myMgr->alchLambda2 = alchLambda2;
4105  elecLambdaUp = simParams->getElecLambda(alchLambda);
4106  elecLambdaDown = simParams->getElecLambda(1. - alchLambda);
4107 
4108  if ( g == 0 ) scale = elecLambdaUp;
4109  else if ( g == 1 ) scale = elecLambdaDown;
4110  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4111 
4112  if (alchDecouple) {
4113  if ( g == 2 ) scale = 1 - elecLambdaUp;
4114  else if ( g == 3 ) scale = 1 - elecLambdaDown;
4115  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4116  }
4117  int nga = 0;
4118  if (!alchDecouple) {
4119  if (g < 2 ) {
4120  for(int i=0; i<numLocalAtoms; ++i) {
4121  if ( localPartition[i] == 0 || localPartition[i] == (g+1) || localPartition[i] == (g+3) ) {
4122  // (g=0: only partition 0 and partition 1 and partition 3)
4123  // (g=1: only partition 0 and partition 2 and partition 4)
4124  localResults[i] += gridResults[nga++] * scale;
4125  }
4126  }
4127  } else {
4128  for(int i=0; i<numLocalAtoms; ++i) {
4129  if ( localPartition[i] == 0 ) {
4130  // (g=2: only partition 0)
4131  localResults[i] += gridResults[nga++] * scale;
4132  }
4133  }
4134  }
4135  } else { // alchDecouple
4136  if ( g < 2 ) {
4137  for(int i=0; i<numLocalAtoms; ++i) {
4138  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4139  // g = 0: partition 0 or partition 1
4140  // g = 1: partition 0 or partition 2
4141  localResults[i] += gridResults[nga++] * scale;
4142  }
4143  }
4144  }
4145  else {
4146  for(int i=0; i<numLocalAtoms; ++i) {
4147  if ( localPartition[i] == (g-1) || localPartition[i] == (g-4)) {
4148  // g = 2: partition 1 only
4149  // g = 3: partition 2 only
4150  // g = 4: partition 0 only
4151  localResults[i] += gridResults[nga++] * scale;
4152  }
4153  }
4154  }
4155  }
4156  } else if ( lesOn ) {
4157  float scale = 1.;
4158  if ( alchFepOn ) {
4159  BigReal alchLambda = simParams->getCurrentLambda(patch->flags.step);
4160  myMgr->alchLambda = alchLambda;
4161  BigReal alchLambda2 = simParams->getCurrentLambda2(patch->flags.step);
4162  myMgr->alchLambda2 = alchLambda2;
4163  if ( g == 0 ) scale = alchLambda;
4164  else if ( g == 1 ) scale = 1. - alchLambda;
4165  } else if ( lesOn ) {
4166  scale = 1.0 / (float)lesFactor;
4167  }
4168  int nga = 0;
4169  for(int i=0; i<numLocalAtoms; ++i) {
4170  if ( localPartition[i] == 0 || localPartition[i] == (g+1) ) {
4171  localResults[i] += gridResults[nga++] * scale;
4172  }
4173  }
4174  } else if ( selfOn ) {
4175  PmeParticle *lgd = localGridData[g];
4176  int nga = 0;
4177  for(int i=0; i<numLocalAtoms; ++i) {
4178  if ( localPartition[i] == 1 ) {
4179  pairForce += gridResults[nga]; // should add up to almost zero
4180  localResults[i] += gridResults[nga++];
4181  }
4182  }
4183  } else if ( pairOn ) {
4184  if ( g == 0 ) {
4185  int nga = 0;
4186  for(int i=0; i<numLocalAtoms; ++i) {
4187  if ( localPartition[i] == 1 ) {
4188  pairForce += gridResults[nga];
4189  }
4190  if ( localPartition[i] == 1 || localPartition[i] == 2 ) {
4191  localResults[i] += gridResults[nga++];
4192  }
4193  }
4194  } else if ( g == 1 ) {
4195  int nga = 0;
4196  for(int i=0; i<numLocalAtoms; ++i) {
4197  if ( localPartition[i] == g ) {
4198  pairForce -= gridResults[nga]; // should add up to almost zero
4199  localResults[i] -= gridResults[nga++];
4200  }
4201  }
4202  } else {
4203  int nga = 0;
4204  for(int i=0; i<numLocalAtoms; ++i) {
4205  if ( localPartition[i] == g ) {
4206  localResults[i] -= gridResults[nga++];
4207  }
4208  }
4209  }
4210  }
4211  }
4212  }
4213 
4214  Vector *results_ptr = localResults;
4215 
4216  // add in forces
4217  {
4218  Results *r = forceBox->open();
4219  Force *f = r->f[Results::slow];
4220  int numAtoms = patch->getNumAtoms();
4221 
4222  if ( ! myMgr->strayChargeErrors && ! simParams->commOnly ) {
4223  for(int i=0; i<numAtoms; ++i) {
4224  f[i].x += results_ptr->x;
4225  f[i].y += results_ptr->y;
4226  f[i].z += results_ptr->z;
4227  ++results_ptr;
4228  }
4229  }
4230  forceBox->close(&r);
4231  }
4232 
4233  if ( pairOn || selfOn ) {
4234  ADD_VECTOR_OBJECT(myMgr->reduction,REDUCTION_PAIR_ELECT_FORCE,pairForce);
4235  }
4236 
4237 }
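// --- Illustrative sketch (editor's addition, not NAMD source): merging
// forces from several PME grids, as in the alchemical branches above.  Each
// grid g carries a lambda-dependent weight, and only atoms whose partition
// participates in grid g consumed a slot in that grid's compact result
// array, hence the separate nga cursor.  Toy standalone version with two
// grids; the scale values stand in for simParams->getElecLambda():
#include <vector>

struct Vec { double x, y, z; };

int main() {
  const int natoms = 3;
  int part[natoms] = {0, 1, 2};              // alchemical partition per atom
  double lambda = 0.3;
  double scale[2] = {lambda, 1.0 - lambda};  // hypothetical per-grid weights
  std::vector<Vec> local(natoms, Vec{0, 0, 0});
  for (int g = 0; g < 2; ++g) {
    std::vector<Vec> gridRes(natoms, Vec{1, 0, 0});  // pretend per-grid forces
    int nga = 0;                             // compact cursor into gridRes
    for (int i = 0; i < natoms; ++i)
      if (part[i] == 0 || part[i] == g + 1) {  // atom present on this grid
        local[i].x += gridRes[nga].x * scale[g];
        local[i].y += gridRes[nga].y * scale[g];
        local[i].z += gridRes[nga].z * scale[g];
        ++nga;
      }
  }
  return 0;
}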
4238 
4239 void ComputePmeMgr::submitReductions() {
4240 
4241  SimParameters *simParams = Node::Object()->simParameters;
4242 
4243  for ( int g=0; g<numGrids; ++g ) {
4244  float scale = 1.;
4245  if (alchOn) {
4246  BigReal elecLambdaUp, elecLambdaDown;
4247  // alchLambda set on each step in ComputePme::ungridForces()
4248  if ( alchLambda < 0 || alchLambda > 1 ) {
4249  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4250  }
4251  elecLambdaUp = simParams->getElecLambda(alchLambda);
4252  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4253  if ( g == 0 ) scale = elecLambdaUp;
4254  else if ( g == 1 ) scale = elecLambdaDown;
4255  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4256  if (alchDecouple) {
4257  if ( g == 2 ) scale = 1-elecLambdaUp;
4258  else if ( g == 3 ) scale = 1-elecLambdaDown;
4259  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4260  }
4261  } else if ( lesOn ) {
4262  scale = 1.0 / lesFactor;
4263  } else if ( pairOn ) {
4264  scale = ( g == 0 ? 1. : -1. );
4265  }
4266  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4267  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4268  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4269  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4270  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4271  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4272  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4273  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4274  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4275  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4276 
4277  float scale2 = 0.;
4278 
4279  // why is this declared/defined again here?
4280  SimParameters *simParams = Node::Object()->simParameters;
4281 
4282  if (alchFepOn) {
4283  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4284  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4285  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4286  if ( g == 0 ) scale2 = elecLambda2Up;
4287  else if ( g == 1 ) scale2 = elecLambda2Down;
4288  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4289  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4290  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4291  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4292  }
4293  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4294 
4295  if (alchThermIntOn) {
4296 
4297  // no decoupling:
4298  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4299  // (interactions between all atoms [partition 0 OR partition 1],
4300  // minus all [within partition 0])
4301  // U = elecLambdaUp * (U[0] - U[2])
4302  // dU/dl = U[0] - U[2];
4303 
4304  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4305  // (interactions between all atoms [partition 0 OR partition 2],
4306  // minus all [within partition 0])
4307  // U = elecLambdaDown * (U[1] - U[2])
4308  // dU/dl = U[1] - U[2];
4309 
4310  // alchDecouple:
4311  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4312  // (interactions between all atoms [partition 0 OR partition 1]
4313  // minus all [within partition 1] minus all [within partition 0]
4314  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4315  // dU/dl = U[0] - U[2] - U[4];
4316 
4317  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4318  // (interactions between all atoms [partition 0 OR partition 2]
4319  // minus all [within partition 2] minus all [within partition 0]
4320  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4321  // dU/dl = U[1] - U[3] - U[4];
4322 
4323 
4324  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4325  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4326  if (!alchDecouple) {
4327  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4328  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4329  }
4330  else { // alchDecouple
4331  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4332  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4333  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4334  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4335  }
4336  }
4337  }
4338 
4339  alchLambda = -1.; // illegal value to catch if not updated
4340 
4341  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4342  reduction->submit();
4343 
4344  for ( int i=0; i<heldComputes.size(); ++i ) {
4345  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4346  }
4347  heldComputes.resize(0);
4348 }
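// --- Illustrative sketch (editor's addition, not NAMD source): the evir
// layout consumed above.  Each grid reduces 7 numbers, the energy plus the
// 6 independent components of the symmetric virial tensor, which are fanned
// out to the 9 tensor slots (xy==yx, xz==zx, yz==zy) after scaling.
// Minimal standalone version with made-up values:
#include <cstdio>

int main() {
  double evir[7] = {/*E*/ -10.0, /*xx*/ 1, /*xy*/ 2, /*xz*/ 3,
                    /*yy*/ 4, /*yz*/ 5, /*zz*/ 6};
  double scale = 0.5;                 // hypothetical per-grid lambda weight
  double v[3][3] = {{evir[1], evir[2], evir[3]},
                    {evir[2], evir[4], evir[5]},
                    {evir[3], evir[5], evir[6]}};
  double energy = evir[0] * scale;
  for (int a = 0; a < 3; ++a)
    for (int b = 0; b < 3; ++b) v[a][b] *= scale;
  std::printf("E = %g, v_xy = v_yx = %g\n", energy, v[0][1]);
  return 0;
}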
4349 
4350 #if USE_TOPOMAP
4351 
4352 #define NPRIMES 8
4353 const static unsigned int NAMDPrimes[] = {
4354  3,
4355  5,
4356  7,
4357  11,
4358  13,
4359  17,
4360  19,
4361  23,
4362  29,
4363  31,
4364  37,
4365  59,
4366  73,
4367  93,
4368  113,
4369  157,
4370  307,
4371  617,
4372  1217 // This should be enough for 64K nodes of BGL.
4373 };
4374 
4375 #include "RecBisection.h"
4376 
4377 /***-----------------------------------------------------**********
4378  The Orthogonal Recursive Bisection strategy, which allocates PME
4379  objects close to the patches they communicate, and at the same
4380  time spreads them around the grid
4381 ****----------------------------------------------------------****/
4382 
4383 bool generateBGLORBPmePeList(int *pemap, int numPes,
4384  int *block_pes, int nbpes) {
4385 
4386  PatchMap *pmap = PatchMap::Object();
4387  int *pmemap = new int [CkNumPes()];
4388 
4389  if (pemap == NULL)
4390  return false;
4391 
4392  TopoManager tmgr;
4393 
4394  memset(pmemap, 0, sizeof(int) * CkNumPes());
4395 
4396  for(int count = 0; count < CkNumPes(); count++) {
4397  if(count < nbpes)
4398  pmemap[block_pes[count]] = 1;
4399 
4400  if(pmap->numPatchesOnNode(count)) {
4401  pmemap[count] = 1;
4402 
4403  //Assumes an XYZT mapping !!
4404  if(tmgr.hasMultipleProcsPerNode()) {
4405  pmemap[(count + CkNumPes()/2)% CkNumPes()] = 1;
4406  }
4407  }
4408  }
4409 
4410  if(numPes + nbpes + pmap->numNodesWithPatches() > CkNumPes())
4411  //NAMD_bug("PME ORB Allocator: Processors Unavailable\n");
4412  return false;
4413 
4414  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4415  Node *node = nd.ckLocalBranch();
4416  SimParameters *simParams = node->simParameters;
4417 
4418  //first split PME processors into patch groups
4419 
4420  int xsize = 0, ysize = 0, zsize = 0;
4421 
4422  xsize = tmgr.getDimNX();
4423  ysize = tmgr.getDimNY();
4424  zsize = tmgr.getDimNZ();
4425 
4426  int nx = xsize, ny = ysize, nz = zsize;
4427  DimensionMap dm;
4428 
4429  dm.x = 0;
4430  dm.y = 1;
4431  dm.z = 2;
4432 
4433  findOptimalDimensions(xsize, ysize, zsize, nx, ny, nz, dm);
4434 
4435  //group size processors have to be allocated to each YZ plane
4436  int group_size = numPes/nx;
4437  if(numPes % nx)
4438  group_size ++;
4439 
4440  int my_prime = NAMDPrimes[0];
4441  int density = (ny * nz)/group_size + 1;
4442  int count = 0;
4443 
4444  //Choose a suitable prime number
4445  for(count = 0; count < NPRIMES; count ++) {
4446  //Find a prime just greater than the density
4447  if(density < NAMDPrimes[count]) {
4448  my_prime = NAMDPrimes[count];
4449  break;
4450  }
4451  }
4452 
4453  if(count == NPRIMES)
4454  my_prime = NAMDPrimes[NPRIMES-1];
4455 
4456  //int gcount = numPes/2;
4457  int gcount = 0;
4458  int npme_pes = 0;
4459 
4460  int coord[3];
4461 
4462  for(int x = 0; x < nx; x++) {
4463  coord[0] = (x + nx/2)%nx;
4464 
4465  for(count=0; count < group_size && npme_pes < numPes; count++) {
4466  int dest = (count + 1) * my_prime;
4467  dest = dest % (ny * nz);
4468 
4469  coord[2] = dest / ny;
4470  coord[1] = dest - coord[2] * ny;
4471 
4472  //Locate where in the actual grid the processor is
4473  int destPe = coord[dm.x] + coord[dm.y] * xsize +
4474  coord[dm.z] * xsize* ysize;
4475 
4476  if(pmemap[destPe] == 0) {
4477  pemap[gcount++] = destPe;
4478  pmemap[destPe] = 1;
4479 
4480  if(tmgr.hasMultipleProcsPerNode())
4481  pmemap[(destPe + CkNumPes()/2) % CkNumPes()] = 1;
4482 
4483  npme_pes ++;
4484  }
4485  else {
4486  for(int pos = 1; pos < ny * nz; pos++) {
4487 
4488  coord[2] += pos / ny;
4489  coord[1] += pos % ny;
4490 
4491  coord[2] = coord[2] % nz;
4492  coord[1] = coord[1] % ny;
4493 
4494  int newdest = coord[dm.x] + coord[dm.y] * xsize +
4495  coord[dm.z] * xsize * ysize;
4496 
4497  if(pmemap[newdest] == 0) {
4498  pemap[gcount++] = newdest;
4499  pmemap[newdest] = 1;
4500 
4501  if(tmgr.hasMultipleProcsPerNode())
4502  pmemap[(newdest + CkNumPes()/2) % CkNumPes()] = 1;
4503 
4504  npme_pes ++;
4505  break;
4506  }
4507  }
4508  }
4509  }
4510 
4511  if(gcount == numPes)
4512  gcount = 0;
4513 
4514  if(npme_pes >= numPes)
4515  break;
4516  }
4517 
4518  delete [] pmemap;
4519 
4520  if(npme_pes != numPes)
4521  //NAMD_bug("ORB PME allocator failed\n");
4522  return false;
4523 
4524  return true;
4525 }
4526 
4527 #endif
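// --- Illustrative sketch (editor's addition, not NAMD source): the
// prime-stride placement used above.  Stepping through a plane of ny*nz
// slots with a prime stride larger than the required density spreads the
// chosen processors across the plane instead of clustering them, since the
// stride shares no factor with the plane size.  Standalone toy version:
#include <cstdio>

int main() {
  const int ny = 4, nz = 5, group_size = 6;
  const int prime = 7;   // just above density = (ny*nz)/group_size + 1
  for (int count = 0; count < group_size; ++count) {
    int dest = ((count + 1) * prime) % (ny * nz);
    std::printf("slot %d -> (y=%d, z=%d)\n", count, dest % ny, dest / ny);
  }
  return 0;
}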
4528 
4529 template <class T> class PmePencil : public T {
4530 public:
4531  PmePencil() {
4532  data = 0;
4533  work = 0;
4534  send_order = 0;
4535  needs_reply = 0;
4536 #if USE_PERSISTENT
4537  trans_handle = untrans_handle = ungrid_handle = NULL;
4538 #endif
4539  }
4540  ~PmePencil() {
4541 #ifdef NAMD_FFTW
4542  fftwf_free(data);
4543 #endif
4544  delete [] work;
4545  delete [] send_order;
4546  delete [] needs_reply;
4547  }
4548  void base_init(PmePencilInitMsg *msg) {
4549  imsg=0;
4550  imsgb=0;
4551  hasData=0;
4552  initdata = msg->data;
4553  }
4554  void order_init(int nBlocks) {
4555  send_order = new int[nBlocks];
4556  for ( int i=0; i<nBlocks; ++i ) send_order[i] = i;
4557  if ( Node::Object()->simParameters->PMESendOrder ) {
4558  std::sort(send_order,send_order+nBlocks,sortop_bit_reversed());
4559  } else {
4560  Random rand(CkMyPe());
4561  rand.reorder(send_order,nBlocks);
4562  }
4563  needs_reply = new int[nBlocks];
4564  offload = Node::Object()->simParameters->PMEOffload;
4565 }
4566  PmePencilInitMsgData initdata;
4567  Lattice lattice;
4568  PmeReduction evir;
4569  int sequence; // used for priorities
4570 #ifndef CmiMemoryAtomicType
4571  typedef int AtomicInt;
4572 #else
4573  typedef CmiMemoryAtomicInt AtomicInt;
4574 #endif
4575  AtomicInt imsg; // used in sdag code
4576  AtomicInt imsgb; // Node par uses distinct counter for back path
4577  int hasData; // used in message elimination
4578  int offload;
4579  float *data;
4580  float *work;
4581  int *send_order;
4582  int *needs_reply;
4583 #if USE_PERSISTENT
4584  PersistentHandle *trans_handle;
4585  PersistentHandle *untrans_handle;
4586  PersistentHandle *ungrid_handle;
4587 #endif
4588 };
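// --- Illustrative sketch (editor's addition, not NAMD source): why
// order_init() permutes send_order.  If every pencil sent to destination 0
// first, then 1, and so on, receivers would be hammered in lockstep;
// sorting the indices by their bit-reversed value (or shuffling them)
// staggers the targets.  The comparator below is an illustrative stand-in
// for sortop_bit_reversed, valid for indices below 2^16:
#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned bitrev16(unsigned v) {
  unsigned r = 0;
  for (int b = 0; b < 16; ++b) { r = (r << 1) | (v & 1); v >>= 1; }
  return r;
}

int main() {
  std::vector<int> order = {0, 1, 2, 3, 4, 5, 6, 7};
  std::sort(order.begin(), order.end(),
            [](int a, int b) { return bitrev16(a) < bitrev16(b); });
  for (int i : order) std::printf("%d ", i);  // prints 0 4 2 6 1 5 3 7
  std::printf("\n");
  return 0;
}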
4589 
4590 class PmeZPencil : public PmePencil<CBase_PmeZPencil> {
4591 public:
4592  PmeZPencil_SDAG_CODE
4593  PmeZPencil() { __sdag_init(); setMigratable(false); }
4594  PmeZPencil(CkMigrateMessage *) { __sdag_init(); setMigratable (false); imsg=imsgb=0;}
4595  ~PmeZPencil() {
4596  #ifdef NAMD_FFTW
4597  #ifdef NAMD_FFTW_3
4598  delete [] forward_plans;
4599  delete [] backward_plans;
4600  #endif
4601  #endif
4602  }
4603  void fft_init();
4604  void recv_grid(const PmeGridMsg *);
4605  void forward_fft();
4606  void send_trans();
4607  void send_subset_trans(int fromIdx, int toIdx);
4608  void recv_untrans(const PmeUntransMsg *);
4609  void recvNodeAck(PmeAckMsg *);
4610  void node_process_untrans(PmeUntransMsg *);
4611  void node_process_grid(PmeGridMsg *);
4612  void backward_fft();
4613  void send_ungrid(PmeGridMsg *);
4614  void send_all_ungrid();
4615  void send_subset_ungrid(int fromIdx, int toIdx);
4616 private:
4617  ResizeArray<PmeGridMsg *> grid_msgs;
4618  ResizeArray<int> work_zlist;
4619 #ifdef NAMD_FFTW
4620 #ifdef NAMD_FFTW_3
4621  fftwf_plan forward_plan, backward_plan;
4622 
4623  //for ckloop usage
4624  int numPlans;
4625  fftwf_plan *forward_plans, *backward_plans;
4626 #else
4627  rfftwnd_plan forward_plan, backward_plan;
4628 #endif
4629 #endif
4630 
4631  int nx, ny;
4632 #if USE_PERSISTENT
4633  void setup_persistent() {
4634  int hd = 1;// ( hasData ? 1 : 0 );
4635  int zBlocks = initdata.zBlocks;
4636  int block3 = initdata.grid.block3;
4637  int dim3 = initdata.grid.dim3;
4638  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4639  CmiAssert(yPencil_local);
4640  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * zBlocks);
4641  for ( int isend=0; isend<zBlocks; ++isend ) {
4642  int kb = send_order[isend];
4643  int nz1 = block3;
4644  if ( (kb+1)*block3 > dim3/2 ) nz1 = dim3/2 - kb*block3;
4645  int peer = yPencil_local->homePe(CkArrayIndex3D(thisIndex.x, 0, kb));
4646  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny*nz1*2 +sizeof( envelope)+PRIORITY_SIZE/8+24;
4647  int compress_start = sizeof(PmeTransMsg)+sizeof(envelope);
4648  int compress_size = sizeof(float)*hd*nx*ny*nz1*2;
4649  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4650  }
4651  }
4652 
4653  void setup_ungrid_persistent()
4654  {
4655  ungrid_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * grid_msgs.size());
4656  int limsg;
4657  for ( limsg=0; limsg < grid_msgs.size(); ++limsg ) {
4658  int peer = grid_msgs[limsg]->sourceNode;
4659  //ungrid_handle[limsg] = CmiCreatePersistent(peer, 0);
4660  }
4661  imsg = limsg;
4662  }
4663 #endif
4664 };
4665 
4666 class PmeYPencil : public PmePencil<CBase_PmeYPencil> {
4667 public:
4668  PmeYPencil_SDAG_CODE
4669  PmeYPencil() { __sdag_init(); setMigratable(false); imsg=imsgb=0;}
4670  PmeYPencil(CkMigrateMessage *) { __sdag_init(); }
4671  void fft_init();
4672  void recv_trans(const PmeTransMsg *);
4673  void forward_fft();
4674  void forward_subset_fft(int fromIdx, int toIdx);
4675  void send_trans();
4676  void send_subset_trans(int fromIdx, int toIdx);
4677  void recv_untrans(const PmeUntransMsg *);
4678  void node_process_trans(PmeTransMsg *);
4679  void recvNodeAck(PmeAckMsg *);
4680  void node_process_untrans(PmeUntransMsg *);
4681  void backward_fft();
4682  void backward_subset_fft(int fromIdx, int toIdx);
4683  void send_untrans();
4684  void send_subset_untrans(int fromIdx, int toIdx);
4685 private:
4686 #ifdef NAMD_FFTW
4687 #ifdef NAMD_FFTW_3
4688  fftwf_plan forward_plan, backward_plan;
4689 #else
4690  fftw_plan forward_plan, backward_plan;
4691 #endif
4692 #endif
4693 
4694  int nx, nz;
4695 #if USE_PERSISTENT
4696  void setup_persistent() {
4697  int yBlocks = initdata.yBlocks;
4698  int block2 = initdata.grid.block2;
4699  int K2 = initdata.grid.K2;
4700  int hd = 1;
4701  CkArray *xPencil_local = initdata.xPencil.ckLocalBranch();
4702  trans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4703  for ( int isend=0; isend<yBlocks; ++isend ) {
4704  int jb = send_order[isend];
4705  int ny1 = block2;
4706  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4707  int peer = xPencil_local->homePe(CkArrayIndex3D(0, jb, thisIndex.z));
4708  int size = sizeof(PmeTransMsg) + sizeof(float)*hd*nx*ny1*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4709  int compress_start = sizeof(PmeTransMsg)+sizeof( envelope);
4710  int compress_size = sizeof(float)*hd*nx*ny1*nz*2;
4711  trans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4712  }
4713 
4714  CkArray *zPencil_local = initdata.zPencil.ckLocalBranch();
4715  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * yBlocks);
4716  for ( int isend=0; isend<yBlocks; ++isend ) {
4717  int jb = send_order[isend];
4718  int ny1 = block2;
4719  if ( (jb+1)*block2 > K2 ) ny1 = K2 - jb*block2;
4720  int peer = zPencil_local->homePe(CkArrayIndex3D(thisIndex.x, jb, 0));
4721  int size= sizeof(PmeUntransMsg) + sizeof(float)*nx*ny1*nz*2 + sizeof( envelope) + PRIORITY_SIZE/8+24;
4722  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4723  int compress_size = sizeof(float)*nx*ny1*nz*2;
4724  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4725  }
4726  }
4727 #endif
4728 };
4729 
4730 class PmeXPencil : public PmePencil<CBase_PmeXPencil> {
4731 public:
4732  PmeXPencil_SDAG_CODE
4733  PmeXPencil() { __sdag_init(); myKSpace = 0; setMigratable(false); imsg=imsgb=0; recipEvirPe = -999; }
4734  PmeXPencil(CkMigrateMessage *) { __sdag_init(); }
4735  ~PmeXPencil() {
4736  #ifdef NAMD_FFTW
4737  #ifdef NAMD_FFTW_3
4738  delete [] forward_plans;
4739  delete [] backward_plans;
4740  #endif
4741  #endif
4742  }
4743  void fft_init();
4744  void recv_trans(const PmeTransMsg *);
4745  void forward_fft();
4746  void pme_kspace();
4747  void backward_fft();
4748  void send_untrans();
4749  void send_subset_untrans(int fromIdx, int toIdx);
4750  void node_process_trans(PmeTransMsg *);
4751 #ifdef NAMD_FFTW
4752 #ifdef NAMD_FFTW_3
4753  fftwf_plan forward_plan, backward_plan;
4754 
4755  int numPlans;
4756  fftwf_plan *forward_plans, *backward_plans;
4757 #else
4758  fftw_plan forward_plan, backward_plan;
4759 #endif
4760 #endif
4761  int ny, nz;
4762  int recipEvirPe;
4763  void evir_init();
4764  PmeKSpace *myKSpace;
4765 #if USE_PERSISTENT
4766  void setup_persistent() {
4767  int xBlocks = initdata.xBlocks;
4768  int block1 = initdata.grid.block1;
4769  int K1 = initdata.grid.K1;
4770  CkArray *yPencil_local = initdata.yPencil.ckLocalBranch();
4771  untrans_handle = (PersistentHandle*) malloc( sizeof(PersistentHandle) * xBlocks);
4772  for ( int isend=0; isend<xBlocks; ++isend ) {
4773  int ib = send_order[isend];
4774  int nx1 = block1;
4775  if ( (ib+1)*block1 > K1 ) nx1 = K1 - ib*block1;
4776  int peer = yPencil_local->procNum(CkArrayIndex3D(ib, 0, thisIndex.z));
4777  int size = sizeof(PmeUntransMsg) +
4778  sizeof(float)*nx1*ny*nz*2 +sizeof( envelope) + PRIORITY_SIZE/8+24;
4779  int compress_start = sizeof(PmeUntransMsg) + sizeof( envelope);
4780  int compress_size = sizeof(float)*nx1*ny*nz*2;
4781  untrans_handle[isend] = CmiCreateCompressPersistentSize(peer, size, compress_start, compress_size, CMI_FLOATING);
4782  }
4783  }
4784 #endif
4785 
4786 };
4787 
4788 void PmeXPencil::evir_init() {
4789  recipEvirPe = findRecipEvirPe();
4790  initdata.pmeProxy[recipEvirPe].addRecipEvirClient();
4791 }
4792 
4793 void PmeZPencil::fft_init() {
4794  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4795  Node *node = nd.ckLocalBranch();
4796  SimParameters *simParams = node->simParameters;
4797 
4798 #if USE_NODE_PAR_RECEIVE
4799  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerZPencil(thisIndex,this);
4800 #endif
4801 
4802  int K1 = initdata.grid.K1;
4803  int K2 = initdata.grid.K2;
4804  int K3 = initdata.grid.K3;
4805  int dim3 = initdata.grid.dim3;
4806  int block1 = initdata.grid.block1;
4807  int block2 = initdata.grid.block2;
4808 
4809  nx = block1;
4810  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4811  ny = block2;
4812  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
4813 
4814 #ifdef NAMD_FFTW
4815  CmiLock(ComputePmeMgr::fftw_plan_lock);
4816 
4817  data = (float *) fftwf_malloc( sizeof(float) *nx*ny*dim3);
4818  work = new float[dim3];
4819 
4820  order_init(initdata.zBlocks);
4821 
4822 #ifdef NAMD_FFTW_3
4823  /* the fftwf_plan_many interface needs an array of transform sizes */
4824 
4825  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4826  int sizeLines=nx*ny;
4827  int planLineSizes[1];
4828  planLineSizes[0]=K3;
4829  int ndim=initdata.grid.dim3; // storage space is initdata.grid.dim3
4830  int ndimHalf=ndim/2;
4831  forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, sizeLines,
4832  (float *) data, NULL, 1,
4833  ndim,
4834  (fftwf_complex *) data, NULL, 1,
4835  ndimHalf,
4836  fftwFlags);
4837 
4838  backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, sizeLines,
4839  (fftwf_complex *) data, NULL, 1,
4840  ndimHalf,
4841  (float *) data, NULL, 1,
4842  ndim,
4843  fftwFlags);
4844 #if CMK_SMP && USE_CKLOOP
4845  if(simParams->useCkLoop) {
4846  //How many FFT plans should be created? This is a grain-size tradeoff.
4847  //Currently we choose min(nx, ny) plans as the coarse grain
4848  numPlans = (nx<=ny?nx:ny);
4849  if ( numPlans < CkMyNodeSize() ) numPlans = (nx>=ny?nx:ny);
4850  if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
4851  int howmany = sizeLines/numPlans;
4852  forward_plans = new fftwf_plan[numPlans];
4853  backward_plans = new fftwf_plan[numPlans];
4854  for(int i=0; i<numPlans; i++) {
4855  int dimStride = i*ndim*howmany;
4856  int dimHalfStride = i*ndimHalf*howmany;
4857  forward_plans[i] = fftwf_plan_many_dft_r2c(1, planLineSizes, howmany,
4858  ((float *)data)+dimStride, NULL, 1,
4859  ndim,
4860  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4861  ndimHalf,
4862  fftwFlags);
4863 
4864  backward_plans[i] = fftwf_plan_many_dft_c2r(1, planLineSizes, howmany,
4865  ((fftwf_complex *)data)+dimHalfStride, NULL, 1,
4866  ndimHalf,
4867  ((float *)data)+dimStride, NULL, 1,
4868  ndim,
4869  fftwFlags);
4870  }
4871  }else
4872 #endif
4873  {
4874  forward_plans = NULL;
4875  backward_plans = NULL;
4876  }
4877 #else
4878  forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
4879  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4880  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4881  backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
4882  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4883  | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
4884 #endif
4885  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4886 #else
4887  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4888 #endif
4889 
4890 #if USE_NODE_PAR_RECEIVE
4891  evir = 0.;
4892  memset(data, 0, sizeof(float) * nx*ny*dim3);
4893 #endif
4894 }
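// --- Illustrative sketch (editor's addition, not NAMD source): the in-place
// batched r2c plan created above, reduced to a standalone program.  Each of
// the nx*ny grid lines has K3 real samples stored with padding
// dim3 = 2*(K3/2+1) so the complex half-spectrum fits in the same buffer.
// Sizes here are arbitrary; link with -lfftw3f:
#include <fftw3.h>

int main() {
  const int nlines = 6, K3 = 16;
  const int ndim = 2 * (K3 / 2 + 1);          // padded real length, cf. dim3
  float *data = (float *) fftwf_malloc(sizeof(float) * nlines * ndim);
  for (int i = 0; i < nlines * ndim; ++i) data[i] = 0.f;
  int n[1] = {K3};
  fftwf_plan fwd = fftwf_plan_many_dft_r2c(
      1, n, nlines,
      data, NULL, 1, ndim,                        // real input, line pitch ndim
      (fftwf_complex *) data, NULL, 1, ndim / 2,  // complex output, in place
      FFTW_ESTIMATE);
  fftwf_execute(fwd);
  fftwf_destroy_plan(fwd);
  fftwf_free(data);
  return 0;
}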
4895 
4896 void PmeYPencil::fft_init() {
4897  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
4898  Node *node = nd.ckLocalBranch();
4899  SimParameters *simParams = node->simParameters;
4900 
4901 #if USE_NODE_PAR_RECEIVE
4902  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerYPencil(thisIndex,this);
4903 #endif
4904 
4905  int K1 = initdata.grid.K1;
4906  int K2 = initdata.grid.K2;
4907  int dim2 = initdata.grid.dim2;
4908  int dim3 = initdata.grid.dim3;
4909  int block1 = initdata.grid.block1;
4910  int block3 = initdata.grid.block3;
4911 
4912  nx = block1;
4913  if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
4914  nz = block3;
4915  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
4916 
4917 #ifdef NAMD_FFTW
4918  CmiLock(ComputePmeMgr::fftw_plan_lock);
4919 
4920  data = (float *) fftwf_malloc( sizeof(float) * nx*dim2*nz*2);
4921  work = new float[2*K2];
4922 
4923  order_init(initdata.yBlocks);
4924 
4925 #ifdef NAMD_FFTW_3
4926  /* need array of sizes for the dimensions */
4927  /* ideally this should be implementable as a single multidimensional
4928  * plan, but that has proven tricky to implement, so we maintain the
4929  * loop of 1d plan executions. */
4930  int sizeLines=nz;
4931  int planLineSizes[1];
4932  planLineSizes[0]=K2;
4933  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
4934  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4935  (fftwf_complex *) data, NULL, sizeLines, 1,
4936  (fftwf_complex *) data, NULL, sizeLines, 1,
4937  FFTW_FORWARD,
4938  fftwFlags);
4939  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
4940  (fftwf_complex *) data, NULL, sizeLines, 1,
4941  (fftwf_complex *) data, NULL, sizeLines, 1,
4942  FFTW_BACKWARD,
4943  fftwFlags);
4944  CkAssert(forward_plan != NULL);
4945  CkAssert(backward_plan != NULL);
4946 #else
4947  forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
4948  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4949  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4950  nz, (fftw_complex *) work, 1);
4951  backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
4952  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
4953  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
4954  nz, (fftw_complex *) work, 1);
4955 #endif
4956  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
4957 #else
4958  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
4959 #endif
4960 
4961 #if USE_NODE_PAR_RECEIVE
4962  evir = 0;
4963  CmiMemoryWriteFence();
4964 #endif
4965 }
4966 
4967 void PmeYPencil::node_process_trans(PmeTransMsg *msg)
4968 {
4969  if ( msg->hasData ) hasData = 1;
4970  needs_reply[msg->sourceNode] = msg->hasData;
4971  recv_trans(msg);
4972  int limsg;
4973  CmiMemoryAtomicFetchAndInc(imsg,limsg);
4974  if(limsg+1 == initdata.yBlocks)
4975  {
4976  if ( hasData ) {
4977  forward_fft();
4978  }
4979  send_trans();
4980  imsg=0;
4981  CmiMemoryWriteFence();
4982  }
4983 }
4984 
4985 void PmeYPencil::recvNodeAck(PmeAckMsg *msg) {
4986  delete msg;
4987  node_process_untrans(NULL);
4988 }
4989 
4990 void PmeYPencil::node_process_untrans(PmeUntransMsg *msg)
4991 {
4992  if ( msg ) {
4993  if ( ! hasData ) NAMD_bug("PmeYPencil::node_process_untrans non-null msg but not hasData");
4994  recv_untrans(msg);
4995  } else if ( hasData ) NAMD_bug("PmeYPencil::node_process_untrans hasData but null msg");
4996  int limsg;
4997  CmiMemoryAtomicFetchAndInc(imsgb,limsg);
4998  if(limsg+1 == initdata.yBlocks)
4999  {
5000  if ( hasData ) {
5001  backward_fft();
5002  }
5003  hasData=0;
5004  imsgb=0;
5005  CmiMemoryWriteFence();
5006  send_untrans();
5007  }
5008 }
5009 
5010 #define DEBUG_NODE_PAR_RECV 0
5011 
5012 void NodePmeMgr::recvXTrans(PmeTransMsg *msg) {
5013  // CkPrintf("[%d] NodePmeMgr recvXTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5014  PmeXPencil *target=xPencilObj.get(msg->destElem);
5015 #if DEBUG_NODE_PAR_RECV
5016  if(target == NULL)
5017  CkAbort("xpencil in recvXTrans not found, debug registeration");
5018 #endif
5019  target->node_process_trans(msg);
5020  delete msg;
5021 }
5022 
5023 
5024 void NodePmeMgr::recvYTrans(PmeTransMsg *msg) {
5025  // CkPrintf("[%d] NodePmeMgr recvYTrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5026  PmeYPencil *target=yPencilObj.get(msg->destElem);
5027 #if DEBUG_NODE_PAR_RECV
5028  if(target == NULL)
5029  CkAbort("ypencil in recvYTrans not found, debug registeration");
5030 #endif
5031  target->node_process_trans(msg);
5032  delete msg;
5033  }
5034 void NodePmeMgr::recvYUntrans(PmeUntransMsg *msg) {
5035  // CkPrintf("[%d] NodePmeMgr recvYUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5036  PmeYPencil *target=yPencilObj.get(msg->destElem);
5037 #if DEBUG_NODE_PAR_RECV
5038  if(target == NULL)
5039  CkAbort("ypencil in recvYUntrans not found, debug registeration");
5040 #endif
5041  target->node_process_untrans(msg);
5042  delete msg;
5043  }
5044 void NodePmeMgr::recvZUntrans(PmeUntransMsg *msg) {
5045  //CkPrintf("[%d] NodePmeMgr recvZUntrans for %d %d %d\n",CkMyPe(),msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5046  PmeZPencil *target=zPencilObj.get(msg->destElem);
5047 #if DEBUG_NODE_PAR_RECV
5048  if(target == NULL)
5049  CkAbort("zpencil in recvZUntrans not found, debug registeration");
5050 #endif
5051  target->node_process_untrans(msg);
5052  delete msg;
5053 }
5054 
5055 void NodePmeMgr::recvZGrid(PmeGridMsg *msg) {
5056  //CkPrintf("[%d] NodePmeMgr %p recvGrid for %d %d %d\n",CkMyPe(),this,msg->destElem.index[0],msg->destElem.index[1],msg->destElem.index[2]);
5057  PmeZPencil *target=zPencilObj.get(msg->destElem);
5058 #if DEBUG_NODE_PAR_RECV
5059  if(target == NULL){
5060  CkAbort("zpencil in recvZGrid not found, debug registeration");
5061  }
5062 #endif
5063  target->node_process_grid(msg); //msg is stored inside node_process_grid
5064 }
5065 
5066 void PmeXPencil::fft_init() {
5067  CProxy_Node nd(CkpvAccess(BOCclass_group).node);
5068  Node *node = nd.ckLocalBranch();
5069  SimParameters *simParams = node->simParameters;
5070 #if USE_NODE_PAR_RECEIVE
5071  ((NodePmeMgr *)CkLocalNodeBranch(initdata.pmeNodeProxy))->registerXPencil(thisIndex,this);
5072 #endif
5073 
5074  int K1 = initdata.grid.K1;
5075  int K2 = initdata.grid.K2;
5076  int dim3 = initdata.grid.dim3;
5077  int block2 = initdata.grid.block2;
5078  int block3 = initdata.grid.block3;
5079 
5080  ny = block2;
5081  if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
5082  nz = block3;
5083  if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
5084 
5085 #ifdef NAMD_FFTW
5086  CmiLock(ComputePmeMgr::fftw_plan_lock);
5087 
5088  data = (float *) fftwf_malloc( sizeof(float) * K1*ny*nz*2);
5089  work = new float[2*K1];
5090 
5091  order_init(initdata.xBlocks);
5092 
5093 #ifdef NAMD_FFTW_3
5094  /* the fftwf_plan_many interface needs an array of transform sizes */
5095  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
5096  int sizeLines=ny*nz;
5097  int planLineSizes[1];
5098  planLineSizes[0]=K1;
5099  forward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5100  (fftwf_complex *) data, NULL, sizeLines, 1,
5101  (fftwf_complex *) data, NULL, sizeLines, 1,
5102  FFTW_FORWARD,
5103  fftwFlags);
5104  backward_plan = fftwf_plan_many_dft(1, planLineSizes, sizeLines,
5105  (fftwf_complex *) data, NULL, sizeLines, 1,
5106  (fftwf_complex *) data, NULL, sizeLines, 1,
5107  FFTW_BACKWARD,
5108  fftwFlags);
5109 
5110 #if CMK_SMP && USE_CKLOOP
5111  if(simParams->useCkLoop) {
5112  //How many FFT plans should be created? This is a grain-size tradeoff.
5113  //Currently we choose min(ny, nz) plans as the coarse grain
5114  numPlans = (ny<=nz?ny:nz);
5115  // limit attempted parallelism due to false sharing
5116  //if ( numPlans < CkMyNodeSize() ) numPlans = (ny>=nz?ny:nz);
5117  //if ( numPlans < CkMyNodeSize() ) numPlans = sizeLines;
5118  if ( sizeLines/numPlans < 4 ) numPlans = 1;
5119  int howmany = sizeLines/numPlans;
5120  forward_plans = new fftwf_plan[numPlans];
5121  backward_plans = new fftwf_plan[numPlans];
5122  for(int i=0; i<numPlans; i++) {
5123  int curStride = i*howmany;
5124  forward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5125  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5126  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5127  FFTW_FORWARD,
5128  fftwFlags);
5129 
5130  backward_plans[i] = fftwf_plan_many_dft(1, planLineSizes, howmany,
5131  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5132  ((fftwf_complex *)data)+curStride, NULL, sizeLines, 1,
5133  FFTW_BACKWARD,
5134  fftwFlags);
5135  }
5136  }else
5137 #endif
5138  {
5139  forward_plans = NULL;
5140  backward_plans = NULL;
5141  }
5142 #else
5143  forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
5144  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5145  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5146  ny*nz, (fftw_complex *) work, 1);
5147  backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
5148  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
5149  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
5150  ny*nz, (fftw_complex *) work, 1);
5151 #endif
5152  CmiUnlock(ComputePmeMgr::fftw_plan_lock);
5153 #else
5154  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
5155 #endif
5156 
5157  myKSpace = new PmeKSpace(initdata.grid,
5158  thisIndex.y*block2, thisIndex.y*block2 + ny,
5159  thisIndex.z*block3, thisIndex.z*block3 + nz);
5160 
5161 }
5162 
5163 // #define FFTCHECK // run a grid of integers through the fft
5164 // #define ZEROCHECK // check for suspicious zeros in fft
5165 
5166 void PmeZPencil::recv_grid(const PmeGridMsg *msg) {
5167 
5168  int dim3 = initdata.grid.dim3;
5169  if ( imsg == 0 ) {
5170  lattice = msg->lattice;
5171  sequence = msg->sequence;
5172 #if ! USE_NODE_PAR_RECEIVE
5173  memset(data, 0, sizeof(float)*nx*ny*dim3);
5174 #endif
5175  }
5176 
5177  if ( ! msg->hasData ) return;
5178 
5179  int zlistlen = msg->zlistlen;
5180 #ifdef NAMD_KNL
5181  int * __restrict msg_zlist = msg->zlist;
5182  int * __restrict zlist = (int*)__builtin_assume_aligned(work_zlist.begin(),
5183  64);
5184  for ( int k=0; k<zlistlen; ++k ) {
5185  zlist[k] = msg_zlist[k];
5186  }
5187 #else
5188  int * __restrict zlist = msg->zlist;
5189 #endif
5190  char * __restrict fmsg = msg->fgrid;
5191  float * __restrict qmsg = msg->qgrid;
5192  float * __restrict d = data;
5193  int numGrids = 1; // pencil FFT doesn't support multiple grids
5194  for ( int g=0; g<numGrids; ++g ) {
5195  for ( int i=0; i<nx; ++i ) {
5196  for ( int j=0; j<ny; ++j, d += dim3 ) {
5197  if( *(fmsg++) ) {
5198  #pragma ivdep
5199  for ( int k=0; k<zlistlen; ++k ) {
5200  d[zlist[k]] += *(qmsg++);
5201  }
5202  }
5203  }
5204  }
5205  }
5206 }
5207 
5208 static inline void PmeXZPencilFFT(int first, int last, void *result, int paraNum, void *param){
5209 #ifdef NAMD_FFTW
5210 #ifdef NAMD_FFTW_3
5211  fftwf_plan *plans = (fftwf_plan *)param;
5212  for(int i=first; i<=last; i++) fftwf_execute(plans[i]);
5213 #endif
5214 #endif
5215 }
5216 
5217 void PmeZPencil::forward_fft() {
5218  evir = 0.;
5219 #ifdef FFTCHECK
5220  int dim3 = initdata.grid.dim3;
5221  int K3 = initdata.grid.K3;
5222  float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
5223  float *d = data;
5224  for ( int i=0; i<nx; ++i ) {
5225  for ( int j=0; j<ny; ++j, d += dim3 ) {
5226  for ( int k=0; k<dim3; ++k ) {
5227  d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
5228  }
5229  }
5230  }
5231 #endif
5232 #ifdef NAMD_FFTW
5233 #ifdef MANUAL_DEBUG_FFTW3
5234  dumpMatrixFloat3("fw_z_b", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5235 #endif
5236 #ifdef NAMD_FFTW_3
5237 #if CMK_SMP && USE_CKLOOP
5238  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5239  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5240  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5241  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5242  //the CkLoop call below executes the loop above in parallel
5243  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5244  return;
5245  }
5246 #endif
5247  fftwf_execute(forward_plan);
5248 #else
5249  rfftwnd_real_to_complex(forward_plan, nx*ny,
5250  data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
5251 #endif
5252 #ifdef MANUAL_DEBUG_FFTW3
5253  dumpMatrixFloat3("fw_z_a", data, nx, ny, initdata.grid.dim3, thisIndex.x, thisIndex.y, thisIndex.z);
5254 #endif
5255 
5256 #endif
5257 #ifdef ZEROCHECK
5258  int dim3 = initdata.grid.dim3;
5259  int K3 = initdata.grid.K3;
5260  float *d = data;
5261  for ( int i=0; i<nx; ++i ) {
5262  for ( int j=0; j<ny; ++j, d += dim3 ) {
5263  for ( int k=0; k<dim3; ++k ) {
5264  if ( d[k] == 0. ) CkPrintf("0 in Z at %d %d %d %d %d %d %d %d %d\n",
5265  thisIndex.x, thisIndex.y, i, j, k, nx, ny, dim3);
5266  }
5267  }
5268  }
5269 #endif
5270 }
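// --- Illustrative sketch (editor's addition, not NAMD source): what
// CkLoop_Parallelize does with the per-chunk FFT plans above, mimicked here
// with std::thread.  The index range [0, numPlans) is cut into contiguous
// [first,last] chunks and each worker runs one chunk; CkLoop performs the
// same split across the PEs of an SMP node instead of raw threads:
#include <algorithm>
#include <thread>
#include <vector>

static void run_chunk(int first, int last) {
  for (int i = first; i <= last; ++i) { /* fftwf_execute(plans[i]) goes here */ }
}

int main() {
  const int numPlans = 8, nworkers = 4;
  std::vector<std::thread> pool;
  int per = (numPlans + nworkers - 1) / nworkers;
  for (int w = 0; w < nworkers; ++w) {
    int first = w * per;
    int last = std::min(first + per, numPlans) - 1;
    if (first <= last) pool.emplace_back(run_chunk, first, last);
  }
  for (auto &t : pool) t.join();
  return 0;
}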
5271 
5272 /* A single task for partitioned PmeZPencil::send_trans work */
5273 static inline void PmeZPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5274  PmeZPencil *zpencil = (PmeZPencil *)param;
5275  zpencil->send_subset_trans(first, last);
5276 }
5277 
5278 void PmeZPencil::send_subset_trans(int fromIdx, int toIdx){
5279  int zBlocks = initdata.zBlocks;
5280  int block3 = initdata.grid.block3;
5281  int dim3 = initdata.grid.dim3;
5282  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5283  int kb = send_order[isend];
5284  int nz = block3;
5285  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5286  int hd = ( hasData ? 1 : 0 );
5287  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5288  msg->lattice = lattice;
5289  msg->sourceNode = thisIndex.y;
5290  msg->hasData = hasData;
5291  msg->nx = ny;
5292  if ( hasData ) {
5293  float *md = msg->qgrid;
5294  const float *d = data;
5295  for ( int i=0; i<nx; ++i ) {
5296  for ( int j=0; j<ny; ++j, d += dim3 ) {
5297  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5298  *(md++) = d[2*k];
5299  *(md++) = d[2*k+1];
5300  }
5301  }
5302  }
5303  }
5304  msg->sequence = sequence;
5305  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5306 
5307  CmiEnableUrgentSend(1);
5308 #if USE_NODE_PAR_RECEIVE
5309  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5310 #if Y_PERSIST
5311  CmiUsePersistentHandle(&trans_handle[isend], 1);
5312 #endif
5313  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5314 #if Y_PERSIST
5315  CmiUsePersistentHandle(NULL, 0);
5316 #endif
5317 #else
5318 #if Y_PERSIST
5319  CmiUsePersistentHandle(&trans_handle[isend], 1);
5320 #endif
5321  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5322 #if Y_PERSIST
5323  CmiUsePersistentHandle(NULL, 0);
5324 #endif
5325 #endif
5326  CmiEnableUrgentSend(0);
5327  }
5328 }
5329 
5330 void PmeZPencil::send_trans() {
5331 #if USE_PERSISTENT
5332  if (trans_handle == NULL) setup_persistent();
5333 #endif
5334 #if CMK_SMP && USE_CKLOOP
5335  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5336  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5337  && CkNumPes() >= 2 * initdata.xBlocks * initdata.yBlocks) {
5344  //send_subset_trans(0, initdata.zBlocks-1);
5345  CkLoop_Parallelize(PmeZPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.zBlocks-1, 1); //not sync
5346  return;
5347  }
5348 #endif
5349  int zBlocks = initdata.zBlocks;
5350  int block3 = initdata.grid.block3;
5351  int dim3 = initdata.grid.dim3;
5352  for ( int isend=0; isend<zBlocks; ++isend ) {
5353  int kb = send_order[isend];
5354  int nz = block3;
5355  if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
5356  int hd = ( hasData ? 1 : 0 );
5357  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5358  msg->lattice = lattice;
5359  msg->sourceNode = thisIndex.y;
5360  msg->hasData = hasData;
5361  msg->nx = ny;
5362  if ( hasData ) {
5363  float *md = msg->qgrid;
5364  const float *d = data;
5365  for ( int i=0; i<nx; ++i ) {
5366  for ( int j=0; j<ny; ++j, d += dim3 ) {
5367  for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
5368  *(md++) = d[2*k];
5369  *(md++) = d[2*k+1];
5370  }
5371  }
5372  }
5373  }
5374  msg->sequence = sequence;
5375  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5376 
5377  CmiEnableUrgentSend(1);
5378 #if USE_NODE_PAR_RECEIVE
5379  msg->destElem=CkArrayIndex3D(thisIndex.x,0,kb);
5380 #if Y_PERSIST
5381  CmiUsePersistentHandle(&trans_handle[isend], 1);
5382 #endif
5383  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYTrans(msg);
5384 #if Y_PERSIST
5385  CmiUsePersistentHandle(NULL, 0);
5386 #endif
5387 #else
5388 #if Y_PERSIST
5389  CmiUsePersistentHandle(&trans_handle[isend], 1);
5390 #endif
5391  initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
5392 #if Y_PERSIST
5393  CmiUsePersistentHandle(NULL, 0);
5394 #endif
5395 #endif
5396  CmiEnableUrgentSend(0);
5397  }
5398 }
5399 
5400 void PmeYPencil::recv_trans(const PmeTransMsg *msg) {
5401  if ( imsg == 0 ) {
5402  lattice = msg->lattice;
5403  sequence = msg->sequence;
5404  }
5405  int block2 = initdata.grid.block2;
5406  int K2 = initdata.grid.K2;
5407  int jb = msg->sourceNode;
5408  int ny = msg->nx;
5409  if ( msg->hasData ) {
5410  const float *md = msg->qgrid;
5411  float *d = data;
5412  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5413  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5414  for ( int k=0; k<nz; ++k ) {
5415 #ifdef ZEROCHECK
5416  if ( (*md) == 0. ) CkPrintf("0 in ZY at %d %d %d %d %d %d %d %d %d\n",
5417  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5418 #endif
5419  d[2*(j*nz+k)] = *(md++);
5420  d[2*(j*nz+k)+1] = *(md++);
5421  }
5422  }
5423  }
5424  } else {
5425  float *d = data;
5426  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5427  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5428  for ( int k=0; k<nz; ++k ) {
5429  d[2*(j*nz+k)] = 0;
5430  d[2*(j*nz+k)+1] = 0;
5431  }
5432  }
5433  }
5434  }
5435 }
5436 
5437 static inline void PmeYPencilForwardFFT(int first, int last, void *result, int paraNum, void *param){
5438  PmeYPencil *ypencil = (PmeYPencil *)param;
5439  ypencil->forward_subset_fft(first, last);
5440 }
5441 void PmeYPencil::forward_subset_fft(int fromIdx, int toIdx) {
5442 #ifdef NAMD_FFTW
5443 #ifdef NAMD_FFTW_3
5444  for(int i=fromIdx; i<=toIdx; i++){
5445  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5446  * nz * initdata.grid.K2,
5447  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5448  }
5449 #endif
5450 #endif
5451 }
5452 
5453 void PmeYPencil::forward_fft() {
5454  evir = 0.;
5455 #ifdef NAMD_FFTW
5456 #ifdef MANUAL_DEBUG_FFTW3
5457  dumpMatrixFloat3("fw_y_b", data, nx, initdata.grid.K2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5458 #endif
5459 
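 // Editor's note: the forward transform along y runs in place, one multi-FFT
 // per x-plane at stride nz*K2 into the data array. The CkLoop branch below
 // splits the nx planes across the node, but only when there are at least
 // twice as many PEs as (presumably) y pencils, i.e. xBlocks * zBlocks.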
5460 #ifdef NAMD_FFTW_3
5461 #if CMK_SMP && USE_CKLOOP
5462  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5463  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5464  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5465  CkLoop_Parallelize(PmeYPencilForwardFFT, 1, (void *)this, CkMyNodeSize(), 0, nx-1); //sync
5466  return;
5467  }
5468 #endif
5470  // serial fallback: the same loop that CkLoop_Parallelize executes in parallel above
5470  for ( int i=0; i<nx; ++i ) {
5471  fftwf_execute_dft(forward_plan, ((fftwf_complex *) data) + i
5472  * nz * initdata.grid.K2,
5473  ((fftwf_complex *) data) + i * nz * initdata.grid.K2);
5474  }
5475 #else
5476  for ( int i=0; i<nx; ++i ) {
5477  fftw(forward_plan, nz,
5478  ((fftw_complex *) data) + i * nz * initdata.grid.K2,
5479  nz, 1, (fftw_complex *) work, 1, 0);
5480  }
5481 #endif
5482 #ifdef MANUAL_DEBUG_FFTW3
5483  dumpMatrixFloat3("fw_y_a", data, nx, initdata.grid.dim2, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5484 #endif
5485 
5486 #endif
5487 }
5488 
5489 static inline void PmeYPencilSendTrans(int first, int last, void *result, int paraNum, void *param){
5490  PmeYPencil *ypencil = (PmeYPencil *)param;
5491  ypencil->send_subset_trans(first, last);
5492 }
5493 
5494 void PmeYPencil::send_subset_trans(int fromIdx, int toIdx){
5495  int yBlocks = initdata.yBlocks;
5496  int block2 = initdata.grid.block2;
5497  int K2 = initdata.grid.K2;
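 // Y -> X transpose (editor's note): for destination y-block jb, copy the
 // columns j in [jb*block2, jb*block2+ny) of every x-plane, clamping ny for
 // the final partial block. The pointer check after the copy verifies that
 // exactly nx*ny*nz complex values were packed into the message.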
5498  for ( int isend=fromIdx; isend<=toIdx; ++isend ) {
5499  int jb = send_order[isend];
5500  int ny = block2;
5501  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5502  int hd = ( hasData ? 1 : 0 );
5503  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5504  msg->lattice = lattice;
5505  msg->sourceNode = thisIndex.x;
5506  msg->hasData = hasData;
5507  msg->nx = nx;
5508  if ( hasData ) {
5509  float *md = msg->qgrid;
5510  const float *d = data;
5511  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5512  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5513  for ( int k=0; k<nz; ++k ) {
5514  *(md++) = d[2*(j*nz+k)];
5515  *(md++) = d[2*(j*nz+k)+1];
5516  #ifdef ZEROCHECK
5517  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5518  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5519  #endif
5520  }
5521  }
5522  }
5523  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5524  thisIndex.x, jb, thisIndex.z);
5525  }
5526  msg->sequence = sequence;
5527  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5528  CmiEnableUrgentSend(1);
5529 #if USE_NODE_PAR_RECEIVE
5530  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5531 #if X_PERSIST
5532  CmiUsePersistentHandle(&trans_handle[isend], 1);
5533 #endif
5534  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5535 #if X_PERSIST
5536  CmiUsePersistentHandle(NULL, 0);
5537 #endif
5538 #else
5539 #if X_PERSIST
5540  CmiUsePersistentHandle(&trans_handle[isend], 1);
5541 #endif
5542  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5543 #if X_PERSIST
5544  CmiUsePersistentHandle(NULL, 0);
5545 #endif
5546 #endif
5547  CmiEnableUrgentSend(0);
5548  }
5549 }
5550 
5551 void PmeYPencil::send_trans() {
5552 #if USE_PERSISTENT
5553  if (trans_handle == NULL) setup_persistent();
5554 #endif
5555 #if CMK_SMP && USE_CKLOOP
5556  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5557  if(useCkLoop>=CKLOOP_CTRL_PME_SENDTRANS
5558  && CkNumPes() >= 2 * initdata.xBlocks * initdata.zBlocks) {
5565  //send_subset_trans(0, initdata.yBlocks-1);
5566  CkLoop_Parallelize(PmeYPencilSendTrans, 1, (void *)this, CkMyNodeSize(), 0, initdata.yBlocks-1, 1); //not sync
5567  return;
5568  }
5569 #endif
5570  int yBlocks = initdata.yBlocks;
5571  int block2 = initdata.grid.block2;
5572  int K2 = initdata.grid.K2;
5573  for ( int isend=0; isend<yBlocks; ++isend ) {
5574  int jb = send_order[isend];
5575  int ny = block2;
5576  if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
5577  int hd = ( hasData ? 1 : 0 );
5578  PmeTransMsg *msg = new (hd*nx*ny*nz*2,PRIORITY_SIZE) PmeTransMsg;
5579  msg->lattice = lattice;
5580  msg->sourceNode = thisIndex.x;
5581  msg->hasData = hasData;
5582  msg->nx = nx;
5583  if ( hasData ) {
5584  float *md = msg->qgrid;
5585  const float *d = data;
5586  for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
5587  for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
5588  for ( int k=0; k<nz; ++k ) {
5589  *(md++) = d[2*(j*nz+k)];
5590  *(md++) = d[2*(j*nz+k)+1];
5591 #ifdef ZEROCHECK
5592  if ( *(md-2) == 0. ) CkPrintf("send 0 in YX at %d %d %d %d %d %d %d %d %d\n",
5593  thisIndex.x, jb, thisIndex.z, i, j, k, nx, ny, nz);
5594 #endif
5595  }
5596  }
5597  }
5598  if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
5599  thisIndex.x, jb, thisIndex.z);
5600  }
5601  msg->sequence = sequence;
5602  SET_PRIORITY(msg,sequence,PME_TRANS_PRIORITY)
5603  CmiEnableUrgentSend(1);
5604 #if USE_NODE_PAR_RECEIVE
5605  msg->destElem=CkArrayIndex3D(0,jb,thisIndex.z);
5606 #if X_PERSIST
5607  CmiUsePersistentHandle(&trans_handle[isend], 1);
5608 #endif
5609  initdata.pmeNodeProxy[CmiNodeOf(initdata.xm.ckLocalBranch()->procNum(0,msg->destElem))].recvXTrans(msg);
5610 #if X_PERSIST
5611  CmiUsePersistentHandle(NULL, 0);
5612 #endif
5613 #else
5614 #if X_PERSIST
5615  CmiUsePersistentHandle(&trans_handle[isend], 1);
5616 #endif
5617  initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
5618 #if X_PERSIST
5619  CmiUsePersistentHandle(NULL, 0);
5620 #endif
5621 
5622 #endif
5623  CmiEnableUrgentSend(0);
5624  }
5625 }
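 // Editor's note (node-parallel receive path): imsg counts arriving
 // PmeTransMsg slabs with an atomic fetch-and-increment, since several PEs
 // of a node may deliver concurrently. Once all xBlocks slabs are in, the
 // pencil runs the forward FFT, the k-space energy/virial evaluation, and
 // the backward FFT, then starts returning data; the write fence publishes
 // the imsg reset to the other threads.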
5626 
5627 void PmeXPencil::node_process_trans(PmeTransMsg *msg)
5628 {
5629  if(msg->hasData) hasData=1;
5630  needs_reply[msg->sourceNode] = msg->hasData;
5631  recv_trans(msg);
5632  int limsg;
5633  CmiMemoryAtomicFetchAndInc(imsg,limsg);
5634  if(limsg+1 == initdata.xBlocks)
5635  {
5636  if(hasData){
5637  forward_fft();
5638  pme_kspace();
5639  backward_fft();
5640  }
5641  send_untrans();
5642  imsg=0;
5643  CmiMemoryWriteFence();
5644  }
5645 }
5646 
5647 void PmeXPencil::recv_trans(const PmeTransMsg *msg) {
5648  if ( imsg == 0 ) {
5649  lattice = msg->lattice;
5650  sequence = msg->sequence;
5651  }
5652  int block1 = initdata.grid.block1;
5653  int K1 = initdata.grid.K1;
5654  int ib = msg->sourceNode;
5655  int nx = msg->nx;
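 // Editor's note: the X pencil holds a [K1][ny][nz] complex array; the slab
 // from source block ib fills rows i in [ib*block1, ib*block1+nx), and an
 // empty message zeroes that range so stale data never enters the FFT.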
5656  if ( msg->hasData ) {
5657  const float *md = msg->qgrid;
5658  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5659  float *d = data + i*ny*nz*2;
5660  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5661  for ( int k=0; k<nz; ++k ) {
5662 #ifdef ZEROCHECK
5663  if ( (*md) == 0. ) CkPrintf("0 in YX at %d %d %d %d %d %d %d %d %d\n",
5664  ib, thisIndex.y, thisIndex.z, i, j, k, nx, ny, nz);
5665 #endif
5666  d[2*k] = *(md++);
5667  d[2*k+1] = *(md++);
5668  }
5669  }
5670  }
5671  } else {
5672  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5673  float *d = data + i*ny*nz*2;
5674  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5675  for ( int k=0; k<nz; ++k ) {
5676  d[2*k] = 0;
5677  d[2*k+1] = 0;
5678  }
5679  }
5680  }
5681  }
5682 }
5683 
5684 void PmeXPencil::forward_fft() {
5685 #ifdef NAMD_FFTW
5686 
5687 #ifdef MANUAL_DEBUG_FFTW3
5688  dumpMatrixFloat3("fw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5689 #endif
5690 
5691 #ifdef NAMD_FFTW_3
5692 #if CMK_SMP && USE_CKLOOP
5693  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5694  if(useCkLoop>=CKLOOP_CTRL_PME_FORWARDFFT
5695  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5696  //for(int i=0; i<numPlans; i++) fftwf_execute(forward_plans[i]);
5698  // CkLoop-parallelized version of the commented-out loop above
5698  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)forward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5699  return;
5700  }
5701 #endif
5702  fftwf_execute(forward_plan);
5703 #else
5704  fftw(forward_plan, ny*nz,
5705  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5706 #endif
5707 #ifdef MANUAL_DEBUG_FFTW3
5708  dumpMatrixFloat3("fw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5709 #endif
5710 
5711 #endif
5712 }
5713 
5714 void PmeXPencil::pme_kspace() {
5715 
5716  evir = 0.;
5717 
5718 #ifdef FFTCHECK
5719  return;
5720 #endif
5721 
5722  BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
5723 
5724  int useCkLoop = 0;
5725 #if CMK_SMP && USE_CKLOOP
5726  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
5727  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks ) {
5728  useCkLoop = 1;
5729  }
5730 #endif
5731 
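 // Editor's note: compute_energy returns the reciprocal-space energy for
 // the grid and accumulates the virial terms starting at evir[1]; the
 // useCkLoop flag lets PmeKSpace parallelize its own loops internally.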
5732  int numGrids = 1;
5733  for ( int g=0; g<numGrids; ++g ) {
5734  evir[0] = myKSpace->compute_energy(data+0*g,
5735  lattice, ewaldcof, &(evir[1]), useCkLoop);
5736  }
5737 
5738 #if USE_NODE_PAR_RECEIVE
5739  CmiMemoryWriteFence();
5740 #endif
5741 }
5742 
5743 void PmeXPencil::backward_fft() {
5744 #ifdef NAMD_FFTW
5745 #ifdef MANUAL_DEBUG_FFTW3
5746  dumpMatrixFloat3("bw_x_b", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5747 #endif
5748 
5749 #ifdef NAMD_FFTW_3
5750 #if CMK_SMP && USE_CKLOOP
5751  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5752  if(useCkLoop>=CKLOOP_CTRL_PME_BACKWARDFFT
5753  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5754  //for(int i=0; i<numPlans; i++) fftwf_execute(backward_plans[i]);
5756  // CkLoop-parallelized version of the commented-out loop above
5756  CkLoop_Parallelize(PmeXZPencilFFT, 1, (void *)backward_plans, CkMyNodeSize(), 0, numPlans-1); //sync
5757  return;
5758  }
5759 #endif
5760  fftwf_execute(backward_plan);
5761 #else
5762  fftw(backward_plan, ny*nz,
5763  ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
5764 #endif
5765 #ifdef MANUAL_DEBUG_FFTW3
5766  dumpMatrixFloat3("bw_x_a", data, initdata.grid.K1, ny, nz, thisIndex.x, thisIndex.y, thisIndex.z);
5767 #endif
5768 #endif
5769 }
5770 
5771 static inline void PmeXPencilSendUntrans(int first, int last, void *result, int paraNum, void *param){
5772  PmeXPencil *xpencil = (PmeXPencil *)param;
5773  xpencil->send_subset_untrans(first, last);
5774 }
5775 
5776 void PmeXPencil::send_subset_untrans(int fromIdx, int toIdx){
5777  int xBlocks = initdata.xBlocks;
5778  int block1 = initdata.grid.block1;
5779  int K1 = initdata.grid.K1;
5780 
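 // Editor's note: pencils that contributed no charge data (needs_reply == 0)
 // receive only a lightweight PmeAckMsg so they can advance their message
 // count; all others get their untransposed slab back in a PmeUntransMsg.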
5781  for(int isend=fromIdx; isend<=toIdx; isend++) {
5782  int ib = send_order[isend];
5783  if ( ! needs_reply[ib] ) {
5784  PmeAckMsg *msg = new (PRIORITY_SIZE) PmeAckMsg;
5785  CmiEnableUrgentSend(1);
5786  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5787 #if USE_NODE_PAR_RECEIVE
5788  initdata.yPencil(ib,0,thisIndex.z).recvNodeAck(msg);
5789 #else
5790  initdata.yPencil(ib,0,thisIndex.z).recvAck(msg);
5791 #endif
5792  CmiEnableUrgentSend(0);
5793  continue;
5794  }
5795  int nx = block1;
5796  if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
5797  PmeUntransMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) PmeUntransMsg;
5798  msg->sourceNode = thisIndex.y;
5799  msg->ny = ny;
5800  float *md = msg->qgrid;
5801  for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
5802  float *d = data + i*ny*nz*2;
5803  for ( int j=0; j<ny; ++j, d += nz*2 ) {
5804  for ( int k=0; k<nz; ++k ) {
5805  *(md++) = d[2*k];
5806  *(md++) = d[2*k+1];
5807  }
5808  }
5809  }
5810  SET_PRIORITY(msg,sequence,PME_UNTRANS_PRIORITY)
5811  CmiEnableUrgentSend(1);
5812 #if USE_NODE_PAR_RECEIVE
5813  msg->destElem=CkArrayIndex3D(ib,0, thisIndex.z);
5814 #if Y_PERSIST
5815  CmiUsePersistentHandle(&untrans_handle[isend], 1);
5816 #endif
5817  initdata.pmeNodeProxy[CmiNodeOf(initdata.ym.ckLocalBranch()->procNum(0,msg->destElem))].recvYUntrans(msg);
5818 #if Y_PERSIST
5819  CmiUsePersistentHandle(NULL, 0);
5820 #endif
5821 #else
5822 #if Y_PERSIST
5823  // CmiUsePersistentHandle(&untrans_handle[isend], 1);
5824 #endif
5825  initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
5826 #if Y_PERSIST
5827  // CmiUsePersistentHandle(NULL, 0);
5828 #endif
5829 #endif
5830  CmiEnableUrgentSend(0);
5831  }
5832 }
5833 
5834 void PmeXPencil::send_untrans() {
5835 
5836  { // send energy and virial
5837  int numGrids = 1;
5838  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
5839  newmsg->evir[0] = evir;
5840  SET_PRIORITY(newmsg,sequence,PME_UNGRID_PRIORITY)
5841  CmiEnableUrgentSend(1);
5842  initdata.pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
5843  CmiEnableUrgentSend(0);
5844  }
5845 
5846 #if USE_PERSISTENT
5847  if (untrans_handle == NULL) setup_persistent();
5848 #endif
5849 #if CMK_SMP && USE_CKLOOP
5850  int useCkLoop = Node::Object()->simParameters->useCkLoop;
5851  if(useCkLoop>=CKLOOP_CTRL_PME_SENDUNTRANS
5852  && CkNumPes() >= 2 * initdata.yBlocks * initdata.zBlocks) {
5853  int xBlocks = initdata.xBlocks;
5854 
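 // Editor's note: under USE_NODE_PAR_RECEIVE the active call below passes
 // xBlocks as the chunk count and must synchronize (final argument 1); the
 // commented variants show node-size chunking and, without node-parallel
 // receive, an asynchronous (non-sync) form.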
5855 #if USE_NODE_PAR_RECEIVE
5856  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 1); //has to sync
5857  CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, xBlocks, 0, xBlocks-1, 1); //has to sync
5858 #else
5859  //CkLoop_Parallelize(PmeXPencilSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, xBlocks-1, 0); //not sync
5860  CkLoop_Parallelize(