NAMD
Classes | Public Member Functions | Public Attributes | Static Public Attributes | Friends | List of all members
ComputePmeMgr Class Reference
Inheritance diagram for ComputePmeMgr:
ComputePmeUtil

Classes

struct  cuda_submit_charges_args
 

Public Member Functions

 ComputePmeMgr ()
 
 ~ComputePmeMgr ()
 
void initialize (CkQdMsg *)
 
void initialize_pencils (CkQdMsg *)
 
void activate_pencils (CkQdMsg *)
 
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
 
void initialize_computes ()
 
void sendData (Lattice &, int sequence)
 
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
 
void sendPencils (Lattice &, int sequence)
 
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
 
void recvGrid (PmeGridMsg *)
 
void gridCalc1 (void)
 
void sendTransBarrier (void)
 
void sendTransSubset (int first, int last)
 
void sendTrans (void)
 
void fwdSharedTrans (PmeTransMsg *)
 
void recvSharedTrans (PmeSharedTransMsg *)
 
void sendDataHelper (int)
 
void sendPencilsHelper (int)
 
void recvTrans (PmeTransMsg *)
 
void procTrans (PmeTransMsg *)
 
void gridCalc2 (void)
 
void gridCalc2R (void)
 
void fwdSharedUntrans (PmeUntransMsg *)
 
void recvSharedUntrans (PmeSharedUntransMsg *)
 
void sendUntrans (void)
 
void sendUntransSubset (int first, int last)
 
void recvUntrans (PmeUntransMsg *)
 
void procUntrans (PmeUntransMsg *)
 
void gridCalc3 (void)
 
void sendUngrid (void)
 
void sendUngridSubset (int first, int last)
 
void recvUngrid (PmeGridMsg *)
 
void recvAck (PmeAckMsg *)
 
void copyResults (PmeGridMsg *)
 
void copyPencils (PmeGridMsg *)
 
void ungridCalc (void)
 
void recvRecipEvir (PmeEvirMsg *)
 
void addRecipEvirClient (void)
 
void submitReductions ()
 
void chargeGridSubmitted (Lattice &lattice, int sequence)
 
void cuda_submit_charges (Lattice &lattice, int sequence)
 
void sendChargeGridReady ()
 
void pollChargeGridReady ()
 
void pollForcesReady ()
 
void recvChargeGridReady ()
 
void chargeGridReady (Lattice &lattice, int sequence)
 
- Public Member Functions inherited from ComputePmeUtil
 ComputePmeUtil ()
 
 ~ComputePmeUtil ()
 

Public Attributes

Lattice * sendDataHelper_lattice
 
int sendDataHelper_sequence
 
int sendDataHelper_sourcepe
 
int sendDataHelper_errors
 
CmiNodeLock pmemgr_lock
 
float * a_data_host
 
float * a_data_dev
 
float * f_data_host
 
float * f_data_dev
 
int cuda_atoms_count
 
int cuda_atoms_alloc
 
cudaEvent_t end_charges
 
cudaEvent_t * end_forces
 
int forces_count
 
int forces_done_count
 
double charges_time
 
double forces_time
 
int check_charges_count
 
int check_forces_count
 
int master_pe
 
int this_pe
 
int chargeGridSubmittedCount
 
Lattice * saved_lattice
 
int saved_sequence
 
ResizeArray< ComputePme * > pmeComputes
 

Static Public Attributes

static CmiNodeLock fftw_plan_lock
 
static CmiNodeLock cuda_lock
 
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
 
static bool cuda_busy
 
- Static Public Attributes inherited from ComputePmeUtil
static int numGrids
 
static Bool alchOn
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchDecouple
 
static BigReal alchElecLambdaStart
 
static Bool lesOn
 
static int lesFactor
 
static Bool pairOn
 
static Bool selfOn
 

Friends

class ComputePme
 
class NodePmeMgr
 

Additional Inherited Members

- Static Public Member Functions inherited from ComputePmeUtil
static void select (void)
 

Detailed Description

Definition at line 381 of file ComputePme.C.

Constructor & Destructor Documentation

◆ ComputePmeMgr()

ComputePmeMgr::ComputePmeMgr ( )

Definition at line 736 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, and this_pe.

736  : pmeProxy(thisgroup),
737  pmeProxyDir(thisgroup) {
738 
739  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
740  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
741  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
742 
743  pmeNodeProxy.ckLocalBranch()->initialize();
744 
745  if ( CmiMyRank() == 0 ) {
746  fftw_plan_lock = CmiCreateLock();
747  }
748  pmemgr_lock = CmiCreateLock();
749 
750  myKSpace = 0;
751  kgrid = 0;
752  work = 0;
753  grid_count = 0;
754  trans_count = 0;
755  untrans_count = 0;
756  ungrid_count = 0;
757  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
758  useBarrier = 0;
759  sendTransBarrier_received = 0;
760  usePencils = 0;
761 
762 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
763  // offload has not been set so this happens on every run
764  if ( CmiMyRank() == 0 ) {
765  cuda_lock = CmiCreateLock();
766  }
767 
768 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
769  int leastPriority, greatestPriority;
770  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
771  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
772  //if ( CkMyNode() == 0 ) {
773  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
774  //}
775 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
776 #else
777 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
778 #endif
779 
780  stream = 0;
781  for ( int i=0; i<NUM_STREAMS; ++i ) {
782 #if 1
783  CUDA_STREAM_CREATE(&streams[i]);
784  cuda_errcheck("cudaStreamCreate");
785 #else
786  streams[i] = 0; // XXXX Testing!!!
787 #endif
788  }
789 
790  this_pe = CkMyPe();
791 
792  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
793  end_forces = 0;
795  check_forces_count = 0;
797 
798  cuda_atoms_count = 0;
799  cuda_atoms_alloc = 0;
800 
801  f_data_mgr_alloc = 0;
802  f_data_mgr_host = 0;
803  f_data_mgr_dev = 0;
804  afn_host = 0;
805  afn_dev = 0;
806 
807 #define CUDA_EVENT_ID_PME_CHARGES 80
808 #define CUDA_EVENT_ID_PME_FORCES 81
809 #define CUDA_EVENT_ID_PME_TICK 82
810 #define CUDA_EVENT_ID_PME_COPY 83
811 #define CUDA_EVENT_ID_PME_KERNEL 84
812  if ( 0 == CkMyPe() ) {
813  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
814  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
815  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
816  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
817  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
818  }
819 #endif
820  recipEvirCount = 0;
821  recipEvirClients = 0;
822  recipEvirPe = -999;
823 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:440
cudaEvent_t end_charges
Definition: ComputePme.C:452
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
#define CUDA_STREAM_CREATE(X)
#define CUDA_EVENT_ID_PME_COPY
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:441
int check_charges_count
Definition: ComputePme.C:458
int cuda_atoms_alloc
Definition: ComputePme.C:449
#define CUDA_EVENT_ID_PME_FORCES
#define CUDA_EVENT_ID_PME_TICK
int chargeGridSubmittedCount
Definition: ComputePme.C:470
int cuda_atoms_count
Definition: ComputePme.C:448
cudaEvent_t * end_forces
Definition: ComputePme.C:453
#define CUDA_EVENT_ID_PME_KERNEL
#define CUDA_EVENT_ID_PME_CHARGES
#define NUM_STREAMS
Definition: ComputePme.C:540
int check_forces_count
Definition: ComputePme.C:459
static CmiNodeLock cuda_lock
Definition: ComputePme.C:450

◆ ~ComputePmeMgr()

ComputePmeMgr::~ComputePmeMgr ( )

Definition at line 1820 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

1820  {
1821 
1822  if ( CmiMyRank() == 0 ) {
1823  CmiDestroyLock(fftw_plan_lock);
1824  }
1825  CmiDestroyLock(pmemgr_lock);
1826 
1827  delete myKSpace;
1828  delete [] localInfo;
1829  delete [] gridNodeInfo;
1830  delete [] transNodeInfo;
1831  delete [] gridPeMap;
1832  delete [] transPeMap;
1833  delete [] recipPeDest;
1834  delete [] gridPeOrder;
1835  delete [] gridNodeOrder;
1836  delete [] transNodeOrder;
1837  delete [] qgrid;
1838  if ( kgrid != qgrid ) delete [] kgrid;
1839  delete [] work;
1840  delete [] gridmsg_reuse;
1841 
1842  if ( ! offload ) {
1843  for (int i=0; i<q_count; ++i) {
1844  delete [] q_list[i];
1845  }
1846  delete [] q_list;
1847  delete [] fz_arr;
1848  }
1849  delete [] f_arr;
1850  delete [] q_arr;
1851 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:440
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:441

Member Function Documentation

◆ activate_pencils()

void ComputePmeMgr::activate_pencils ( CkQdMsg *  msg)

Definition at line 1814 of file ComputePme.C.

1814  {
1815  if ( ! usePencils ) return;
1816  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1817 }

◆ addRecipEvirClient()

void ComputePmeMgr::addRecipEvirClient ( void  )

Definition at line 3055 of file ComputePme.C.

3055  {
3056  ++recipEvirClients;
3057 }

◆ chargeGridReady()

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3579 of file ComputePme.C.

References PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by ComputePme::doWork(), and recvChargeGridReady().

3579  {
3580 
3581 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3582  if ( offload ) {
3583  int errcount = 0;
3584  int q_stride = myGrid.K3+myGrid.order-1;
3585  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3586  f_arr[j] = ffz_host[j];
3587  if ( ffz_host[j] & ~1 ) ++errcount;
3588  }
3589  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3590  }
3591 #endif
3592  recipEvirCount = recipEvirClients;
3593  ungridForcesCount = pmeComputes.size();
3594 
3595  for (int j=0; j<myGrid.order-1; ++j) {
3596  fz_arr[j] |= fz_arr[myGrid.K3+j];
3597  }
3598 
3599  if ( usePencils ) {
3600  sendPencils(lattice,sequence);
3601  } else {
3602  sendData(lattice,sequence);
3603  }
3604 }
int size(void) const
Definition: ResizeArray.h:131
void sendPencils(Lattice &, int sequence)
Definition: ComputePme.C:3762
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void sendData(Lattice &, int sequence)
Definition: ComputePme.C:3989
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:480

◆ chargeGridSubmitted()

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3520 of file ComputePme.C.

References chargeGridSubmittedCount, CUDA_EVENT_ID_PME_COPY, end_charges, master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

3520  {
3521  saved_lattice = &lattice;
3522  saved_sequence = sequence;
3523 
3524  // cudaDeviceSynchronize(); // XXXX TESTING
3525  //int q_stride = myGrid.K3+myGrid.order-1;
3526  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3527  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3528  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3529  // }
3530  //}
3531  //CmiLock(cuda_lock);
3532 
3533  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3534  double before = CmiWallTimer();
3535  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3536  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3537  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3538  cudaMemcpyDeviceToHost, streams[stream]);
3539  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3540  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3541  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3542  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3543  //CmiUnlock(cuda_lock);
3544  // cudaDeviceSynchronize(); // XXXX TESTING
3545  // cuda_errcheck("after memcpy grid to host");
3546 
3548  pmeProxy[master_pe].pollChargeGridReady();
3549  }
3550 }
static Node * Object()
Definition: Node.h:86
Lattice * saved_lattice
Definition: ComputePme.C:473
cudaEvent_t end_charges
Definition: ComputePme.C:452
SimParameters * simParameters
Definition: Node.h:181
#define CUDA_EVENT_ID_PME_COPY
int chargeGridSubmittedCount
Definition: ComputePme.C:470
#define simParams
Definition: Output.C:129
int saved_sequence
Definition: ComputePme.C:474

◆ copyPencils()

void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg)

Definition at line 3825 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3825  {
3826 
3827  int K1 = myGrid.K1;
3828  int K2 = myGrid.K2;
3829  int dim2 = myGrid.dim2;
3830  int dim3 = myGrid.dim3;
3831  int block1 = myGrid.block1;
3832  int block2 = myGrid.block2;
3833 
3834  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3835  int ib = msg->sourceNode / yBlocks;
3836  int jb = msg->sourceNode % yBlocks;
3837 
3838  int ibegin = ib*block1;
3839  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3840  int jbegin = jb*block2;
3841  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3842 
3843  int zlistlen = msg->zlistlen;
3844  int *zlist = msg->zlist;
3845  float *qmsg = msg->qgrid;
3846  int g;
3847  for ( g=0; g<numGrids; ++g ) {
3848  char *f = f_arr + g*fsize;
3849  float **q = q_arr + g*fsize;
3850  for ( int i=ibegin; i<iend; ++i ) {
3851  for ( int j=jbegin; j<jend; ++j ) {
3852  if( f[i*dim2+j] ) {
3853  f[i*dim2+j] = 0;
3854  for ( int k=0; k<zlistlen; ++k ) {
3855  q[i*dim2+j][zlist[k]] = *(qmsg++);
3856  }
3857  for (int h=0; h<myGrid.order-1; ++h) {
3858  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3859  }
3860  }
3861  }
3862  }
3863  }
3864 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:141
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:147

◆ copyResults()

void ComputePmeMgr::copyResults ( PmeGridMsg *  msg)

Definition at line 4017 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

4017  {
4018 
4019  int zdim = myGrid.dim3;
4020  int flen = msg->len;
4021  int fstart = msg->start;
4022  int zlistlen = msg->zlistlen;
4023  int *zlist = msg->zlist;
4024  float *qmsg = msg->qgrid;
4025  int g;
4026  for ( g=0; g<numGrids; ++g ) {
4027  char *f = msg->fgrid + g*flen;
4028  float **q = q_arr + fstart + g*fsize;
4029  for ( int i=0; i<flen; ++i ) {
4030  if ( f[i] ) {
4031  f[i] = 0;
4032  for ( int k=0; k<zlistlen; ++k ) {
4033  q[i][zlist[k]] = *(qmsg++);
4034  }
4035  for (int h=0; h<myGrid.order-1; ++h) {
4036  q[i][myGrid.K3+h] = q[i][h];
4037  }
4038  }
4039  }
4040  }
4041 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int order
Definition: PmeBase.h:23
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int K3
Definition: PmeBase.h:21
int zlistlen
Definition: ComputePme.C:147
char * fgrid
Definition: ComputePme.C:149

◆ cuda_submit_charges()

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3465 of file ComputePme.C.

References a_data_dev, a_data_host, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

Referenced by ComputePme::doWork().

3465  {
3466 
3467  int n = cuda_atoms_count;
3468  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3469  cuda_atoms_count = 0;
3470 
3471  const double before = CmiWallTimer();
3472  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3473  cudaMemcpyHostToDevice, streams[stream]);
3474  const double after = CmiWallTimer();
3475 
3476  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3477 
3478  cuda_pme_charges(
3479  bspline_coeffs_dev,
3480  q_arr_dev, ffz_dev, ffz_dev + fsize,
3481  a_data_dev, n,
3482  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3483  streams[stream]);
3484  const double after2 = CmiWallTimer();
3485 
3486  chargeGridSubmitted(lattice,sequence); // must be inside lock
3487 
3488  masterPmeMgr->charges_time = before;
3489  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3490  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3491 }
float * a_data_dev
Definition: ComputePme.C:445
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
int order
Definition: PmeBase.h:23
int K3
Definition: PmeBase.h:21
int cuda_atoms_count
Definition: ComputePme.C:448
#define CUDA_EVENT_ID_PME_KERNEL
double charges_time
Definition: ComputePme.C:456
void chargeGridSubmitted(Lattice &lattice, int sequence)
Definition: ComputePme.C:3520
float * a_data_host
Definition: ComputePme.C:444

◆ fwdSharedTrans()

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg)

Definition at line 2040 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by sendTransSubset().

2040  {
2041  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2042  int pe = transNodeInfo[myTransNode].pe_start;
2043  int npe = transNodeInfo[myTransNode].npe;
2044  CmiNodeLock lock = CmiCreateLock();
2045  int *count = new int; *count = npe;
2046  for (int i=0; i<npe; ++i, ++pe) {
2049  shmsg->msg = msg;
2050  shmsg->count = count;
2051  shmsg->lock = lock;
2052  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2053  }
2054 }
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
PmeTransMsg * msg
Definition: ComputePme.C:169
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
CmiNodeLock lock
Definition: ComputePme.C:171

◆ fwdSharedUntrans()

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg)

Definition at line 2296 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by sendUntransSubset().

2296  {
2297  int pe = gridNodeInfo[myGridNode].pe_start;
2298  int npe = gridNodeInfo[myGridNode].npe;
2299  CmiNodeLock lock = CmiCreateLock();
2300  int *count = new int; *count = npe;
2301  for (int i=0; i<npe; ++i, ++pe) {
2303  shmsg->msg = msg;
2304  shmsg->count = count;
2305  shmsg->lock = lock;
2306  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2307  }
2308 }
CmiNodeLock lock
Definition: ComputePme.C:188
PmeUntransMsg * msg
Definition: ComputePme.C:186

◆ gridCalc1()

void ComputePmeMgr::gridCalc1 ( void  )

Definition at line 1932 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

1932  {
1933  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1934 
1935 #ifdef NAMD_FFTW
1936  for ( int g=0; g<numGrids; ++g ) {
1937 #ifdef NAMD_FFTW_3
1938  fftwf_execute(forward_plan_yz[g]);
1939 #else
1940  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1941  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1942 #endif
1943 
1944  }
1945 #endif
1946 
1947  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1948 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ gridCalc2()

void ComputePmeMgr::gridCalc2 ( void  )

Definition at line 2108 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

2108  {
2109  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2110 
2111 #if CMK_BLUEGENEL
2112  CmiNetworkProgressAfter (0);
2113 #endif
2114 
2115  int zdim = myGrid.dim3;
2116  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2117  int ny = localInfo[myTransPe].ny_after_transpose;
2118 
2119  for ( int g=0; g<numGrids; ++g ) {
2120  // finish forward FFT (x dimension)
2121 #ifdef NAMD_FFTW
2122 #ifdef NAMD_FFTW_3
2123  fftwf_execute(forward_plan_x[g]);
2124 #else
2125  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2126  ny * zdim / 2, 1, work, 1, 0);
2127 #endif
2128 #endif
2129  }
2130 
2131 #ifdef OPENATOM_VERSION
2132  if ( ! simParams -> openatomOn ) {
2133 #endif // OPENATOM_VERSION
2134  gridCalc2R();
2135 #ifdef OPENATOM_VERSION
2136  } else {
2137  gridCalc2Moa();
2138  }
2139 #endif // OPENATOM_VERSION
2140 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
int ny_after_transpose
Definition: ComputePme.C:259
#define simParams
Definition: Output.C:129
void gridCalc2R(void)
Definition: ComputePme.C:2168

◆ gridCalc2R()

void ComputePmeMgr::gridCalc2R ( void  )

Definition at line 2168 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

2168  {
2169 
2170  int useCkLoop = 0;
2171 #if CMK_SMP && USE_CKLOOP
2172  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2173  && CkNumPes() >= 2 * numTransPes ) {
2174  useCkLoop = 1;
2175  }
2176 #endif
2177 
2178  int zdim = myGrid.dim3;
2179  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2180  int ny = localInfo[myTransPe].ny_after_transpose;
2181 
2182  for ( int g=0; g<numGrids; ++g ) {
2183  // reciprocal space portion of PME
2185  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2186  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2187  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2188 
2189  // start backward FFT (x dimension)
2190 
2191 #ifdef NAMD_FFTW
2192 #ifdef NAMD_FFTW_3
2193  fftwf_execute(backward_plan_x[g]);
2194 #else
2195  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2196  ny * zdim / 2, 1, work, 1, 0);
2197 #endif
2198 #endif
2199  }
2200 
2201  pmeProxyDir[CkMyPe()].sendUntrans();
2202 }
static Node * Object()
Definition: Node.h:86
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
double compute_energy(float q_arr[], const Lattice &lattice, double ewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:321
#define CKLOOP_CTRL_PME_KSPACE
Definition: SimParameters.h:98
int ny_after_transpose
Definition: ComputePme.C:259
double BigReal
Definition: common.h:123

◆ gridCalc3()

void ComputePmeMgr::gridCalc3 ( void  )

Definition at line 2370 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

2370  {
2371  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2372 
2373  // finish backward FFT
2374 #ifdef NAMD_FFTW
2375  for ( int g=0; g<numGrids; ++g ) {
2376 #ifdef NAMD_FFTW_3
2377  fftwf_execute(backward_plan_yz[g]);
2378 #else
2379  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2380  (fftw_complex *) (qgrid + qgrid_size * g),
2381  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2382 #endif
2383  }
2384 
2385 #endif
2386 
2387  pmeProxyDir[CkMyPe()].sendUngrid();
2388 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32

◆ initialize()

// Startup-stage entry point for the PME manager (invoked via a quiescence
// detection message, which is consumed here).  Allocates per-PE/per-node
// bookkeeping arrays, chooses between pencil and slab decomposition of the
// FFT work, assigns PEs to grid/trans roles (or creates the pencil chare
// arrays), and — for the slab path — builds the FFTW plans and allocates
// the charge-grid buffers.  The pencil path returns early; setup then
// continues in initialize_pencils() at the next startup stage.
void ComputePmeMgr::initialize ( CkQdMsg *  msg) {
  delete msg;

  // per-PE and per-node bookkeeping arrays
  localInfo = new LocalPmeInfo[CkNumPes()];
  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
  transNodeInfo = new NodePmeInfo[CkNumNodes()];
  gridPeMap = new int[CkNumPes()];
  transPeMap = new int[CkNumPes()];
  recipPeDest = new int[CkNumPes()];
  gridPeOrder = new int[CkNumPes()];
  gridNodeOrder = new int[CkNumNodes()];
  transNodeOrder = new int[CkNumNodes()];

  // node-shared flag array marking which PEs do PME work (rank 0 owns it)
  if (CkMyRank() == 0) {
    pencilPMEProcessors = new char [CkNumPes()];
    memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
  }

  // NOTE(review): the rendered listing omits original line 906 here
  // (likely the local SimParameters lookup) — verify against ComputePme.C.
  PatchMap *patchMap = PatchMap::Object();

  offload = simParams->PMEOffload;
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
  // PME offload sanity checks: one device per process, correct device
  // bound to this PE, and sufficient compute capability
  if ( offload && ! deviceCUDA->one_device_per_node() ) {
    NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
  }
  if ( offload ) {
    int dev;
    cudaGetDevice(&dev);
    cuda_errcheck("in cudaGetDevice");
    if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, dev);
    cuda_errcheck("in cudaGetDeviceProperties");
    if ( deviceProp.major < 2 )
      NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
  }
#endif

  alchLambda = -1.;  // illegal value to catch if not updated
  alchLambda2 = -1.;
  useBarrier = simParams->PMEBarrier;

  // Decide pencil vs slab decomposition: explicit user setting wins;
  // otherwise use pencils when there is much more pencil-parallelism
  // available than slab-parallelism.
  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
  else {
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    int dimx = simParams->PMEGridSizeX;
    int dimy = simParams->PMEGridSizeY;
    int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
    if ( maxslabs > nrps ) maxslabs = nrps;
    int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
      * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
    if ( maxpencils > nrps ) maxpencils = nrps;
    if ( maxpencils > 3 * maxslabs ) usePencils = 1;
    else usePencils = 0;
  }

  if ( usePencils ) {
    // choose pencil grid dimensions (xBlocks x yBlocks x zBlocks)
    int nrps = simParams->PMEProcessors;
    if ( nrps <= 0 ) nrps = CkNumPes();
    if ( nrps > CkNumPes() ) nrps = CkNumPes();
    if ( simParams->PMEPencils > 1 &&
         simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
      xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
    } else {
      int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
        * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
      if ( nb2 > nrps ) nb2 = nrps;
      if ( nb2 < 1 ) nb2 = 1;
      int nb = (int) sqrt((float)nb2);
      if ( nb < 1 ) nb = 1;
      xBlocks = zBlocks = nb;
      yBlocks = nb2 / nb;
    }

    // explicit per-axis overrides
    if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
    if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
    if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;

    // shrink block counts so no pencil is empty
    int dimx = simParams->PMEGridSizeX;
    int bx = 1 + ( dimx - 1 ) / xBlocks;
    xBlocks = 1 + ( dimx - 1 ) / bx;

    int dimy = simParams->PMEGridSizeY;
    int by = 1 + ( dimy - 1 ) / yBlocks;
    yBlocks = 1 + ( dimy - 1 ) / by;

    int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
    int bz = 1 + ( dimz - 1 ) / zBlocks;
    zBlocks = 1 + ( dimz - 1 ) / bz;

    // each pencil plane must fit within the available PEs
    if ( xBlocks * yBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * yBlocks > numPes");
    }
    if ( xBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils xBlocks * zBlocks > numPes");
    }
    if ( yBlocks * zBlocks > CkNumPes() ) {
      NAMD_die("PME pencils yBlocks * zBlocks > numPes");
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << xBlocks << " x " <<
        yBlocks << " x " << zBlocks <<
        " pencil grid for FFT and reciprocal sum.\n" << endi;
    }
  } else { // usePencils

    { // decide how many pes to use for reciprocal sum

      // rules based on work available
      int minslices = simParams->PMEMinSlices;
      int dimx = simParams->PMEGridSizeX;
      int nrpx = ( dimx + minslices - 1 ) / minslices;
      int dimy = simParams->PMEGridSizeY;
      int nrpy = ( dimy + minslices - 1 ) / minslices;

      // rules based on processors available
      int nrpp = CkNumPes();
      // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
      if ( nrpp < nrpx ) nrpx = nrpp;
      if ( nrpp < nrpy ) nrpy = nrpp;

      // user override
      int nrps = simParams->PMEProcessors;
      if ( nrps > CkNumPes() ) nrps = CkNumPes();
      if ( nrps > 0 ) nrpx = nrps;
      if ( nrps > 0 ) nrpy = nrps;

      // make sure there aren't any totally empty processors
      int bx = ( dimx + nrpx - 1 ) / nrpx;
      nrpx = ( dimx + bx - 1 ) / bx;
      int by = ( dimy + nrpy - 1 ) / nrpy;
      nrpy = ( dimy + by - 1 ) / by;
      if ( bx != ( dimx + nrpx - 1 ) / nrpx )
        NAMD_bug("Error in selecting number of PME processors.");
      if ( by != ( dimy + nrpy - 1 ) / nrpy )
        NAMD_bug("Error in selecting number of PME processors.");

      numGridPes = nrpx;
      numTransPes = nrpy;
    }
    if ( ! CkMyPe() ) {
      iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
        " processors for FFT and reciprocal sum.\n" << endi;
    }

    int sum_npes = numTransPes + numGridPes;
    int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;

#if 0 // USE_TOPOMAP
    /* This code is being disabled permanently for slab PME on Blue Gene machines */
    PatchMap * pmap = PatchMap::Object();

    int patch_pes = pmap->numNodesWithPatches();
    TopoManager tmgr;
    if(tmgr.hasMultipleProcsPerNode())
      patch_pes *= 2;

    bool done = false;
    if(CkNumPes() > 2*sum_npes + patch_pes) {
      done = generateBGLORBPmePeList(transPeMap, numTransPes);
      done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
    }
    else
      if(CkNumPes() > 2 *max_npes + patch_pes) {
        done = generateBGLORBPmePeList(transPeMap, max_npes);
        gridPeMap = transPeMap;
      }

    if (!done)
#endif
    {
      //generatePmePeList(transPeMap, max_npes);
      //gridPeMap = transPeMap;
      generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
    }

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME GRID LOCATIONS:";
      int i;
      for ( i=0; i<numGridPes && i<10; ++i ) {
        iout << " " << gridPeMap[i];
      }
      if ( i < numGridPes ) iout << " ...";
      iout << "\n" << endi;
      iout << iINFO << "PME TRANS LOCATIONS:";
      for ( i=0; i<numTransPes && i<10; ++i ) {
        iout << " " << transPeMap[i];
      }
      if ( i < numTransPes ) iout << " ...";
      iout << "\n" << endi;
    }

    // sort based on nodes and physical nodes
    std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());

    // walk the sorted PE maps, grouping consecutive PEs on the same node
    // into NodePmeInfo records and locating this PE's own role(s)
    myGridPe = -1;
    myGridNode = -1;
    int i = 0;
    int node = -1;
    int real_node = -1;
    for ( i=0; i<numGridPes; ++i ) {
      if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
      int real_node_i = CkNodeOf(gridPeMap[i]);
      if ( real_node_i == real_node ) {
        gridNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        gridNodeInfo[node].real_node = real_node;
        gridNodeInfo[node].pe_start = i;
        gridNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myGridNode = node;
    }
    numGridNodes = node + 1;
    myTransPe = -1;
    myTransNode = -1;
    node = -1;
    real_node = -1;
    for ( i=0; i<numTransPes; ++i ) {
      if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
      if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
      int real_node_i = CkNodeOf(transPeMap[i]);
      if ( real_node_i == real_node ) {
        transNodeInfo[node].npe += 1;
      } else {
        real_node = real_node_i;
        ++node;
        transNodeInfo[node].real_node = real_node;
        transNodeInfo[node].pe_start = i;
        transNodeInfo[node].npe = 1;
      }
      if ( CkMyNode() == real_node_i ) myTransNode = node;
    }
    numTransNodes = node + 1;

    if ( ! CkMyPe() ) {
      iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
        << numTransNodes << " TRANS NODES\n" << endi;
    }

    { // generate random orderings for grid and trans messages
      // (self is kept last so local work does not delay remote sends)
      int i;
      for ( i = 0; i < numGridPes; ++i ) {
        gridPeOrder[i] = i;
      }
      Random rand(CkMyPe());
      if ( myGridPe < 0 ) {
        rand.reorder(gridPeOrder,numGridPes);
      } else { // self last
        gridPeOrder[myGridPe] = numGridPes-1;
        gridPeOrder[numGridPes-1] = myGridPe;
        rand.reorder(gridPeOrder,numGridPes-1);
      }
      for ( i = 0; i < numGridNodes; ++i ) {
        gridNodeOrder[i] = i;
      }
      if ( myGridNode < 0 ) {
        rand.reorder(gridNodeOrder,numGridNodes);
      } else { // self last
        gridNodeOrder[myGridNode] = numGridNodes-1;
        gridNodeOrder[numGridNodes-1] = myGridNode;
        rand.reorder(gridNodeOrder,numGridNodes-1);
      }
      for ( i = 0; i < numTransNodes; ++i ) {
        transNodeOrder[i] = i;
      }
      if ( myTransNode < 0 ) {
        rand.reorder(transNodeOrder,numTransNodes);
      } else { // self last
        transNodeOrder[myTransNode] = numTransNodes-1;
        transNodeOrder[numTransNodes-1] = myTransNode;
        rand.reorder(transNodeOrder,numTransNodes-1);
      }
    }

  } // ! usePencils

  // common grid geometry; dim3 is padded for in-place real-to-complex FFT
  myGrid.K1 = simParams->PMEGridSizeX;
  myGrid.K2 = simParams->PMEGridSizeY;
  myGrid.K3 = simParams->PMEGridSizeZ;
  myGrid.order = simParams->PMEInterpOrder;
  myGrid.dim2 = myGrid.K2;
  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);

  if ( ! usePencils ) {
    myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
    myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
    myGrid.block3 = myGrid.dim3 / 2;  // complex
  }

  if ( usePencils ) {
    myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
    myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
    myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex


    int pe = 0;
    int x,y,z;

    SortableResizeArray<int> zprocs(xBlocks*yBlocks);
    SortableResizeArray<int> yprocs(xBlocks*zBlocks);
    SortableResizeArray<int> xprocs(yBlocks*zBlocks);

    // decide which pes to use by bit reversal and patch use
    int i;
    int ncpus = CkNumPes();
    SortableResizeArray<int> patches, nopatches, pmeprocs;
    PatchMap *pmap = PatchMap::Object();
    for ( int icpu=0; icpu<ncpus; ++icpu ) {
      int ri = WorkDistrib::peDiffuseOrdering[icpu];
      if ( ri ) { // keep 0 for special case
        // pretend pe 1 has patches to avoid placing extra PME load on node
        if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
        else nopatches.add(ri);
      }
    }

#if USE_RANDOM_TOPO
    // randomize both candidate lists before assignment
    Random rand(CkMyPe());
    int *tmp = new int[patches.size()];
    int nn = patches.size();
    for (i=0;i<nn;i++) tmp[i] = patches[i];
    rand.reorder(tmp, nn);
    patches.resize(0);
    for (i=0;i<nn;i++) patches.add(tmp[i]);
    delete [] tmp;
    tmp = new int[nopatches.size()];
    nn = nopatches.size();
    for (i=0;i<nn;i++) tmp[i] = nopatches[i];
    rand.reorder(tmp, nn);
    nopatches.resize(0);
    for (i=0;i<nn;i++) nopatches.add(tmp[i]);
    delete [] tmp;
#endif

    // only use zero if it eliminates overloading or has patches
    int useZero = 0;
    int npens = xBlocks*yBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += xBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;
    npens += yBlocks*zBlocks;
    if ( npens % ncpus == 0 ) useZero = 1;
    if ( npens == nopatches.size() + 1 ) useZero = 1;

    // add nopatches then patches in reversed order
    for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
    if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
    for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
    if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);

    // deal PEs round-robin to the z, y, and x pencil arrays
    int npes = pmeprocs.size();
    for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
#if !USE_RANDOM_TOPO
    zprocs.sort();
#endif
    for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
#if !USE_RANDOM_TOPO
    yprocs.sort();
#endif
    for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
    if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
#if !USE_RANDOM_TOPO
    xprocs.sort();
#endif

#if USE_TOPO_SFC
    // reorder pencil PEs along a space-filling curve over the torus
    CmiLock(tmgr_lock);
    //{
    TopoManager tmgr;
    int xdim = tmgr.getDimNX();
    int ydim = tmgr.getDimNY();
    int zdim = tmgr.getDimNZ();
    int xdim1 = find_level_grid(xdim);
    int ydim1 = find_level_grid(ydim);
    int zdim1 = find_level_grid(zdim);
    if(CkMyPe() == 0)
      printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);

    vector<Coord> result;
    SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
    sort_sfc(xprocs, tmgr, result);
    sort_sfc(yprocs, tmgr, result);
    sort_sfc(zprocs, tmgr, result);
    //}
    CmiUnlock(tmgr_lock);
#endif


    if(CkMyPe() == 0){
      iout << iINFO << "PME Z PENCIL LOCATIONS:";
      for ( i=0; i<zprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
        iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << zprocs[i];
#endif
      }
      if ( i < zprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          pencilPMEProcessors[zprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME Y PENCIL LOCATIONS:";
      for ( i=0; i<yprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
        iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << yprocs[i];
#endif
      }
      if ( i < yprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, z = 0; z < zBlocks; ++z )
        for (x = 0; x < xBlocks; ++x, ++pe ) {
          pencilPMEProcessors[yprocs[pe]] = 1;
        }
    }

    if(CkMyPe() == 0){
      iout << iINFO << "PME X PENCIL LOCATIONS:";
      for ( i=0; i<xprocs.size() && i<10; ++i ) {
#if USE_TOPO_SFC
        int x,y,z,t;
        tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
        iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
#else
        iout << " " << xprocs[i];
#endif
      }
      if ( i < xprocs.size() ) iout << " ...";
      iout << "\n" << endi;
    }

    if (CkMyRank() == 0) {
      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          pencilPMEProcessors[xprocs[pe]] = 1;
        }
    }


    // creating the pencil arrays
    if ( CkMyPe() == 0 ){
#if !USE_RANDOM_TOPO
      // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
      WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
      std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
      std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
#endif
#if 1
      // map-object-based creation: build PmePencilMap placement objects,
      // then create the three pencil chare arrays with static insertion
      CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
      CProxy_PmePencilMap ym;
      if ( simParams->PMEPencilsYLayout )
        ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
      else
        ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
      CProxy_PmePencilMap xm;
      if ( simParams->PMEPencilsXLayout )
        xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
      else
        xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
      pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
      CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
      CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
      CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
      zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
      yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
      xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
      zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
#else
      // manual element-by-element insertion (disabled alternative)
      zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
      yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
      xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (y = 0; y < yBlocks; ++y, ++pe ) {
          zPencil(x,y,0).insert(zprocs[pe]);
        }
      zPencil.doneInserting();

      for (pe=0, x = 0; x < xBlocks; ++x)
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          yPencil(x,0,z).insert(yprocs[pe]);
        }
      yPencil.doneInserting();


      for (pe=0, y = 0; y < yBlocks; ++y )
        for (z = 0; z < zBlocks; ++z, ++pe ) {
          xPencil(0,y,z).insert(xprocs[pe]);
        }
      xPencil.doneInserting();
#endif

      // broadcast proxies and initialization data to all pencils
      pmeProxy.recvArrays(xPencil,yPencil,zPencil);
      PmePencilInitMsgData msgdata;
      msgdata.grid = myGrid;
      msgdata.xBlocks = xBlocks;
      msgdata.yBlocks = yBlocks;
      msgdata.zBlocks = zBlocks;
      msgdata.xPencil = xPencil;
      msgdata.yPencil = yPencil;
      msgdata.zPencil = zPencil;
      msgdata.pmeProxy = pmeProxyDir;
      msgdata.pmeNodeProxy = pmeNodeProxy;
      msgdata.xm = xm;
      msgdata.ym = ym;
      msgdata.zm = zm;
      xPencil.init(new PmePencilInitMsg(msgdata));
      yPencil.init(new PmePencilInitMsg(msgdata));
      zPencil.init(new PmePencilInitMsg(msgdata));
    }

    return;  // continue in initialize_pencils() at next startup stage
  }


  // ---- slab decomposition only from here on ----

  // assign x-slabs to grid PEs and (post-transpose) y-slabs to trans PEs
  int pe;
  int nx = 0;
  for ( pe = 0; pe < numGridPes; ++pe ) {
    localInfo[pe].x_start = nx;
    nx += myGrid.block1;
    if ( nx > myGrid.K1 ) nx = myGrid.K1;
    localInfo[pe].nx = nx - localInfo[pe].x_start;
  }
  int ny = 0;
  for ( pe = 0; pe < numTransPes; ++pe ) {
    localInfo[pe].y_start_after_transpose = ny;
    ny += myGrid.block2;
    if ( ny > myGrid.K2 ) ny = myGrid.K2;
    localInfo[pe].ny_after_transpose =
      ny - localInfo[pe].y_start_after_transpose;
  }

  { // decide how many pes this node exchanges charges with

    PatchMap *patchMap = PatchMap::Object();
    Lattice lattice = simParams->lattice;
    BigReal sysdima = lattice.a_r().unit() * lattice.a();
    BigReal cutoff = simParams->cutoff;
    BigReal patchdim = simParams->patchDimension;
    int numPatches = patchMap->numPatches();
    int numNodes = CkNumPes();
    int *source_flags = new int[numNodes];
    int node;
    for ( node=0; node<numNodes; ++node ) {
      source_flags[node] = 0;
      recipPeDest[node] = 0;
    }

    // // make sure that we don't get ahead of ourselves on this node
    // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
    //   source_flags[CkMyPe()] = 1;
    //   recipPeDest[myRecipPe] = 1;
    // }

    // for each patch, find the range of x grid lines it can touch
    // (with cutoff margin) and mark the sources/destinations implied
    for ( int pid=0; pid < numPatches; ++pid ) {
      int pnode = patchMap->node(pid);
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
      if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
#endif
      int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
      BigReal minx = patchMap->min_a(pid);
      BigReal maxx = patchMap->max_a(pid);
      BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
      // min1 (max1) is smallest (largest) grid line for this patch
      int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
      int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
      for ( int i=min1; i<=max1; ++i ) {
        int ix = i;
        while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
        while ( ix < 0 ) ix += myGrid.K1;
        // set source_flags[pnode] if this patch sends to our node
        if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
             ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
          source_flags[pnode] = 1;
        }
        // set dest_flags[] for node that our patch sends to
#if defined(NAMD_CUDA) || defined(NAMD_HIP)
        if ( offload ) {
          if ( pnode == CkNodeFirst(CkMyNode()) ) {
            recipPeDest[ix / myGrid.block1] = 1;
          }
        } else
#endif
        if ( pnode == CkMyPe() ) {
          recipPeDest[ix / myGrid.block1] = 1;
        }
      }
    }

    int numSourcesSamePhysicalNode = 0;
    numSources = 0;
    numDestRecipPes = 0;
    for ( node=0; node<numNodes; ++node ) {
      if ( source_flags[node] ) ++numSources;
      if ( recipPeDest[node] ) ++numDestRecipPes;
      if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
    }

#if 0
    if ( numSources ) {
      CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
                CkMyPe(), numSourcesSamePhysicalNode, numSources);
      iout << iINFO << "PME " << CkMyPe() << " sources:";
      for ( node=0; node<numNodes; ++node ) {
        if ( source_flags[node] ) iout << " " << node;
      }
      iout << "\n" << endi;
    }
#endif

    delete [] source_flags;

    // CkPrintf("PME on node %d has %d sources and %d destinations\n",
    //           CkMyPe(), numSources, numDestRecipPes);

  } // decide how many pes this node exchanges charges with (end)

  ungrid_count = numDestRecipPes;

  sendTransBarrier_received = 0;

  if ( myGridPe < 0 && myTransPe < 0 ) return;
  // the following only for nodes doing reciprocal sum

  if ( myTransPe >= 0 ) {
    recipEvirPe = findRecipEvirPe();
    pmeProxy[recipEvirPe].addRecipEvirClient();
  }

  // reciprocal-space solver for this PE's y-slab (post-transpose)
  if ( myTransPe >= 0 ) {
    int k2_start = localInfo[myTransPe].y_start_after_transpose;
    int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
    #ifdef OPENATOM_VERSION
    if ( simParams->openatomOn ) {
      CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
    } else {
      myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    }
    #else  // OPENATOM_VERSION
    myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
    #endif  // OPENATOM_VERSION
  }

  // allocate the charge grid; buffer must fit both the grid-PE layout
  // and the trans-PE (transposed) layout
  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
  if ( local_size < local_size_2 ) local_size = local_size_2;
  qgrid = new float[local_size*numGrids];
  if ( numGridPes > 1 || numTransPes > 1 ) {
    kgrid = new float[local_size*numGrids];
  } else {
    kgrid = qgrid;  // single PE does both roles in place
  }
  qgrid_size = local_size;

  if ( myGridPe >= 0 ) {
    qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
    qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
    fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
    fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
  }

  // build FFTW plans (planner is not thread-safe, hence the lock)
  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
#ifdef NAMD_FFTW
  CmiLock(fftw_plan_lock);
#ifdef NAMD_FFTW_3
  work = new fftwf_complex[n[0]];
  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
  if ( myGridPe >= 0 ) {
    forward_plan_yz=new fftwf_plan[numGrids];
    backward_plan_yz=new fftwf_plan[numGrids];
  }
  if ( myTransPe >= 0 ) {
    forward_plan_x=new fftwf_plan[numGrids];
    backward_plan_x=new fftwf_plan[numGrids];
  }
  /* need one plan per grid */
  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    // step 1: in-place 2D r2c over y-z for each of this PE's x slabs
    for( int g=0; g<numGrids; g++)
    {
      forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
          localInfo[myGridPe].nx,
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2 * (myGrid.dim3/2),
          fftwFlags);
    }
  }
  int zdim = myGrid.dim3;
  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    // step 2: strided 1D c2c along x on the transposed data
    for( int g=0; g<numGrids; g++)
    {

      forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_FORWARD,fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    // step 3: backward counterpart of step 2
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          (fftwf_complex *)
          (kgrid+qgrid_size*g),
          NULL,
          xStride,
          1,
          FFTW_BACKWARD, fftwFlags);

    }
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    // step 4: backward counterpart of step 1 (c2r over y-z)
    for( int g=0; g<numGrids; g++)
    {
      backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
          localInfo[myGridPe].nx,
          (fftwf_complex *)
          (qgrid + qgrid_size * g),
          NULL,
          1,
          myGrid.dim2*(myGrid.dim3/2),
          qgrid + qgrid_size * g,
          NULL,
          1,
          myGrid.dim2 * myGrid.dim3,
          fftwFlags);
    }
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;

#else
  // legacy FFTW 2.x plans (shared across grids; see gridCalc* for use)
  work = new fftw_complex[n[0]];

  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
  if ( myGridPe >= 0 ) {
    forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " 2..." << endi;
  if ( myTransPe >= 0 ) {
    forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 3..." << endi;
  if ( myTransPe >= 0 ) {
    backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
        localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
  }
  if ( ! CkMyPe() ) iout << " 4..." << endi;
  if ( myGridPe >= 0 ) {
    backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
        ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
        | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
  }
  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
#endif
  CmiUnlock(fftw_plan_lock);
#else
  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
#endif

  if ( myGridPe >= 0 && numSources == 0 )
    NAMD_bug("PME grid elements exist without sources.");
  grid_count = numSources;
  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
  trans_count = numGridPes;
}
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:440
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
static void sortPmePes(int *pmepes, int xdim, int ydim)
Definition: WorkDistrib.C:305
int numNodesWithPatches(void)
Definition: PatchMap.h:61
int size(void) const
Definition: ResizeArray.h:131
int dim3
Definition: PmeBase.h:22
CProxy_ComputePmeMgr pmeProxy
Definition: ComputePme.C:243
BigReal max_a(int pid) const
Definition: PatchMap.h:92
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
CProxy_PmeZPencil zPencil
Definition: ComputePme.C:242
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
CProxy_PmeYPencil yPencil
Definition: ComputePme.C:241
CProxy_PmePencilMap zm
Definition: ComputePme.C:247
CProxy_PmePencilMap xm
Definition: ComputePme.C:245
CProxy_NodePmeMgr pmeNodeProxy
Definition: ComputePme.C:244
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int add(const Elem &elem)
Definition: ResizeArray.h:101
void resize(int i)
Definition: ResizeArray.h:84
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int block3
Definition: PmeBase.h:24
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes)
Definition: ComputePme.C:318
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
void NAMD_die(const char *err_msg)
Definition: common.C:147
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
static int findRecipEvirPe()
Definition: ComputePme.C:267
static int * peDiffuseOrdering
Definition: WorkDistrib.h:115
int ny_after_transpose
Definition: ComputePme.C:259
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:129
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
CProxy_PmePencilMap ym
Definition: ComputePme.C:246
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
bool one_device_per_node()
Definition: DeviceCUDA.C:547
char * pencilPMEProcessors
Definition: ComputePme.C:133
int64_t int64
Definition: common.h:39
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123
CProxy_PmeXPencil xPencil
Definition: ComputePme.C:240
int y_start_after_transpose
Definition: ComputePme.C:259

◆ initialize_computes()

void ComputePmeMgr::initialize_computes ( )

Definition at line 2756 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, master_pe, NAMD_bug(), ComputePmeUtil::numGrids, PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

2756  {
2757 
2758  noWorkCount = 0;
2759  doWorkCount = 0;
2760  ungridForcesCount = 0;
2761 
 2762  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
 2763 
 2764  SimParameters *simParams = Node::Object()->simParameters;
 2765 
2766  strayChargeErrors = 0;
2767 
2768 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2769  PatchMap *patchMap = PatchMap::Object();
2770  int pe = master_pe = CkNodeFirst(CkMyNode());
2771  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2772  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2773  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2774  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2775  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
 2776  if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
 2777  && pe != deviceCUDA->getMasterPe() ) {
2778  master_pe = pe;
2779  }
2780  }
2781  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2782  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2783  }
2784 
2785  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2786  bool cudaFirst = 1;
2787  if ( offload ) {
2788  CmiLock(cuda_lock);
2789  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2790  }
2791 
2792  if ( cudaFirst ) {
2793  nodePmeMgr->master_pe = master_pe;
2794  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2795  }
2796 #endif
2797 
2798  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2799  fsize = myGrid.K1 * myGrid.dim2;
2800  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2801 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2802  if ( ! offload )
2803 #endif
2804  {
2805  q_arr = new float*[fsize*numGrids];
2806  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2807  q_list = new float*[fsize*numGrids];
2808  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2809  q_count = 0;
2810  }
2811 
2812 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2813  if ( cudaFirst || ! offload ) {
2814 #endif
2815  f_arr = new char[fsize*numGrids];
2816  // memset to non-zero value has race condition on BlueGene/Q
2817  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2818  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2819 
2820  for ( int g=0; g<numGrids; ++g ) {
2821  char *f = f_arr + g*fsize;
2822  if ( usePencils ) {
2823  int K1 = myGrid.K1;
2824  int K2 = myGrid.K2;
2825  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2826  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2827  int dim2 = myGrid.dim2;
2828  for (int ap=0; ap<numPencilsActive; ++ap) {
2829  int ib = activePencils[ap].i;
2830  int jb = activePencils[ap].j;
2831  int ibegin = ib*block1;
2832  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2833  int jbegin = jb*block2;
2834  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2835  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2836  for ( int i=ibegin; i<iend; ++i ) {
2837  for ( int j=jbegin; j<jend; ++j ) {
2838  f[i*dim2+j] = 0;
2839  }
2840  }
2841  }
2842  } else {
2843  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2844  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2845  for (int pe=0; pe<numGridPes; pe++) {
2846  if ( ! recipPeDest[pe] ) continue;
2847  int start = pe * bsize;
2848  int len = bsize;
2849  if ( start >= qsize ) { start = 0; len = 0; }
2850  if ( start + len > qsize ) { len = qsize - start; }
2851  int zdim = myGrid.dim3;
2852  int fstart = start / zdim;
2853  int flen = len / zdim;
2854  memset(f + fstart, 0, flen*sizeof(char));
2855  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2856  }
2857  }
2858  }
2859 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2860  }
2861  if ( offload ) {
2862  cudaSetDevice(deviceCUDA->getDeviceID());
2863  if ( cudaFirst ) {
2864 
2865  int f_alloc_count = 0;
2866  for ( int n=fsize, i=0; i<n; ++i ) {
2867  if ( f_arr[i] == 0 ) {
2868  ++f_alloc_count;
2869  }
2870  }
2871  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2872 
2873  q_arr = new float*[fsize*numGrids];
2874  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2875 
2876  float **q_arr_dev_host = new float*[fsize];
2877  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2878 
2879  float **v_arr_dev_host = new float*[fsize];
2880  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2881 
2882  int q_stride = myGrid.K3+myGrid.order-1;
2883  q_data_size = f_alloc_count * q_stride * sizeof(float);
2884  ffz_size = (fsize + q_stride) * sizeof(int);
2885 
2886  // tack ffz onto end of q_data to allow merged transfer
2887  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2888  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2889  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2890  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2891  cudaMalloc((void**) &v_data_dev, q_data_size);
2892  cuda_errcheck("malloc grid data for pme");
2893  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2894  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2895  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2896  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2897  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2898 
2899  f_alloc_count = 0;
2900  for ( int n=fsize, i=0; i<n; ++i ) {
2901  if ( f_arr[i] == 0 ) {
2902  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2903  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2904  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2905  ++f_alloc_count;
2906  } else {
2907  q_arr[i] = 0;
2908  q_arr_dev_host[i] = 0;
2909  v_arr_dev_host[i] = 0;
2910  }
2911  }
2912 
2913  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2914  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2915  delete [] q_arr_dev_host;
2916  delete [] v_arr_dev_host;
2917  delete [] f_arr;
2918  f_arr = new char[fsize + q_stride];
2919  fz_arr = f_arr + fsize;
2920  memset(f_arr, 0, fsize + q_stride);
2921  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2922 
2923  cuda_errcheck("initialize grid data for pme");
2924 
2925  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2926  cuda_errcheck("initialize bspline coefficients for pme");
2927 
2928 #define XCOPY(X) masterPmeMgr->X = X;
2929  XCOPY(bspline_coeffs_dev)
2930  XCOPY(bspline_dcoeffs_dev)
2931  XCOPY(q_arr)
2932  XCOPY(q_arr_dev)
2933  XCOPY(v_arr_dev)
2934  XCOPY(q_data_size)
2935  XCOPY(q_data_host)
2936  XCOPY(q_data_dev)
2937  XCOPY(v_data_dev)
2938  XCOPY(ffz_size)
2939  XCOPY(ffz_host)
2940  XCOPY(ffz_dev)
2941  XCOPY(f_arr)
2942  XCOPY(fz_arr)
2943 #undef XCOPY
2944  //CkPrintf("pe %d init first\n", CkMyPe());
2945  } else { // cudaFirst
2946  //CkPrintf("pe %d init later\n", CkMyPe());
2947 #define XCOPY(X) X = masterPmeMgr->X;
2948  XCOPY(bspline_coeffs_dev)
2949  XCOPY(bspline_dcoeffs_dev)
2950  XCOPY(q_arr)
2951  XCOPY(q_arr_dev)
2952  XCOPY(v_arr_dev)
2953  XCOPY(q_data_size)
2954  XCOPY(q_data_host)
2955  XCOPY(q_data_dev)
2956  XCOPY(v_data_dev)
2957  XCOPY(ffz_size)
2958  XCOPY(ffz_host)
2959  XCOPY(ffz_dev)
2960  XCOPY(f_arr)
2961  XCOPY(fz_arr)
2962 #undef XCOPY
2963  } // cudaFirst
2964  CmiUnlock(cuda_lock);
2965  } else // offload
2966 #endif // NAMD_CUDA
2967  {
2968  fz_arr = new char[myGrid.K3+myGrid.order-1];
2969  }
2970 
2971 #if 0 && USE_PERSISTENT
2972  recvGrid_handle = NULL;
2973 #endif
2974 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
int dim3
Definition: PmeBase.h:22
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:366
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:279
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int chargeGridSubmittedCount
Definition: ComputePme.C:470
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
#define XCOPY(X)
int getDeviceID()
Definition: DeviceCUDA.h:144
#define simParams
Definition: Output.C:129
int K3
Definition: PmeBase.h:21
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
int i
Definition: ComputePme.C:369
int j
Definition: ComputePme.C:369
static CmiNodeLock cuda_lock
Definition: ComputePme.C:450

◆ initialize_pencils()

void ComputePmeMgr::initialize_pencils ( CkQdMsg *  msg)

Definition at line 1719 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), PatchMap::Object(), Node::Object(), PmeGrid::order, Random::reorder(), Node::simParameters, simParams, and Vector::unit().

1719  {
1720  delete msg;
1721  if ( ! usePencils ) return;
1722 
 1723  SimParameters *simParams = Node::Object()->simParameters;
 1724 
1725  PatchMap *patchMap = PatchMap::Object();
1726  Lattice lattice = simParams->lattice;
1727  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1728  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1729  BigReal cutoff = simParams->cutoff;
1730  BigReal patchdim = simParams->patchDimension;
1731  int numPatches = patchMap->numPatches();
1732 
1733  pencilActive = new char[xBlocks*yBlocks];
1734  for ( int i=0; i<xBlocks; ++i ) {
1735  for ( int j=0; j<yBlocks; ++j ) {
1736  pencilActive[i*yBlocks+j] = 0;
1737  }
1738  }
1739 
1740  for ( int pid=0; pid < numPatches; ++pid ) {
1741  int pnode = patchMap->node(pid);
1742 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1743  if ( offload ) {
1744  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1745  } else
1746 #endif
1747  if ( pnode != CkMyPe() ) continue;
1748 
1749  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1750  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1751 
1752  BigReal minx = patchMap->min_a(pid);
1753  BigReal maxx = patchMap->max_a(pid);
1754  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1755  // min1 (max1) is smallest (largest) grid line for this patch
1756  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1757  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1758 
1759  BigReal miny = patchMap->min_b(pid);
1760  BigReal maxy = patchMap->max_b(pid);
1761  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1762  // min2 (max2) is smallest (largest) grid line for this patch
1763  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1764  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1765 
1766  for ( int i=min1; i<=max1; ++i ) {
1767  int ix = i;
1768  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1769  while ( ix < 0 ) ix += myGrid.K1;
1770  for ( int j=min2; j<=max2; ++j ) {
1771  int jy = j;
1772  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1773  while ( jy < 0 ) jy += myGrid.K2;
1774  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1775  }
1776  }
1777  }
1778 
1779  numPencilsActive = 0;
1780  for ( int i=0; i<xBlocks; ++i ) {
1781  for ( int j=0; j<yBlocks; ++j ) {
1782  if ( pencilActive[i*yBlocks+j] ) {
1783  ++numPencilsActive;
1784 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1785  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1786 #endif
1787  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1788  }
1789  }
1790  }
1791  activePencils = new ijpair[numPencilsActive];
1792  numPencilsActive = 0;
1793  for ( int i=0; i<xBlocks; ++i ) {
1794  for ( int j=0; j<yBlocks; ++j ) {
1795  if ( pencilActive[i*yBlocks+j] ) {
1796  activePencils[numPencilsActive++] = ijpair(i,j);
1797  }
1798  }
1799  }
1800  if ( simParams->PMESendOrder ) {
1801  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1802  } else {
1803  Random rand(CkMyPe());
1804  rand.reorder(activePencils,numPencilsActive);
1805  }
1806  //if ( numPencilsActive ) {
1807  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1808  //}
1809 
1810  ungrid_count = numPencilsActive;
1811 }
static Node * Object()
Definition: Node.h:86
BigReal max_a(int pid) const
Definition: PatchMap.h:92
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
Definition: Random.h:37
int numPatches(void) const
Definition: PatchMap.h:59
int order
Definition: PmeBase.h:23
int getMasterPe()
Definition: DeviceCUDA.h:137
NAMD_HOST_DEVICE Vector a_r() const
Definition: Lattice.h:284
NAMD_HOST_DEVICE Vector b_r() const
Definition: Lattice.h:285
BigReal min_a(int pid) const
Definition: PatchMap.h:91
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
NAMD_HOST_DEVICE Vector b() const
Definition: Lattice.h:269
#define simParams
Definition: Output.C:129
BigReal max_b(int pid) const
Definition: PatchMap.h:94
int node(int pid) const
Definition: PatchMap.h:114
NAMD_HOST_DEVICE Vector a() const
Definition: Lattice.h:268
BigReal min_b(int pid) const
Definition: PatchMap.h:93
NAMD_HOST_DEVICE Vector unit(void) const
Definition: Vector.h:215
double BigReal
Definition: common.h:123

◆ pollChargeGridReady()

void ComputePmeMgr::pollChargeGridReady ( )

Definition at line 3566 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

3566  {
3567 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3568  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 3569  CUDA_POLL(cuda_check_pme_charges,this);
 3570 #else
3571  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3572 #endif
3573 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2496
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_charges(void *arg, double walltime)
Definition: ComputePme.C:3493

◆ pollForcesReady()

void ComputePmeMgr::pollForcesReady ( )

Definition at line 2692 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

2692  {
2693 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2694  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
 2695  CUDA_POLL(cuda_check_pme_forces,this);
 2696 #else
2697  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2698 #endif
2699 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2496
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void cuda_check_pme_forces(void *arg, double walltime)
Definition: ComputePme.C:2503

◆ procTrans()

void ComputePmeMgr::procTrans ( PmeTransMsg msg)

Definition at line 2074 of file ComputePme.C.

References PmeGrid::dim3, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

2074  {
2075  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2076  if ( trans_count == numGridPes ) {
2077  lattice = msg->lattice;
2078  grid_sequence = msg->sequence;
2079  }
2080 
2081  if ( msg->nx ) {
2082  int zdim = myGrid.dim3;
2083  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2084  int first_pe = nodeInfo.pe_start;
2085  int last_pe = first_pe+nodeInfo.npe-1;
2086  int y_skip = localInfo[myTransPe].y_start_after_transpose
2087  - localInfo[first_pe].y_start_after_transpose;
2088  int ny_msg = localInfo[last_pe].y_start_after_transpose
2089  + localInfo[last_pe].ny_after_transpose
2090  - localInfo[first_pe].y_start_after_transpose;
2091  int ny = localInfo[myTransPe].ny_after_transpose;
2092  int x_start = msg->x_start;
2093  int nx = msg->nx;
2094  for ( int g=0; g<numGrids; ++g ) {
2095  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2096  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2097  nx*ny*zdim*sizeof(float));
2098  }
2099  }
2100 
2101  --trans_count;
2102 
2103  if ( trans_count == 0 ) {
2104  pmeProxyDir[CkMyPe()].gridCalc2();
2105  }
2106 }
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:163
int ny_after_transpose
Definition: ComputePme.C:259
Lattice lattice
Definition: ComputePme.C:160
int y_start_after_transpose
Definition: ComputePme.C:259

◆ procUntrans()

void ComputePmeMgr::procUntrans ( PmeUntransMsg msg)

Definition at line 2328 of file ComputePme.C.

References PmeGrid::dim3, PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

2328  {
2329  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2330 
2331 #if CMK_BLUEGENEL
2332  CmiNetworkProgressAfter (0);
2333 #endif
2334 
2335  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2336  int first_pe = nodeInfo.pe_start;
2337  int g;
2338 
2339  if ( msg->ny ) {
2340  int zdim = myGrid.dim3;
2341  int last_pe = first_pe+nodeInfo.npe-1;
2342  int x_skip = localInfo[myGridPe].x_start
2343  - localInfo[first_pe].x_start;
2344  int nx_msg = localInfo[last_pe].x_start
2345  + localInfo[last_pe].nx
2346  - localInfo[first_pe].x_start;
2347  int nx = localInfo[myGridPe].nx;
2348  int y_start = msg->y_start;
2349  int ny = msg->ny;
2350  int slicelen = myGrid.K2 * zdim;
2351  int cpylen = ny * zdim;
2352  for ( g=0; g<numGrids; ++g ) {
2353  float *q = qgrid + qgrid_size * g + y_start * zdim;
2354  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2355  for ( int x = 0; x < nx; ++x ) {
2356  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2357  q += slicelen;
2358  qmsg += cpylen;
2359  }
2360  }
2361  }
2362 
2363  --untrans_count;
2364 
2365  if ( untrans_count == 0 ) {
2366  pmeProxyDir[CkMyPe()].gridCalc3();
2367  }
2368 }
float * qgrid
Definition: ComputePme.C:180
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32

◆ recvAck()

void ComputePmeMgr::recvAck ( PmeAckMsg msg)

Definition at line 2470 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

2470  {
2471  if ( msg ) delete msg;
2472 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2473  if ( offload ) {
2474  CmiLock(cuda_lock);
2475  if ( ungrid_count == 0 ) {
2476  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2477  }
2478  int uc = --ungrid_count;
2479  CmiUnlock(cuda_lock);
2480 
2481  if ( uc == 0 ) {
2482  pmeProxyDir[master_pe].ungridCalc();
2483  }
2484  return;
2485  }
2486 #endif
2487  --ungrid_count;
2488 
2489  if ( ungrid_count == 0 ) {
2490  pmeProxyDir[CkMyPe()].ungridCalc();
2491  }
2492 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static CmiNodeLock cuda_lock
Definition: ComputePme.C:450

◆ recvArrays()

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  x,
CProxy_PmeYPencil  y,
CProxy_PmeZPencil  z 
)

Definition at line 826 of file ComputePme.C.

827  {
828  xPencil = x; yPencil = y; zPencil = z;
829 
830  if(CmiMyRank()==0)
831  {
832  pmeNodeProxy.ckLocalBranch()->xPencil=x;
833  pmeNodeProxy.ckLocalBranch()->yPencil=y;
834  pmeNodeProxy.ckLocalBranch()->zPencil=z;
835  }
836 }

◆ recvChargeGridReady()

void ComputePmeMgr::recvChargeGridReady ( )

Definition at line 3575 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

3575  {
 3576  chargeGridReady(*saved_lattice,saved_sequence);
 3577 }
Lattice * saved_lattice
Definition: ComputePme.C:473
void chargeGridReady(Lattice &lattice, int sequence)
Definition: ComputePme.C:3579
int saved_sequence
Definition: ComputePme.C:474

◆ recvGrid()

void ComputePmeMgr::recvGrid ( PmeGridMsg msg)

Definition at line 1853 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

1853  {
1854  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1855  if ( grid_count == 0 ) {
1856  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1857  }
1858  if ( grid_count == numSources ) {
1859  lattice = msg->lattice;
1860  grid_sequence = msg->sequence;
1861  }
1862 
1863  int zdim = myGrid.dim3;
1864  int zlistlen = msg->zlistlen;
1865  int *zlist = msg->zlist;
1866  float *qmsg = msg->qgrid;
1867  for ( int g=0; g<numGrids; ++g ) {
1868  char *f = msg->fgrid + fgrid_len * g;
1869  float *q = qgrid + qgrid_size * g;
1870  for ( int i=0; i<fgrid_len; ++i ) {
1871  if ( f[i] ) {
1872  for ( int k=0; k<zlistlen; ++k ) {
1873  q[zlist[k]] += *(qmsg++);
1874  }
1875  }
1876  q += zdim;
1877  }
1878  }
1879 
1880  gridmsg_reuse[numSources-grid_count] = msg;
1881  --grid_count;
1882 
1883  if ( grid_count == 0 ) {
1884  pmeProxyDir[CkMyPe()].gridCalc1();
1885  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1886  }
1887 }
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:142
static int numGrids
Definition: ComputePme.h:32
Lattice lattice
Definition: ComputePme.C:144
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int zlistlen
Definition: ComputePme.C:147
char * fgrid
Definition: ComputePme.C:149

◆ recvRecipEvir()

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg msg)

Definition at line 3059 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), ComputePmeUtil::numGrids, pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

3059  {
3060  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3061  for ( int g=0; g<numGrids; ++g ) {
3062  evir[g] += msg->evir[g];
3063  }
3064  delete msg;
3065  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3066  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3067 }
int size(void) const
Definition: ResizeArray.h:131
static int numGrids
Definition: ComputePme.h:32
PmeReduction * evir
Definition: ComputePme.C:193
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void submitReductions()
Definition: ComputePme.C:4239
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:480

◆ recvSharedTrans()

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg msg)

Definition at line 2056 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

2056  {
2057  procTrans(msg->msg);
2058  CmiLock(msg->lock);
2059  int count = --(*msg->count);
2060  CmiUnlock(msg->lock);
2061  if ( count == 0 ) {
2062  CmiDestroyLock(msg->lock);
2063  delete msg->count;
2064  delete msg->msg;
2065  }
2066  delete msg;
2067 }
PmeTransMsg * msg
Definition: ComputePme.C:169
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2074
CmiNodeLock lock
Definition: ComputePme.C:171

◆ recvSharedUntrans()

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg msg)

Definition at line 2310 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

2310  {
2311  procUntrans(msg->msg);
2312  CmiLock(msg->lock);
2313  int count = --(*msg->count);
2314  CmiUnlock(msg->lock);
2315  if ( count == 0 ) {
2316  CmiDestroyLock(msg->lock);
2317  delete msg->count;
2318  delete msg->msg;
2319  }
2320  delete msg;
2321 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2328
CmiNodeLock lock
Definition: ComputePme.C:188
PmeUntransMsg * msg
Definition: ComputePme.C:186

◆ recvTrans()

void ComputePmeMgr::recvTrans ( PmeTransMsg msg)

Definition at line 2069 of file ComputePme.C.

References procTrans().

2069  {
2070  procTrans(msg);
2071  delete msg;
2072 }
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2074

◆ recvUngrid()

void ComputePmeMgr::recvUngrid ( PmeGridMsg msg)

Definition at line 2455 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

2455  {
2456  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2457 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2458  if ( ! offload ) // would need lock
2459 #endif
2460  if ( ungrid_count == 0 ) {
2461  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2462  }
2463 
2464  if ( usePencils ) copyPencils(msg);
2465  else copyResults(msg);
2466  delete msg;
2467  recvAck(0);
2468 }
void recvAck(PmeAckMsg *)
Definition: ComputePme.C:2470
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void copyPencils(PmeGridMsg *)
Definition: ComputePme.C:3825
void copyResults(PmeGridMsg *)
Definition: ComputePme.C:4017

◆ recvUntrans()

void ComputePmeMgr::recvUntrans ( PmeUntransMsg msg)

Definition at line 2323 of file ComputePme.C.

References procUntrans().

2323  {
2324  procUntrans(msg);
2325  delete msg;
2326 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2328

◆ sendChargeGridReady()

void ComputePmeMgr::sendChargeGridReady ( )

Definition at line 3552 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

3552  {
3553  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3554  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3555  int cs = mgr->pmeComputes.size();
3556  if ( cs ) {
3557  mgr->ungridForcesCount = cs;
3558  mgr->recipEvirCount = mgr->recipEvirClients;
3559  masterPmeMgr->chargeGridSubmittedCount++;
3560  }
3561  }
3562  pmeProxy[master_pe].recvChargeGridReady();
3563 }
int size(void) const
Definition: ResizeArray.h:131
int chargeGridSubmittedCount
Definition: ComputePme.C:470
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:480

◆ sendData()

void ComputePmeMgr::sendData ( Lattice lattice,
int  sequence 
)

Definition at line 3989 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

3989  {
3990 
3991  sendDataHelper_lattice = &lattice;
3992  sendDataHelper_sequence = sequence;
3993  sendDataHelper_sourcepe = CkMyPe();
3994  sendDataHelper_errors = strayChargeErrors;
3995  strayChargeErrors = 0;
3996 
3997 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3998  if ( offload ) {
3999  for ( int i=0; i < numGridPes; ++i ) {
4000  int pe = gridPeOrder[i]; // different order
4001  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
4002 #if CMK_MULTICORE
4003  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
4004  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
4005 #else
4006  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
4007 #endif
4008  }
4009  } else
4010 #endif
4011  {
4012  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
4013  }
4014 
4015 }
int sendDataHelper_sequence
Definition: ComputePme.C:397
int sendDataHelper_sourcepe
Definition: ComputePme.C:398
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:396
int sendDataHelper_errors
Definition: ComputePme.C:399
void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors)
Definition: ComputePme.C:3867

◆ sendDataHelper()

void ComputePmeMgr::sendDataHelper ( int  iter)

Definition at line 3976 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

3976  {
3977  nodePmeMgr->sendDataHelper(iter);
3978 }
void sendDataHelper(int)
Definition: ComputePme.C:3980

◆ sendDataPart()

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3867 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), iout, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

3867  {
3868 
3869  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3870 
3871  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3872 
3873  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3874  for (int j=first; j<=last; j++) {
3875  int pe = gridPeOrder[j]; // different order
3876  if ( ! recipPeDest[pe] && ! errors ) continue;
3877  int start = pe * bsize;
3878  int len = bsize;
3879  if ( start >= qsize ) { start = 0; len = 0; }
3880  if ( start + len > qsize ) { len = qsize - start; }
3881  int zdim = myGrid.dim3;
3882  int fstart = start / zdim;
3883  int flen = len / zdim;
3884  int fcount = 0;
3885  int i;
3886 
3887  int g;
3888  for ( g=0; g<numGrids; ++g ) {
3889  char *f = f_arr + fstart + g*fsize;
3890 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3891  if ( offload ) {
3892  int errcount = 0;
3893  for ( i=0; i<flen; ++i ) {
3894  f[i] = ffz_host[fstart+i];
3895  fcount += f[i];
3896  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3897  }
3898  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3899  } else
3900 #endif
3901  for ( i=0; i<flen; ++i ) {
3902  fcount += f[i];
3903  }
3904  if ( ! recipPeDest[pe] ) {
3905  int errfound = 0;
3906  for ( i=0; i<flen; ++i ) {
3907  if ( f[i] == 3 ) {
3908  errfound = 1;
3909  break;
3910  }
3911  }
3912  if ( errfound ) {
3913  iout << iERROR << "Stray PME grid charges detected: "
3914  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3915  int iz = -1;
3916  for ( i=0; i<flen; ++i ) {
3917  if ( f[i] == 3 ) {
3918  f[i] = 2;
3919  int jz = (i+fstart)/myGrid.K2;
3920  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3921  }
3922  }
3923  iout << "\n" << endi;
3924  }
3925  }
3926  }
3927 
3928 #ifdef NETWORK_PROGRESS
3929  CmiNetworkProgress();
3930 #endif
3931 
3932  if ( ! recipPeDest[pe] ) continue;
3933 
3934  int zlistlen = 0;
3935  for ( i=0; i<myGrid.K3; ++i ) {
3936  if ( fz_arr[i] ) ++zlistlen;
3937  }
3938 
3939  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3940  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3941 
3942  msg->sourceNode = sourcepe;
3943  msg->lattice = lattice;
3944  msg->start = fstart;
3945  msg->len = flen;
3946  msg->zlistlen = zlistlen;
3947  int *zlist = msg->zlist;
3948  zlistlen = 0;
3949  for ( i=0; i<myGrid.K3; ++i ) {
3950  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3951  }
3952  float *qmsg = msg->qgrid;
3953  for ( g=0; g<numGrids; ++g ) {
3954  char *f = f_arr + fstart + g*fsize;
3955  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3956  float **q = q_arr + fstart + g*fsize;
3957  for ( i=0; i<flen; ++i ) {
3958  if ( f[i] ) {
3959  for (int h=0; h<myGrid.order-1; ++h) {
3960  q[i][h] += q[i][myGrid.K3+h];
3961  }
3962  for ( int k=0; k<zlistlen; ++k ) {
3963  *(qmsg++) = q[i][zlist[k]];
3964  }
3965  }
3966  }
3967  }
3968 
3969  msg->sequence = compute_sequence;
3970  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3971  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3972  }
3973 
3974 }
int dim2
Definition: PmeBase.h:22
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:142
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:144
#define iout
Definition: InfoStream.h:51
int sourceNode
Definition: ComputePme.C:141
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int zlistlen
Definition: ComputePme.C:147
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:149

◆ sendPencils()

void ComputePmeMgr::sendPencils ( Lattice &  lattice,
int  sequence 
)

Definition at line 3762 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), ijpair::i, iERROR(), iout, ijpair::j, PmeGrid::K1, PmeGrid::K2, ComputePmeUtil::numGrids, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

3762  {
3763 
3764  sendDataHelper_lattice = &lattice;
3765  sendDataHelper_sequence = sequence;
3766  sendDataHelper_sourcepe = CkMyPe();
3767 
3768 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3769  if ( offload ) {
3770  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3771 #if CMK_MULTICORE
3772  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3773  int ib = activePencils[ap].i;
3774  int jb = activePencils[ap].j;
3775  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3776  pmeProxy[destproc].sendPencilsHelper(ap);
3777 #else
3778  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3779 #endif
3780  }
3781  } else
3782 #endif
3783  {
3784  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3785  }
3786 
3787  if ( strayChargeErrors ) {
3788  strayChargeErrors = 0;
3789  iout << iERROR << "Stray PME grid charges detected: "
3790  << CkMyPe() << " sending to (x,y)";
3791  int K1 = myGrid.K1;
3792  int K2 = myGrid.K2;
3793  int dim2 = myGrid.dim2;
3794  int block1 = myGrid.block1;
3795  int block2 = myGrid.block2;
3796  for (int ib=0; ib<xBlocks; ++ib) {
3797  for (int jb=0; jb<yBlocks; ++jb) {
3798  int ibegin = ib*block1;
3799  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3800  int jbegin = jb*block2;
3801  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3802  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3803 
3804  for ( int g=0; g<numGrids; ++g ) {
3805  char *f = f_arr + g*fsize;
3806  if ( ! pencilActive[ib*yBlocks+jb] ) {
3807  for ( int i=ibegin; i<iend; ++i ) {
3808  for ( int j=jbegin; j<jend; ++j ) {
3809  if ( f[i*dim2+j] == 3 ) {
3810  f[i*dim2+j] = 2;
3811  iout << " (" << i << "," << j << ")";
3812  }
3813  }
3814  }
3815  }
3816  }
3817  }
3818  }
3819  iout << "\n" << endi;
3820  }
3821 
3822 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:660
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:24
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:24
int sendDataHelper_sequence
Definition: ComputePme.C:397
int sendDataHelper_sourcepe
Definition: ComputePme.C:398
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:396
int i
Definition: ComputePme.C:369
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int j
Definition: ComputePme.C:369
void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe)
Definition: ComputePme.C:3607

◆ sendPencilsHelper()

void ComputePmeMgr::sendPencilsHelper ( int  iter)

Definition at line 3749 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

3749  {
3750  nodePmeMgr->sendPencilsHelper(iter);
3751 }
void sendPencilsHelper(int)
Definition: ComputePme.C:3753

◆ sendPencilsPart()

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice &  lattice,
int  sequence,
int  sourcepe 
)

Definition at line 3607 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

3607  {
3608 
3609  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3610 
3611 #if 0 && USE_PERSISTENT
3612  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3613 #endif
3614  int K1 = myGrid.K1;
3615  int K2 = myGrid.K2;
3616  int dim2 = myGrid.dim2;
3617  int dim3 = myGrid.dim3;
3618  int block1 = myGrid.block1;
3619  int block2 = myGrid.block2;
3620 
3621  // int savedMessages = 0;
3622  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3623 
3624  for (int ap=first; ap<=last; ++ap) {
3625  int ib = activePencils[ap].i;
3626  int jb = activePencils[ap].j;
3627  int ibegin = ib*block1;
3628  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3629  int jbegin = jb*block2;
3630  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3631  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3632 
3633  int fcount = 0;
3634  for ( int g=0; g<numGrids; ++g ) {
3635  char *f = f_arr + g*fsize;
3636 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3637  if ( offload ) {
3638  int errcount = 0;
3639  for ( int i=ibegin; i<iend; ++i ) {
3640  for ( int j=jbegin; j<jend; ++j ) {
3641  int k = i*dim2+j;
3642  f[k] = ffz_host[k];
3643  fcount += f[k];
3644  if ( ffz_host[k] & ~1 ) ++errcount;
3645  }
3646  }
3647  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3648  } else
3649 #endif
3650  for ( int i=ibegin; i<iend; ++i ) {
3651  for ( int j=jbegin; j<jend; ++j ) {
3652  fcount += f[i*dim2+j];
3653  }
3654  }
3655  }
3656 
3657 #ifdef NETWORK_PROGRESS
3658  CmiNetworkProgress();
3659 #endif
3660 
3661  if ( ! pencilActive[ib*yBlocks+jb] )
3662  NAMD_bug("PME activePencils list inconsistent");
3663 
3664  int zlistlen = 0;
3665  for ( int i=0; i<myGrid.K3; ++i ) {
3666  if ( fz_arr[i] ) ++zlistlen;
3667  }
3668 
3669  int hd = ( fcount? 1 : 0 ); // has data?
3670  // if ( ! hd ) ++savedMessages;
3671 
3672 
3673  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3674  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3675  msg->sourceNode = sourcepe;
3676  msg->hasData = hd;
3677  msg->lattice = lattice;
3678  if ( hd ) {
3679 #if 0
3680  msg->start = fstart;
3681  msg->len = flen;
3682 #else
3683  msg->start = -1; // obsolete?
3684  msg->len = -1; // obsolete?
3685 #endif
3686  msg->zlistlen = zlistlen;
3687  int *zlist = msg->zlist;
3688  zlistlen = 0;
3689  for ( int i=0; i<myGrid.K3; ++i ) {
3690  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3691  }
3692  char *fmsg = msg->fgrid;
3693  float *qmsg = msg->qgrid;
3694  for ( int g=0; g<numGrids; ++g ) {
3695  char *f = f_arr + g*fsize;
3696  float **q = q_arr + g*fsize;
3697  for ( int i=ibegin; i<iend; ++i ) {
3698  for ( int j=jbegin; j<jend; ++j ) {
3699  *(fmsg++) = f[i*dim2+j];
3700  if( f[i*dim2+j] ) {
3701  for (int h=0; h<myGrid.order-1; ++h) {
3702  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3703  }
3704  for ( int k=0; k<zlistlen; ++k ) {
3705  *(qmsg++) = q[i*dim2+j][zlist[k]];
3706  }
3707  }
3708  }
3709  }
3710  }
3711  }
3712 
3713  msg->sequence = compute_sequence;
3714  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3715  CmiEnableUrgentSend(1);
3716 #if USE_NODE_PAR_RECEIVE
3717  msg->destElem=CkArrayIndex3D(ib,jb,0);
3718  CProxy_PmePencilMap lzm = npMgr->zm;
3719  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3720  int destnode = CmiNodeOf(destproc);
3721 
3722 #if 0
3723  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3724 #endif
3725  pmeNodeProxy[destnode].recvZGrid(msg);
3726 #if 0
3727  CmiUsePersistentHandle(NULL, 0);
3728 #endif
3729 #else
3730 #if 0
3731  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3732 #endif
3733  zPencil(ib,jb,0).recvGrid(msg);
3734 #if 0
3735  CmiUsePersistentHandle(NULL, 0);
3736 #endif
3737 #endif
3738  CmiEnableUrgentSend(0);
3739  }
3740 
3741 
3742  // if ( savedMessages ) {
3743  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3744  // }
3745 
3746 }
int dim2
Definition: PmeBase.h:22
CProxy_PmePencilMap zm
Definition: ComputePme.C:660
int dim3
Definition: PmeBase.h:22
int sequence
Definition: ComputePme.C:142
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:24
Lattice lattice
Definition: ComputePme.C:144
int block2
Definition: PmeBase.h:24
int sourceNode
Definition: ComputePme.C:141
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:23
void NAMD_bug(const char *err_msg)
Definition: common.C:195
CkArrayIndex3D destElem
Definition: ComputePme.C:151
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int K3
Definition: PmeBase.h:21
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
int i
Definition: ComputePme.C:369
int zlistlen
Definition: ComputePme.C:147
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:149
int j
Definition: ComputePme.C:369

◆ sendTrans()

void ComputePmeMgr::sendTrans ( void  )

Definition at line 1965 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

1965  {
1966 
1967  untrans_count = numTransPes;
1968 
1969 #if CMK_SMP && USE_CKLOOP
1970  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1971  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1972  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1973  } else
1974 #endif
1975  {
1976  sendTransSubset(0, numTransNodes-1);
1977  }
1978 
1979 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:1960
#define CKLOOP_CTRL_PME_SENDTRANS
Definition: SimParameters.h:97
void sendTransSubset(int first, int last)
Definition: ComputePme.C:1981

◆ sendTransBarrier()

void ComputePmeMgr::sendTransBarrier ( void  )

Definition at line 1950 of file ComputePme.C.

1950  {
1951  sendTransBarrier_received += 1;
1952  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1953  if ( sendTransBarrier_received < numGridPes ) return;
1954  sendTransBarrier_received = 0;
1955  for ( int i=0; i<numGridPes; ++i ) {
1956  pmeProxyDir[gridPeMap[i]].sendTrans();
1957  }
1958 }

◆ sendTransSubset()

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1981 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), PmeGrid::K2, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, LocalPmeInfo::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendTrans(), and sendTrans().

1981  {
1982  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1983 
1984  // send data for transpose
1985  int zdim = myGrid.dim3;
1986  int nx = localInfo[myGridPe].nx;
1987  int x_start = localInfo[myGridPe].x_start;
1988  int slicelen = myGrid.K2 * zdim;
1989 
1990  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1991 
1992 #if CMK_BLUEGENEL
1993  CmiNetworkProgressAfter (0);
1994 #endif
1995 
1996  for (int j=first; j<=last; j++) {
1997  int node = transNodeOrder[j]; // different order on each node
1998  int pe = transNodeInfo[node].pe_start;
1999  int npe = transNodeInfo[node].npe;
2000  int totlen = 0;
2001  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
2002  LocalPmeInfo &li = localInfo[pe];
2003  int cpylen = li.ny_after_transpose * zdim;
2004  totlen += cpylen;
2005  }
2006  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
2007  PRIORITY_SIZE) PmeTransMsg;
2008  newmsg->sourceNode = myGridPe;
2009  newmsg->lattice = lattice;
2010  newmsg->x_start = x_start;
2011  newmsg->nx = nx;
2012  for ( int g=0; g<numGrids; ++g ) {
2013  float *qmsg = newmsg->qgrid + nx * totlen * g;
2014  pe = transNodeInfo[node].pe_start;
2015  for (int i=0; i<npe; ++i, ++pe) {
2016  LocalPmeInfo &li = localInfo[pe];
2017  int cpylen = li.ny_after_transpose * zdim;
2018  if ( node == myTransNode ) {
2019  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2020  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2021  }
2022  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2023  for ( int x = 0; x < nx; ++x ) {
2024  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2025  q += slicelen;
2026  qmsg += cpylen;
2027  }
2028  }
2029  }
2030  newmsg->sequence = grid_sequence;
2031  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2032  if ( node == myTransNode ) newmsg->nx = 0;
2033  if ( npe > 1 ) {
2034  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2035  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2036  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2037  }
2038 }
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:163
void fwdSharedTrans(PmeTransMsg *)
Definition: ComputePme.C:2040
#define PRIORITY_SIZE
Definition: Priorities.h:13
int sourceNode
Definition: ComputePme.C:157
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
int ny_after_transpose
Definition: ComputePme.C:259
Lattice lattice
Definition: ComputePme.C:160
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:259

◆ sendUngrid()

void ComputePmeMgr::sendUngrid ( void  )

Definition at line 2395 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, ComputePmeUtil::numGrids, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

2395  {
2396 
2397 #if CMK_SMP && USE_CKLOOP
2398  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2399  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2400  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2401  } else
2402 #endif
2403  {
2404  sendUngridSubset(0, numSources-1);
2405  }
2406 
2407  grid_count = numSources;
2408  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2409 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
static void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2390
#define CKLOOP_CTRL_PME_SENDUNTRANS
void sendUngridSubset(int first, int last)
Definition: ComputePme.C:2411

◆ sendUngridSubset()

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2411 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::len, ComputePmeUtil::numGrids, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, PmeGridMsg::qgrid, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

2411  {
2412 
2413 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2414  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2415 #else
2416  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2417 #endif
2418 
2419  for ( int j=first; j<=last; ++j ) {
2420  // int msglen = qgrid_len;
2421  PmeGridMsg *newmsg = gridmsg_reuse[j];
2422  int pe = newmsg->sourceNode;
2423  int zdim = myGrid.dim3;
2424  int flen = newmsg->len;
2425  int fstart = newmsg->start;
2426  int zlistlen = newmsg->zlistlen;
2427  int *zlist = newmsg->zlist;
2428  float *qmsg = newmsg->qgrid;
2429  for ( int g=0; g<numGrids; ++g ) {
2430  char *f = newmsg->fgrid + fgrid_len * g;
2431  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2432  for ( int i=0; i<flen; ++i ) {
2433  if ( f[i] ) {
2434  for ( int k=0; k<zlistlen; ++k ) {
2435  *(qmsg++) = q[zlist[k]];
2436  }
2437  }
2438  q += zdim;
2439  }
2440  }
2441  newmsg->sourceNode = myGridPe;
2442 
2443  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2444  CmiEnableUrgentSend(1);
2445 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2446  if ( offload ) {
2447  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2448  } else
2449 #endif
2450  pmeProxyDir[pe].recvUngrid(newmsg);
2451  CmiEnableUrgentSend(0);
2452  }
2453 }
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
int dim3
Definition: PmeBase.h:22
static int numGrids
Definition: ComputePme.h:32
#define PME_OFFLOAD_UNGRID_PRIORITY
Definition: Priorities.h:42
int sourceNode
Definition: ComputePme.C:141
float * qgrid
Definition: ComputePme.C:150
int * zlist
Definition: ComputePme.C:148
int zlistlen
Definition: ComputePme.C:147
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:149

◆ sendUntrans()

void ComputePmeMgr::sendUntrans ( void  )

Definition at line 2209 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, ComputePmeUtil::numGrids, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

2209  {
2210 
2211  trans_count = numGridPes;
2212 
2213  { // send energy and virial
2214  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2215  for ( int g=0; g<numGrids; ++g ) {
2216  newmsg->evir[g] = recip_evir2[g];
2217  }
2218  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2219  CmiEnableUrgentSend(1);
2220  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2221  CmiEnableUrgentSend(0);
2222  }
2223 
2224 #if CMK_SMP && USE_CKLOOP
2225  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2226  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2227  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2228  } else
2229 #endif
2230  {
2231  sendUntransSubset(0, numGridNodes-1);
2232  }
2233 
2234 }
static Node * Object()
Definition: Node.h:86
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
SimParameters * simParameters
Definition: Node.h:181
static int numGrids
Definition: ComputePme.h:32
void sendUntransSubset(int first, int last)
Definition: ComputePme.C:2236
PmeReduction * evir
Definition: ComputePme.C:193
#define CKLOOP_CTRL_PME_SENDUNTRANS
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
static void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2204

◆ sendUntransSubset()

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2236 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, LocalPmeInfo::x_start, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

2236  {
2237 
2238  int zdim = myGrid.dim3;
2239  int y_start = localInfo[myTransPe].y_start_after_transpose;
2240  int ny = localInfo[myTransPe].ny_after_transpose;
2241  int slicelen = myGrid.K2 * zdim;
2242 
2243  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2244 
2245 #if CMK_BLUEGENEL
2246  CmiNetworkProgressAfter (0);
2247 #endif
2248 
2249  // send data for reverse transpose
2250  for (int j=first; j<=last; j++) {
2251  int node = gridNodeOrder[j]; // different order on each node
2252  int pe = gridNodeInfo[node].pe_start;
2253  int npe = gridNodeInfo[node].npe;
2254  int totlen = 0;
2255  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2256  LocalPmeInfo &li = localInfo[pe];
2257  int cpylen = li.nx * zdim;
2258  totlen += cpylen;
2259  }
2260  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2261  newmsg->sourceNode = myTransPe;
2262  newmsg->y_start = y_start;
2263  newmsg->ny = ny;
2264  for ( int g=0; g<numGrids; ++g ) {
2265  float *qmsg = newmsg->qgrid + ny * totlen * g;
2266  pe = gridNodeInfo[node].pe_start;
2267  for (int i=0; i<npe; ++i, ++pe) {
2268  LocalPmeInfo &li = localInfo[pe];
2269  if ( node == myGridNode ) {
2270  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2271  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2272  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2273  int cpylen = ny * zdim;
2274  for ( int x = 0; x < li.nx; ++x ) {
2275  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2276  q += cpylen;
2277  qmsg += slicelen;
2278  }
2279  } else {
2280  CmiMemcpy((void*)qmsg,
2281  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2282  li.nx*ny*zdim*sizeof(float));
2283  qmsg += li.nx*ny*zdim;
2284  }
2285  }
2286  }
2287  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2288  if ( node == myGridNode ) newmsg->ny = 0;
2289  if ( npe > 1 ) {
2290  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2291  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2292  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2293  }
2294 }
float * qgrid
Definition: ComputePme.C:180
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
static int numGrids
Definition: ComputePme.h:32
#define PRIORITY_SIZE
Definition: Priorities.h:13
void fwdSharedUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2296
int ny_after_transpose
Definition: ComputePme.C:259
#define PME_UNTRANS_PRIORITY
Definition: Priorities.h:33
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
int y_start_after_transpose
Definition: ComputePme.C:259

◆ submitReductions()

void ComputePmeMgr::submitReductions ( )

Definition at line 4239 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchFepOn, ComputePmeUtil::alchOn, ComputePmeUtil::alchThermIntOn, SubmitReduction::item(), ComputePmeUtil::lesFactor, ComputePmeUtil::lesOn, WorkDistrib::messageEnqueueWork(), NAMD_bug(), ComputePmeUtil::numGrids, Node::Object(), ComputePmeUtil::pairOn, REDUCTION_ELECT_ENERGY_PME_TI_1, REDUCTION_ELECT_ENERGY_PME_TI_2, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_STRAY_CHARGE_ERRORS, ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and SubmitReduction::submit().

Referenced by ComputePme::doWork(), and recvRecipEvir().

4239  {
4240 
4241  SimParameters *simParams = Node::Object()->simParameters;
4242 
4243  for ( int g=0; g<numGrids; ++g ) {
4244  float scale = 1.;
4245  if (alchOn) {
4246  BigReal elecLambdaUp, elecLambdaDown;
4247  // alchLambda set on each step in ComputePme::ungridForces()
4248  if ( alchLambda < 0 || alchLambda > 1 ) {
4249  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4250  }
4251  elecLambdaUp = simParams->getElecLambda(alchLambda);
4252  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4253  if ( g == 0 ) scale = elecLambdaUp;
4254  else if ( g == 1 ) scale = elecLambdaDown;
4255  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4256  if (alchDecouple) {
4257  if ( g == 2 ) scale = 1-elecLambdaUp;
4258  else if ( g == 3 ) scale = 1-elecLambdaDown;
4259  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4260  }
4261  } else if ( lesOn ) {
4262  scale = 1.0 / lesFactor;
4263  } else if ( pairOn ) {
4264  scale = ( g == 0 ? 1. : -1. );
4265  }
4266  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4267  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4268  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4269  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4270  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4271  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4272  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4273  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4274  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4275  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4276 
4277  float scale2 = 0.;
4278 
4279  // why is this declared/defined again here?
4280  SimParameters *simParams = Node::Object()->simParameters;
4281 
4282  if (alchFepOn) {
4283  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4284  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4285  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4286  if ( g == 0 ) scale2 = elecLambda2Up;
4287  else if ( g == 1 ) scale2 = elecLambda2Down;
4288  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4289  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4290  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4291  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4292  }
4293  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4294 
4295  if (alchThermIntOn) {
4296 
4297  // no decoupling:
4298  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4299  // (interactions between all atoms [partition 0 OR partition 1],
4300  // minus all [within partition 0])
4301  // U = elecLambdaUp * (U[0] - U[2])
4302  // dU/dl = U[0] - U[2];
4303 
4304  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4305  // (interactions between all atoms [partition 0 OR partition 2],
4306  // minus all [within partition 0])
4307  // U = elecLambdaDown * (U[1] - U[2])
4308  // dU/dl = U[1] - U[2];
4309 
4310  // alchDecouple:
4311  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4312  // (interactions between all atoms [partition 0 OR partition 1]
4313  // minus all [within partition 1] minus all [within partition 0]
4314  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4315  // dU/dl = U[0] - U[2] - U[4];
4316 
4317  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4318  // (interactions between all atoms [partition 0 OR partition 2]
4319  // minus all [within partition 2] minus all [within partition 0]
4320  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4321  // dU/dl = U[1] - U[3] - U[4];
4322 
4323 
4324  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4325  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4326  if (!alchDecouple) {
4327  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4328  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4329  }
4330  else { // alchDecouple
4331  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4332  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4333  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4334  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4335  }
4336  }
4337  }
4338 
4339  alchLambda = -1.; // illegal value to catch if not updated
4340 
4341  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4342  reduction->submit();
4343 
4344  for ( int i=0; i<heldComputes.size(); ++i ) {
4345  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4346  }
4347  heldComputes.resize(0);
4348 }
static Node * Object()
Definition: Node.h:86
int size(void) const
Definition: ResizeArray.h:131
SimParameters * simParameters
Definition: Node.h:181
BigReal & item(int i)
Definition: ReductionMgr.h:313
static int numGrids
Definition: ComputePme.h:32
static Bool alchOn
Definition: ComputePme.h:33
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2852
void resize(int i)
Definition: ResizeArray.h:84
void NAMD_bug(const char *err_msg)
Definition: common.C:195
static Bool alchDecouple
Definition: ComputePme.h:36
static int lesFactor
Definition: ComputePme.h:39
#define simParams
Definition: Output.C:129
static Bool pairOn
Definition: ComputePme.h:40
static Bool lesOn
Definition: ComputePme.h:38
void submit(void)
Definition: ReductionMgr.h:324
static Bool alchFepOn
Definition: ComputePme.h:34
double BigReal
Definition: common.h:123
static Bool alchThermIntOn
Definition: ComputePme.h:35

◆ ungridCalc()

void ComputePmeMgr::ungridCalc ( void  )

Definition at line 2545 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

// ungridCalc(): compute per-atom forces from the PME potential grid.
// Offload (CUDA/HIP) path: the master manager copies the host potential grid
// to the device, recursively starts ungridCalc() on the other per-rank
// managers of this node, then launches cuda_pme_forces() in batches of
// EVENT_STRIDE computes, recording one cudaEvent per batch for
// pollForcesReady() to poll later.
// Non-offload path: each held ComputePme is re-enqueued via
// WorkDistrib::messageEnqueueWork() so its own ungridForces() runs.
2545  {
2546  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2547 
2548  ungridForcesCount = pmeComputes.size();
2549 
2550 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2551  if ( offload ) {
2552  //CmiLock(cuda_lock);
2553  cudaSetDevice(deviceCUDA->getDeviceID());
2554 
2555  if ( this == masterPmeMgr ) {
2556  double before = CmiWallTimer();
2557  // XXX prevents something from breaking???
2558  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2559  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2560  // try to make the unspecified launch failures go away
2561  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2562  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2563  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2564 
     // master rank fans out: start ungridCalc() on every other rank's
     // manager on this node that has computes to process
2565  const int myrank = CkMyRank();
2566  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2567  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2568  nodePmeMgr->mgrObjects[i]->ungridCalc();
2569  }
2570  }
2571  if ( ! pmeComputes.size() ) return;
2572  }
2573 
     // lazily create one (timing-disabled) completion event per
     // EVENT_STRIDE-sized batch of computes
2574  if ( ! end_forces ) {
2575  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2576  end_forces = new cudaEvent_t[n];
2577  for ( int i=0; i<n; ++i ) {
2578  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2579  }
2580  }
2581 
2582  const int pcsz = pmeComputes.size();
2583  if ( ! afn_host ) {
2584  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2585  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2586  cuda_errcheck("malloc params for pme");
2587  }
     // count total atoms and (re)grow the pooled force buffers with
     // ~20% headroom when the current allocation is too small
2588  int totn = 0;
2589  for ( int i=0; i<pcsz; ++i ) {
2590  int n = pmeComputes[i]->numGridAtoms[0];
2591  totn += n;
2592  }
2593  if ( totn > f_data_mgr_alloc ) {
2594  if ( f_data_mgr_alloc ) {
2595  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2596  cudaFree(f_data_mgr_dev);
2597  cudaFreeHost(f_data_mgr_host);
2598  }
2599  f_data_mgr_alloc = 1.2 * (totn + 100);
2600  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2601  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2602  cuda_errcheck("malloc forces for pme");
2603  }
2604  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
     // carve per-compute slices out of the pooled force buffers and fill
     // the afn pointer table (atom data, force output, force output + n)
2605  float *f_dev = f_data_mgr_dev;
2606  float *f_host = f_data_mgr_host;
2607  for ( int i=0; i<pcsz; ++i ) {
2608  int n = pmeComputes[i]->numGridAtoms[0];
2609  pmeComputes[i]->f_data_dev = f_dev;
2610  pmeComputes[i]->f_data_host = f_host;
2611  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2612  afn_host[3*i+1] = f_dev;
2613  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2614  f_dev += 3*n;
2615  f_host += 3*n;
2616  }
2617  //CmiLock(cuda_lock);
2618  double before = CmiWallTimer();
2619  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2620  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2621  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
     // make the force stream wait until the potential-grid copy (recorded
     // by the master on stream 0) has completed
2622  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2623  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2624  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2625 
     // launch force-interpolation kernels one EVENT_STRIDE batch at a
     // time; each batch is followed by an async D2H copy of its forces
     // and an event record that pollForcesReady() later checks
2626  for ( int i=0; i<pcsz; ++i ) {
2627  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2628  if ( i%EVENT_STRIDE == 0 ) {
2629  int dimy = pcsz - i;
2630  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2631  int maxn = 0;
2632  int subtotn = 0;
2633  for ( int j=0; j<dimy; ++j ) {
2634  int n = pmeComputes[i+j]->numGridAtoms[0];
2635  subtotn += n;
2636  if ( n > maxn ) maxn = n;
2637  }
2638  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2639  before = CmiWallTimer();
2640  cuda_pme_forces(
2641  bspline_coeffs_dev,
2642  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2643  pmeComputes[i]->a_data_dev,
2644  pmeComputes[i]->f_data_dev,
2645  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2646  streams[stream]);
2647  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2648  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2649  before = CmiWallTimer();
2650  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2651  cudaMemcpyDeviceToHost, streams[stream]);
2652 #if 0
2653  cudaDeviceSynchronize();
2654  fprintf(stderr, "i = %d\n", i);
2655  for(int k=0; k < subtotn*3; k++)
2656  {
2657  fprintf(stderr, "f_data_host[%d][%d] = %f\n", i, k,
2658  pmeComputes[i]->f_data_host[k]);
2659  }
2660 #endif
2661  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2662  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2663  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2664  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2665  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2666  }
2667  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2668  }
2669  //CmiUnlock(cuda_lock);
2670  } else
2671 #endif // NAMD_CUDA || NAMD_HIP
2672  {
2673  for ( int i=0; i<pmeComputes.size(); ++i ) {
2674  WorkDistrib::messageEnqueueWork(pmeComputes[i]); // restored: this line was dropped by the HTML extraction (see References above)
2675  // pmeComputes[i]->ungridForces();
2676  }
2677  }
2678  // submitReductions(); // must follow all ungridForces()
2679 
2680 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2681  if ( offload ) {
     // arm the completion poller: pollForcesReady() on this PE checks the
     // end_forces events recorded above until all forces have arrived
2682  forces_time = CmiWallTimer();
2683  forces_count = ungridForcesCount;
2684  forces_done_count = 0;
2685  pmeProxy[this_pe].pollForcesReady();
2686  }
2687 #endif
2688 
2689  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2690 }
double forces_time
Definition: ComputePme.C:457
int size(void) const
Definition: ResizeArray.h:131
float * a_data_dev
Definition: ComputePme.C:445
#define EVENT_STRIDE
Definition: ComputePme.C:2497
void cuda_errcheck(const char *msg)
Definition: ComputePme.C:67
int K2
Definition: PmeBase.h:21
int K1
Definition: PmeBase.h:21
#define CUDA_EVENT_ID_PME_COPY
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2852
float * f_data_host
Definition: ComputePme.C:446
int order
Definition: PmeBase.h:23
#define CUDA_EVENT_ID_PME_TICK
float * f_data_dev
Definition: ComputePme.C:447
void ungridCalc(void)
Definition: ComputePme.C:2545
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int getDeviceID()
Definition: DeviceCUDA.h:144
int K3
Definition: PmeBase.h:21
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:480
cudaEvent_t * end_forces
Definition: ComputePme.C:453
#define CUDA_EVENT_ID_PME_KERNEL
int forces_done_count
Definition: ComputePme.C:455

Friends And Related Function Documentation

◆ ComputePme

friend class ComputePme
friend

Definition at line 383 of file ComputePme.C.

◆ NodePmeMgr

friend class NodePmeMgr
friend

Definition at line 384 of file ComputePme.C.

Member Data Documentation

◆ a_data_dev

float* ComputePmeMgr::a_data_dev

Definition at line 445 of file ComputePme.C.

Referenced by cuda_submit_charges(), ComputePme::doWork(), and ungridCalc().

◆ a_data_host

float* ComputePmeMgr::a_data_host

Definition at line 444 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ComputePme::doWork().

◆ chargeGridSubmittedCount

int ComputePmeMgr::chargeGridSubmittedCount

◆ charges_time

double ComputePmeMgr::charges_time

Definition at line 456 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

◆ check_charges_count

int ComputePmeMgr::check_charges_count

Definition at line 458 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

◆ check_forces_count

int ComputePmeMgr::check_forces_count

Definition at line 459 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

◆ cuda_atoms_alloc

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 449 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ComputePme::doWork().

◆ cuda_atoms_count

int ComputePmeMgr::cuda_atoms_count

◆ cuda_busy

bool ComputePmeMgr::cuda_busy
static

Definition at line 468 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ cuda_lock

CmiNodeLock ComputePmeMgr::cuda_lock
static

Definition at line 450 of file ComputePme.C.

Referenced by ComputePmeMgr(), ComputePme::doWork(), initialize_computes(), and recvAck().

◆ cuda_submit_charges_deque

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque
static

Definition at line 467 of file ComputePme.C.

Referenced by ComputePme::doWork().

◆ end_charges

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 452 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

◆ end_forces

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 453 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

◆ f_data_dev

float* ComputePmeMgr::f_data_dev

Definition at line 447 of file ComputePme.C.

Referenced by ungridCalc().

◆ f_data_host

float* ComputePmeMgr::f_data_host

Definition at line 446 of file ComputePme.C.

Referenced by ungridCalc().

◆ fftw_plan_lock

CmiNodeLock ComputePmeMgr::fftw_plan_lock
static

◆ forces_count

int ComputePmeMgr::forces_count

Definition at line 454 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_done_count

int ComputePmeMgr::forces_done_count

Definition at line 455 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ forces_time

double ComputePmeMgr::forces_time

Definition at line 457 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

◆ master_pe

int ComputePmeMgr::master_pe

◆ pmeComputes

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

◆ pmemgr_lock

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 441 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

◆ saved_lattice

Lattice* ComputePmeMgr::saved_lattice

Definition at line 473 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

◆ saved_sequence

int ComputePmeMgr::saved_sequence

◆ sendDataHelper_errors

int ComputePmeMgr::sendDataHelper_errors

Definition at line 399 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

◆ sendDataHelper_lattice

Lattice* ComputePmeMgr::sendDataHelper_lattice

◆ sendDataHelper_sequence

int ComputePmeMgr::sendDataHelper_sequence

◆ sendDataHelper_sourcepe

int ComputePmeMgr::sendDataHelper_sourcepe

◆ this_pe

int ComputePmeMgr::this_pe

Definition at line 461 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: