NAMD
Classes | Public Member Functions | Public Attributes | Static Public Attributes | Friends | List of all members
ComputePmeMgr Class Reference
Inheritance diagram for ComputePmeMgr:
ComputePmeUtil

Classes

struct  cuda_submit_charges_args
 

Public Member Functions

 ComputePmeMgr ()
 
 ~ComputePmeMgr ()
 
void initialize (CkQdMsg *)
 
void initialize_pencils (CkQdMsg *)
 
void activate_pencils (CkQdMsg *)
 
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
 
void initialize_computes ()
 
void sendData (Lattice &, int sequence)
 
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
 
void sendPencils (Lattice &, int sequence)
 
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
 
void recvGrid (PmeGridMsg *)
 
void gridCalc1 (void)
 
void sendTransBarrier (void)
 
void sendTransSubset (int first, int last)
 
void sendTrans (void)
 
void fwdSharedTrans (PmeTransMsg *)
 
void recvSharedTrans (PmeSharedTransMsg *)
 
void sendDataHelper (int)
 
void sendPencilsHelper (int)
 
void recvTrans (PmeTransMsg *)
 
void procTrans (PmeTransMsg *)
 
void gridCalc2 (void)
 
void gridCalc2R (void)
 
void fwdSharedUntrans (PmeUntransMsg *)
 
void recvSharedUntrans (PmeSharedUntransMsg *)
 
void sendUntrans (void)
 
void sendUntransSubset (int first, int last)
 
void recvUntrans (PmeUntransMsg *)
 
void procUntrans (PmeUntransMsg *)
 
void gridCalc3 (void)
 
void sendUngrid (void)
 
void sendUngridSubset (int first, int last)
 
void recvUngrid (PmeGridMsg *)
 
void recvAck (PmeAckMsg *)
 
void copyResults (PmeGridMsg *)
 
void copyPencils (PmeGridMsg *)
 
void ungridCalc (void)
 
void recvRecipEvir (PmeEvirMsg *)
 
void addRecipEvirClient (void)
 
void submitReductions ()
 
void chargeGridSubmitted (Lattice &lattice, int sequence)
 
void cuda_submit_charges (Lattice &lattice, int sequence)
 
void sendChargeGridReady ()
 
void pollChargeGridReady ()
 
void pollForcesReady ()
 
void recvChargeGridReady ()
 
void chargeGridReady (Lattice &lattice, int sequence)
 
- Public Member Functions inherited from ComputePmeUtil
 ComputePmeUtil ()
 
 ~ComputePmeUtil ()
 

Public Attributes

Lattice * sendDataHelper_lattice
 
int sendDataHelper_sequence
 
int sendDataHelper_sourcepe
 
int sendDataHelper_errors
 
CmiNodeLock pmemgr_lock
 
float * a_data_host
 
float * a_data_dev
 
float * f_data_host
 
float * f_data_dev
 
int cuda_atoms_count
 
int cuda_atoms_alloc
 
cudaEvent_t end_charges
 
cudaEvent_t * end_forces
 
int forces_count
 
int forces_done_count
 
double charges_time
 
double forces_time
 
int check_charges_count
 
int check_forces_count
 
int master_pe
 
int this_pe
 
int chargeGridSubmittedCount
 
Lattice * saved_lattice
 
int saved_sequence
 
ResizeArray< ComputePme * > pmeComputes
 

Static Public Attributes

static CmiNodeLock fftw_plan_lock
 
static CmiNodeLock cuda_lock
 
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
 
static bool cuda_busy
 
- Static Public Attributes inherited from ComputePmeUtil
static int numGrids
 
static Bool alchOn
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchDecouple
 
static BigReal alchElecLambdaStart
 
static Bool lesOn
 
static int lesFactor
 
static Bool pairOn
 
static Bool selfOn
 

Friends

class ComputePme
 
class NodePmeMgr
 

Additional Inherited Members

- Static Public Member Functions inherited from ComputePmeUtil
static void select (void)
 

Detailed Description

Definition at line 361 of file ComputePme.C.

Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr ( )

Definition at line 716 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

716  : pmeProxy(thisgroup),
717  pmeProxyDir(thisgroup) {
718 
719  CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
720  pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
721  nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
722 
723  pmeNodeProxy.ckLocalBranch()->initialize();
724 
725  if ( CmiMyRank() == 0 ) {
726  fftw_plan_lock = CmiCreateLock();
727  }
728  pmemgr_lock = CmiCreateLock();
729 
730  myKSpace = 0;
731  kgrid = 0;
732  work = 0;
733  grid_count = 0;
734  trans_count = 0;
735  untrans_count = 0;
736  ungrid_count = 0;
737  gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
738  useBarrier = 0;
739  sendTransBarrier_received = 0;
740  usePencils = 0;
741 
742 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
743  // offload has not been set so this happens on every run
744  if ( CmiMyRank() == 0 ) {
745  cuda_lock = CmiCreateLock();
746  }
747 
748 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
749  int leastPriority, greatestPriority;
750  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
751  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
752  //if ( CkMyNode() == 0 ) {
753  // CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
754  //}
755 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
756 #else
757 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
758 #endif
759 
760  stream = 0;
761  for ( int i=0; i<NUM_STREAMS; ++i ) {
762 #if 1
763  CUDA_STREAM_CREATE(&streams[i]);
764  cuda_errcheck("cudaStreamCreate");
765 #else
766  streams[i] = 0; // XXXX Testing!!!
767 #endif
768  }
769 
770  this_pe = CkMyPe();
771 
772  cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
773  end_forces = 0;
775  check_forces_count = 0;
777 
778  cuda_atoms_count = 0;
779  cuda_atoms_alloc = 0;
780 
781  f_data_mgr_alloc = 0;
782  f_data_mgr_host = 0;
783  f_data_mgr_dev = 0;
784  afn_host = 0;
785  afn_dev = 0;
786 
787 #define CUDA_EVENT_ID_PME_CHARGES 80
788 #define CUDA_EVENT_ID_PME_FORCES 81
789 #define CUDA_EVENT_ID_PME_TICK 82
790 #define CUDA_EVENT_ID_PME_COPY 83
791 #define CUDA_EVENT_ID_PME_KERNEL 84
792  if ( 0 == CkMyPe() ) {
793  traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
794  traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
795  traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
796  traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
797  traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
798  }
799 #endif
800  recipEvirCount = 0;
801  recipEvirClients = 0;
802  recipEvirPe = -999;
803 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:420
cudaEvent_t end_charges
Definition: ComputePme.C:432
#define CUDA_STREAM_CREATE(X)
#define CUDA_EVENT_ID_PME_COPY
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:421
int check_charges_count
Definition: ComputePme.C:438
int cuda_atoms_alloc
Definition: ComputePme.C:429
#define CUDA_EVENT_ID_PME_FORCES
#define CUDA_EVENT_ID_PME_TICK
int chargeGridSubmittedCount
Definition: ComputePme.C:450
int cuda_atoms_count
Definition: ComputePme.C:428
void cuda_errcheck(const char *msg)
cudaEvent_t * end_forces
Definition: ComputePme.C:433
#define CUDA_EVENT_ID_PME_KERNEL
#define CUDA_EVENT_ID_PME_CHARGES
#define NUM_STREAMS
Definition: ComputePme.C:520
int check_forces_count
Definition: ComputePme.C:439
static CmiNodeLock cuda_lock
Definition: ComputePme.C:430
ComputePmeMgr::~ComputePmeMgr ( )

Definition at line 1800 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

1800  {
1801 
1802  if ( CmiMyRank() == 0 ) {
1803  CmiDestroyLock(fftw_plan_lock);
1804  }
1805  CmiDestroyLock(pmemgr_lock);
1806 
1807  delete myKSpace;
1808  delete [] localInfo;
1809  delete [] gridNodeInfo;
1810  delete [] transNodeInfo;
1811  delete [] gridPeMap;
1812  delete [] transPeMap;
1813  delete [] recipPeDest;
1814  delete [] gridPeOrder;
1815  delete [] gridNodeOrder;
1816  delete [] transNodeOrder;
1817  delete [] qgrid;
1818  if ( kgrid != qgrid ) delete [] kgrid;
1819  delete [] work;
1820  delete [] gridmsg_reuse;
1821 
1822  if ( ! offload ) {
1823  for (int i=0; i<q_count; ++i) {
1824  delete [] q_list[i];
1825  }
1826  delete [] q_list;
1827  delete [] fz_arr;
1828  }
1829  delete [] f_arr;
1830  delete [] q_arr;
1831 }
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:420
CmiNodeLock pmemgr_lock
Definition: ComputePme.C:421

Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *  msg)

Definition at line 1794 of file ComputePme.C.

1794  {
1795  if ( ! usePencils ) return;
1796  if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
1797 }
void ComputePmeMgr::addRecipEvirClient ( void  )

Definition at line 3025 of file ComputePme.C.

3025  {
3026  ++recipEvirClients;
3027 }
void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3554 of file ComputePme.C.

References PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< T >::size().

Referenced by ComputePme::doWork(), and recvChargeGridReady().

3554  {
3555 
3556 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3557  if ( offload ) {
3558  int errcount = 0;
3559  int q_stride = myGrid.K3+myGrid.order-1;
3560  for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
3561  f_arr[j] = ffz_host[j];
3562  if ( ffz_host[j] & ~1 ) ++errcount;
3563  }
3564  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
3565  }
3566 #endif
3567  recipEvirCount = recipEvirClients;
3568  ungridForcesCount = pmeComputes.size();
3569 
3570  for (int j=0; j<myGrid.order-1; ++j) {
3571  fz_arr[j] |= fz_arr[myGrid.K3+j];
3572  }
3573 
3574  if ( usePencils ) {
3575  sendPencils(lattice,sequence);
3576  } else {
3577  sendData(lattice,sequence);
3578  }
3579 }
void sendPencils(Lattice &, int sequence)
Definition: ComputePme.C:3737
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:129
void sendData(Lattice &, int sequence)
Definition: ComputePme.C:3964
int K3
Definition: PmeBase.h:18
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:460
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3490 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and SimParameters::useCUDA2.

Referenced by cuda_submit_charges().

3490  {
3491  saved_lattice = &lattice;
3492  saved_sequence = sequence;
3493 
3494  // cudaDeviceSynchronize(); // XXXX TESTING
3495  //int q_stride = myGrid.K3+myGrid.order-1;
3496  //for (int n=fsize+q_stride, j=0; j<n; ++j) {
3497  // if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
3498  // CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
3499  // }
3500  //}
3501  //CmiLock(cuda_lock);
3502 
3503  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
3504  double before = CmiWallTimer();
3505  cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0); // when all streams complete
3506  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
3507  cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
3508  cudaMemcpyDeviceToHost, streams[stream]);
3509  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
3510  cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
3511  cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]); // for next time
3512  cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
3513  //CmiUnlock(cuda_lock);
3514  // cudaDeviceSynchronize(); // XXXX TESTING
3515  // cuda_errcheck("after memcpy grid to host");
3516 
3518  if ( ! simParams->useCUDA2 ) {
3519  CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
3520  cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
3521  }
3522 
3523  pmeProxy[master_pe].pollChargeGridReady();
3524  }
3525 }
static Node * Object()
Definition: Node.h:86
Lattice * saved_lattice
Definition: ComputePme.C:453
cudaEvent_t end_charges
Definition: ComputePme.C:432
static __thread ComputeMgr * computeMgr
SimParameters * simParameters
Definition: Node.h:178
#define CUDA_EVENT_ID_PME_COPY
int getMasterPe()
Definition: DeviceCUDA.h:105
int chargeGridSubmittedCount
Definition: ComputePme.C:450
#define simParams
Definition: Output.C:127
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
int saved_sequence
Definition: ComputePme.C:454
void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg)

Definition at line 3800 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3800  {
3801 
3802  int K1 = myGrid.K1;
3803  int K2 = myGrid.K2;
3804  int dim2 = myGrid.dim2;
3805  int dim3 = myGrid.dim3;
3806  int block1 = myGrid.block1;
3807  int block2 = myGrid.block2;
3808 
3809  // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
3810  int ib = msg->sourceNode / yBlocks;
3811  int jb = msg->sourceNode % yBlocks;
3812 
3813  int ibegin = ib*block1;
3814  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3815  int jbegin = jb*block2;
3816  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3817 
3818  int zlistlen = msg->zlistlen;
3819  int *zlist = msg->zlist;
3820  float *qmsg = msg->qgrid;
3821  int g;
3822  for ( g=0; g<numGrids; ++g ) {
3823  char *f = f_arr + g*fsize;
3824  float **q = q_arr + g*fsize;
3825  for ( int i=ibegin; i<iend; ++i ) {
3826  for ( int j=jbegin; j<jend; ++j ) {
3827  if( f[i*dim2+j] ) {
3828  f[i*dim2+j] = 0;
3829  for ( int k=0; k<zlistlen; ++k ) {
3830  q[i*dim2+j][zlist[k]] = *(qmsg++);
3831  }
3832  for (int h=0; h<myGrid.order-1; ++h) {
3833  q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
3834  }
3835  }
3836  }
3837  }
3838  }
3839 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
int block2
Definition: PmeBase.h:21
int sourceNode
Definition: ComputePme.C:121
int order
Definition: PmeBase.h:20
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int K3
Definition: PmeBase.h:18
int zlistlen
Definition: ComputePme.C:127
void ComputePmeMgr::copyResults ( PmeGridMsg *  msg)

Definition at line 3992 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, ComputePmeUtil::numGrids, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

3992  {
3993 
3994  int zdim = myGrid.dim3;
3995  int flen = msg->len;
3996  int fstart = msg->start;
3997  int zlistlen = msg->zlistlen;
3998  int *zlist = msg->zlist;
3999  float *qmsg = msg->qgrid;
4000  int g;
4001  for ( g=0; g<numGrids; ++g ) {
4002  char *f = msg->fgrid + g*flen;
4003  float **q = q_arr + fstart + g*fsize;
4004  for ( int i=0; i<flen; ++i ) {
4005  if ( f[i] ) {
4006  f[i] = 0;
4007  for ( int k=0; k<zlistlen; ++k ) {
4008  q[i][zlist[k]] = *(qmsg++);
4009  }
4010  for (int h=0; h<myGrid.order-1; ++h) {
4011  q[i][myGrid.K3+h] = q[i][h];
4012  }
4013  }
4014  }
4015  }
4016 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
int order
Definition: PmeBase.h:20
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int K3
Definition: PmeBase.h:18
int zlistlen
Definition: ComputePme.C:127
char * fgrid
Definition: ComputePme.C:129
void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3435 of file ComputePme.C.

References a_data_dev, a_data_host, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

Referenced by ComputePme::doWork().

3435  {
3436 
3437  int n = cuda_atoms_count;
3438  //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
3439  cuda_atoms_count = 0;
3440 
3441  const double before = CmiWallTimer();
3442  cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
3443  cudaMemcpyHostToDevice, streams[stream]);
3444  const double after = CmiWallTimer();
3445 
3446  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
3447 
3448  cuda_pme_charges(
3449  bspline_coeffs_dev,
3450  q_arr_dev, ffz_dev, ffz_dev + fsize,
3451  a_data_dev, n,
3452  myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
3453  streams[stream]);
3454  const double after2 = CmiWallTimer();
3455 
3456  chargeGridSubmitted(lattice,sequence); // must be inside lock
3457 
3458  masterPmeMgr->charges_time = before;
3459  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
3460  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
3461 }
float * a_data_dev
Definition: ComputePme.C:425
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
#define CUDA_EVENT_ID_PME_COPY
int order
Definition: PmeBase.h:20
int K3
Definition: PmeBase.h:18
int cuda_atoms_count
Definition: ComputePme.C:428
#define CUDA_EVENT_ID_PME_KERNEL
double charges_time
Definition: ComputePme.C:436
void chargeGridSubmitted(Lattice &lattice, int sequence)
Definition: ComputePme.C:3490
float * a_data_host
Definition: ComputePme.C:424
void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg)

Definition at line 2020 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by sendTransSubset().

2020  {
2021  // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
2022  int pe = transNodeInfo[myTransNode].pe_start;
2023  int npe = transNodeInfo[myTransNode].npe;
2024  CmiNodeLock lock = CmiCreateLock();
2025  int *count = new int; *count = npe;
2026  for (int i=0; i<npe; ++i, ++pe) {
2029  shmsg->msg = msg;
2030  shmsg->count = count;
2031  shmsg->lock = lock;
2032  pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
2033  }
2034 }
#define PRIORITY_SIZE
Definition: Priorities.h:13
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
void recvSharedTrans(PmeSharedTransMsg *)
Definition: ComputePme.C:2036
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg)

Definition at line 2276 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by sendUntransSubset().

2276  {
2277  int pe = gridNodeInfo[myGridNode].pe_start;
2278  int npe = gridNodeInfo[myGridNode].npe;
2279  CmiNodeLock lock = CmiCreateLock();
2280  int *count = new int; *count = npe;
2281  for (int i=0; i<npe; ++i, ++pe) {
2283  shmsg->msg = msg;
2284  shmsg->count = count;
2285  shmsg->lock = lock;
2286  pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
2287  }
2288 }
CmiNodeLock lock
Definition: ComputePme.C:168
PmeUntransMsg * msg
Definition: ComputePme.C:166
void ComputePmeMgr::gridCalc1 ( void  )

Definition at line 1912 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

1912  {
1913  // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
1914 
1915 #ifdef NAMD_FFTW
1916  for ( int g=0; g<numGrids; ++g ) {
1917 #ifdef NAMD_FFTW_3
1918  fftwf_execute(forward_plan_yz[g]);
1919 #else
1920  rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
1921  qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
1922 #endif
1923 
1924  }
1925 #endif
1926 
1927  if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
1928 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
void ComputePmeMgr::gridCalc2 ( void  )

Definition at line 2088 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

2088  {
2089  // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
2090 
2091 #if CMK_BLUEGENEL
2092  CmiNetworkProgressAfter (0);
2093 #endif
2094 
2095  int zdim = myGrid.dim3;
2096  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2097  int ny = localInfo[myTransPe].ny_after_transpose;
2098 
2099  for ( int g=0; g<numGrids; ++g ) {
2100  // finish forward FFT (x dimension)
2101 #ifdef NAMD_FFTW
2102 #ifdef NAMD_FFTW_3
2103  fftwf_execute(forward_plan_x[g]);
2104 #else
2105  fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2106  ny * zdim / 2, 1, work, 1, 0);
2107 #endif
2108 #endif
2109  }
2110 
2111 #ifdef OPENATOM_VERSION
2112  if ( ! simParams -> openatomOn ) {
2113 #endif // OPENATOM_VERSION
2114  gridCalc2R();
2115 #ifdef OPENATOM_VERSION
2116  } else {
2117  gridCalc2Moa();
2118  }
2119 #endif // OPENATOM_VERSION
2120 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
int ny_after_transpose
Definition: ComputePme.C:239
#define simParams
Definition: Output.C:127
void gridCalc2R(void)
Definition: ComputePme.C:2148
void ComputePmeMgr::gridCalc2R ( void  )

Definition at line 2148 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

2148  {
2149 
2150  int useCkLoop = 0;
2151 #if CMK_SMP && USE_CKLOOP
2152  if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
2153  && CkNumPes() >= 2 * numTransPes ) {
2154  useCkLoop = 1;
2155  }
2156 #endif
2157 
2158  int zdim = myGrid.dim3;
2159  // int y_start = localInfo[myTransPe].y_start_after_transpose;
2160  int ny = localInfo[myTransPe].ny_after_transpose;
2161 
2162  for ( int g=0; g<numGrids; ++g ) {
2163  // reciprocal space portion of PME
2165  recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
2166  lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
2167  // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
2168 
2169  // start backward FFT (x dimension)
2170 
2171 #ifdef NAMD_FFTW
2172 #ifdef NAMD_FFTW_3
2173  fftwf_execute(backward_plan_x[g]);
2174 #else
2175  fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
2176  ny * zdim / 2, 1, work, 1, 0);
2177 #endif
2178 #endif
2179  }
2180 
2181  pmeProxyDir[CkMyPe()].sendUntrans();
2182 }
static Node * Object()
Definition: Node.h:86
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
double compute_energy(float q_arr[], const Lattice &lattice, double ewald, double virial[], int useCkLoop)
Definition: PmeKSpace.C:321
#define CKLOOP_CTRL_PME_KSPACE
Definition: SimParameters.h:96
int ny_after_transpose
Definition: ComputePme.C:239
double BigReal
Definition: common.h:114
void ComputePmeMgr::gridCalc3 ( void  )

Definition at line 2350 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

2350  {
2351  // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
2352 
2353  // finish backward FFT
2354 #ifdef NAMD_FFTW
2355  for ( int g=0; g<numGrids; ++g ) {
2356 #ifdef NAMD_FFTW_3
2357  fftwf_execute(backward_plan_yz[g]);
2358 #else
2359  rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
2360  (fftw_complex *) (qgrid + qgrid_size * g),
2361  1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
2362 #endif
2363  }
2364 
2365 #endif
2366 
2367  pmeProxyDir[CkMyPe()].sendUngrid();
2368 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
void ComputePmeMgr::initialize ( CkQdMsg *  msg)

Definition at line 868 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), SimParameters::cutoff, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, SimParameters::FFTWEstimate, SimParameters::FFTWPatient, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, SimParameters::lattice, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, SimParameters::patchDimension, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, SimParameters::PMEBarrier, SimParameters::PMEGridSizeX, SimParameters::PMEGridSizeY, SimParameters::PMEGridSizeZ, SimParameters::PMEInterpOrder, SimParameters::PMEMinPoints, SimParameters::PMEMinSlices, PmePencilInitMsgData::pmeNodeProxy, SimParameters::PMEOffload, SimParameters::PMEPencils, SimParameters::PMEPencilsX, SimParameters::PMEPencilsXLayout, SimParameters::PMEPencilsY, SimParameters::PMEPencilsYLayout, SimParameters::PMEPencilsZ, SimParameters::PMEProcessors, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Type >::sort(), sort, WorkDistrib::sortPmePes(), Vector::unit(), x, LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, y, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, z, PmePencilInitMsgData::zBlocks, 
PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

// ComputePmeMgr::initialize: startup-stage entry (driven by a Charm++
// quiescence message). Allocates per-PE/per-node bookkeeping, chooses
// between pencil and slab PME decomposition, builds the PE maps, and —
// for slab mode — creates the FFTW plans and charge-grid buffers.
// For pencil mode it creates the pencil chare arrays and returns early;
// setup then continues in initialize_pencils() at the next startup stage.
868  {
869  delete msg;
870 
     // bookkeeping tables sized for the whole machine (PEs / nodes)
871  localInfo = new LocalPmeInfo[CkNumPes()];
872  gridNodeInfo = new NodePmeInfo[CkNumNodes()];
873  transNodeInfo = new NodePmeInfo[CkNumNodes()];
874  gridPeMap = new int[CkNumPes()];
875  transPeMap = new int[CkNumPes()];
876  recipPeDest = new int[CkNumPes()];
877  gridPeOrder = new int[CkNumPes()];
878  gridNodeOrder = new int[CkNumNodes()];
879  transNodeOrder = new int[CkNumNodes()];
880 
     // node-shared flag array; only rank 0 of each SMP node allocates it
881  if (CkMyRank() == 0) {
882  pencilPMEProcessors = new char [CkNumPes()];
883  memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
884  }
885 
887  PatchMap *patchMap = PatchMap::Object();
888 
889  offload = simParams->PMEOffload;
890 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
     // PME offload preconditions: one device per process and SM >= 2.0
891  if ( offload && ! deviceCUDA->one_device_per_node() ) {
892  NAMD_die("PME offload requires exactly one CUDA device per process. Use \"PMEOffload no\".");
893  }
894  if ( offload ) {
895  int dev;
896  cudaGetDevice(&dev);
897  cuda_errcheck("in cudaGetDevice");
898  if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
899  cudaDeviceProp deviceProp;
900  cudaGetDeviceProperties(&deviceProp, dev);
901  cuda_errcheck("in cudaGetDeviceProperties");
902  if ( deviceProp.major < 2 )
903  NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher. Use \"PMEOffload no\".");
904  }
905 #endif
906 
907  alchLambda = -1.; // illegal value to catch if not updated
908  alchLambda2 = -1.;
909  useBarrier = simParams->PMEBarrier;
910 
     // pencil-vs-slab decision: explicit PMEPencils setting wins; otherwise
     // compare achievable pencil count against achievable slab count
911  if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
912  else if ( simParams->PMEPencils > 0 ) usePencils = 1;
913  else {
914  int nrps = simParams->PMEProcessors;
915  if ( nrps <= 0 ) nrps = CkNumPes();
916  if ( nrps > CkNumPes() ) nrps = CkNumPes();
917  int dimx = simParams->PMEGridSizeX;
918  int dimy = simParams->PMEGridSizeY;
919  int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
920  if ( maxslabs > nrps ) maxslabs = nrps;
921  int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
922  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
923  if ( maxpencils > nrps ) maxpencils = nrps;
924  if ( maxpencils > 3 * maxslabs ) usePencils = 1;
925  else usePencils = 0;
926  }
927 
928  if ( usePencils ) {
     // choose pencil grid dimensions xBlocks x yBlocks x zBlocks
929  int nrps = simParams->PMEProcessors;
930  if ( nrps <= 0 ) nrps = CkNumPes();
931  if ( nrps > CkNumPes() ) nrps = CkNumPes();
932  if ( simParams->PMEPencils > 1 &&
933  simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
934  xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
935  } else {
936  int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
937  * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
938  if ( nb2 > nrps ) nb2 = nrps;
939  if ( nb2 < 1 ) nb2 = 1;
940  int nb = (int) sqrt((float)nb2);
941  if ( nb < 1 ) nb = 1;
942  xBlocks = zBlocks = nb;
943  yBlocks = nb2 / nb;
944  }
945 
     // per-axis user overrides
946  if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
947  if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
948  if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
949 
     // round block size up (ceiling division), then recompute the block
     // count actually needed so no pencil is empty
950  int dimx = simParams->PMEGridSizeX;
951  int bx = 1 + ( dimx - 1 ) / xBlocks;
952  xBlocks = 1 + ( dimx - 1 ) / bx;
953 
954  int dimy = simParams->PMEGridSizeY;
955  int by = 1 + ( dimy - 1 ) / yBlocks;
956  yBlocks = 1 + ( dimy - 1 ) / by;
957 
958  int dimz = simParams->PMEGridSizeZ / 2 + 1; // complex
959  int bz = 1 + ( dimz - 1 ) / zBlocks;
960  zBlocks = 1 + ( dimz - 1 ) / bz;
961 
     // each pencil plane must fit in the available PEs
962  if ( xBlocks * yBlocks > CkNumPes() ) {
963  NAMD_die("PME pencils xBlocks * yBlocks > numPes");
964  }
965  if ( xBlocks * zBlocks > CkNumPes() ) {
966  NAMD_die("PME pencils xBlocks * zBlocks > numPes");
967  }
968  if ( yBlocks * zBlocks > CkNumPes() ) {
969  NAMD_die("PME pencils yBlocks * zBlocks > numPes");
970  }
971 
972  if ( ! CkMyPe() ) {
973  iout << iINFO << "PME using " << xBlocks << " x " <<
974  yBlocks << " x " << zBlocks <<
975  " pencil grid for FFT and reciprocal sum.\n" << endi;
976  }
977  } else { // usePencils
978 
979  { // decide how many pes to use for reciprocal sum
980 
981  // rules based on work available
982  int minslices = simParams->PMEMinSlices;
983  int dimx = simParams->PMEGridSizeX;
984  int nrpx = ( dimx + minslices - 1 ) / minslices;
985  int dimy = simParams->PMEGridSizeY;
986  int nrpy = ( dimy + minslices - 1 ) / minslices;
987 
988  // rules based on processors available
989  int nrpp = CkNumPes();
990  // if ( nrpp > 32 ) nrpp = 32; // cap to limit messages
991  if ( nrpp < nrpx ) nrpx = nrpp;
992  if ( nrpp < nrpy ) nrpy = nrpp;
993 
994  // user override
995  int nrps = simParams->PMEProcessors;
996  if ( nrps > CkNumPes() ) nrps = CkNumPes();
997  if ( nrps > 0 ) nrpx = nrps;
998  if ( nrps > 0 ) nrpy = nrps;
999 
1000  // make sure there aren't any totally empty processors
1001  int bx = ( dimx + nrpx - 1 ) / nrpx;
1002  nrpx = ( dimx + bx - 1 ) / bx;
1003  int by = ( dimy + nrpy - 1 ) / nrpy;
1004  nrpy = ( dimy + by - 1 ) / by;
1005  if ( bx != ( dimx + nrpx - 1 ) / nrpx )
1006  NAMD_bug("Error in selecting number of PME processors.");
1007  if ( by != ( dimy + nrpy - 1 ) / nrpy )
1008  NAMD_bug("Error in selecting number of PME processors.");
1009 
1010  numGridPes = nrpx;
1011  numTransPes = nrpy;
1012  }
1013  if ( ! CkMyPe() ) {
1014  iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
1015  " processors for FFT and reciprocal sum.\n" << endi;
1016  }
1017 
1018  int sum_npes = numTransPes + numGridPes;
1019  int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
1020 
1021 #if 0 // USE_TOPOMAP
1022  /* This code is being disabled permanently for slab PME on Blue Gene machines */
1023  PatchMap * pmap = PatchMap::Object();
1024 
1025  int patch_pes = pmap->numNodesWithPatches();
1026  TopoManager tmgr;
1027  if(tmgr.hasMultipleProcsPerNode())
1028  patch_pes *= 2;
1029 
1030  bool done = false;
1031  if(CkNumPes() > 2*sum_npes + patch_pes) {
1032  done = generateBGLORBPmePeList(transPeMap, numTransPes);
1033  done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);
1034  }
1035  else
1036  if(CkNumPes() > 2 *max_npes + patch_pes) {
1037  done = generateBGLORBPmePeList(transPeMap, max_npes);
1038  gridPeMap = transPeMap;
1039  }
1040 
1041  if (!done)
1042 #endif
1043  {
1044  //generatePmePeList(transPeMap, max_npes);
1045  //gridPeMap = transPeMap;
1046  generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
1047  }
1048 
     // report first ten grid/trans PE assignments on PE 0
1049  if ( ! CkMyPe() ) {
1050  iout << iINFO << "PME GRID LOCATIONS:";
1051  int i;
1052  for ( i=0; i<numGridPes && i<10; ++i ) {
1053  iout << " " << gridPeMap[i];
1054  }
1055  if ( i < numGridPes ) iout << " ...";
1056  iout << "\n" << endi;
1057  iout << iINFO << "PME TRANS LOCATIONS:";
1058  for ( i=0; i<numTransPes && i<10; ++i ) {
1059  iout << " " << transPeMap[i];
1060  }
1061  if ( i < numTransPes ) iout << " ...";
1062  iout << "\n" << endi;
1063  }
1064 
1065  // sort based on nodes and physical nodes
1066  std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
1067 
     // derive this PE's grid/trans indices and per-node grouping
     // (gridNodeInfo/transNodeInfo record pe_start/npe per real node)
1068  myGridPe = -1;
1069  myGridNode = -1;
1070  int i = 0;
1071  int node = -1;
1072  int real_node = -1;
1073  for ( i=0; i<numGridPes; ++i ) {
1074  if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
1075  if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
1076  int real_node_i = CkNodeOf(gridPeMap[i]);
1077  if ( real_node_i == real_node ) {
1078  gridNodeInfo[node].npe += 1;
1079  } else {
1080  real_node = real_node_i;
1081  ++node;
1082  gridNodeInfo[node].real_node = real_node;
1083  gridNodeInfo[node].pe_start = i;
1084  gridNodeInfo[node].npe = 1;
1085  }
1086  if ( CkMyNode() == real_node_i ) myGridNode = node;
1087  }
1088  numGridNodes = node + 1;
1089  myTransPe = -1;
1090  myTransNode = -1;
1091  node = -1;
1092  real_node = -1;
1093  for ( i=0; i<numTransPes; ++i ) {
1094  if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
1095  if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
1096  int real_node_i = CkNodeOf(transPeMap[i]);
1097  if ( real_node_i == real_node ) {
1098  transNodeInfo[node].npe += 1;
1099  } else {
1100  real_node = real_node_i;
1101  ++node;
1102  transNodeInfo[node].real_node = real_node;
1103  transNodeInfo[node].pe_start = i;
1104  transNodeInfo[node].npe = 1;
1105  }
1106  if ( CkMyNode() == real_node_i ) myTransNode = node;
1107  }
1108  numTransNodes = node + 1;
1109 
1110  if ( ! CkMyPe() ) {
1111  iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
1112  << numTransNodes << " TRANS NODES\n" << endi;
1113  }
1114 
1115  { // generate random orderings for grid and trans messages
1116  int i;
1117  for ( i = 0; i < numGridPes; ++i ) {
1118  gridPeOrder[i] = i;
1119  }
1120  Random rand(CkMyPe());
1121  if ( myGridPe < 0 ) {
1122  rand.reorder(gridPeOrder,numGridPes);
1123  } else { // self last
1124  gridPeOrder[myGridPe] = numGridPes-1;
1125  gridPeOrder[numGridPes-1] = myGridPe;
1126  rand.reorder(gridPeOrder,numGridPes-1);
1127  }
1128  for ( i = 0; i < numGridNodes; ++i ) {
1129  gridNodeOrder[i] = i;
1130  }
1131  if ( myGridNode < 0 ) {
1132  rand.reorder(gridNodeOrder,numGridNodes);
1133  } else { // self last
1134  gridNodeOrder[myGridNode] = numGridNodes-1;
1135  gridNodeOrder[numGridNodes-1] = myGridNode;
1136  rand.reorder(gridNodeOrder,numGridNodes-1);
1137  }
1138  for ( i = 0; i < numTransNodes; ++i ) {
1139  transNodeOrder[i] = i;
1140  }
1141  if ( myTransNode < 0 ) {
1142  rand.reorder(transNodeOrder,numTransNodes);
1143  } else { // self last
1144  transNodeOrder[myTransNode] = numTransNodes-1;
1145  transNodeOrder[numTransNodes-1] = myTransNode;
1146  rand.reorder(transNodeOrder,numTransNodes-1);
1147  }
1148  }
1149 
1150  } // ! usePencils
1151 
     // common grid geometry; dim3 is padded for in-place real-to-complex FFT
1152  myGrid.K1 = simParams->PMEGridSizeX;
1153  myGrid.K2 = simParams->PMEGridSizeY;
1154  myGrid.K3 = simParams->PMEGridSizeZ;
1155  myGrid.order = simParams->PMEInterpOrder;
1156  myGrid.dim2 = myGrid.K2;
1157  myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
1158 
1159  if ( ! usePencils ) {
1160  myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
1161  myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
1162  myGrid.block3 = myGrid.dim3 / 2; // complex
1163  }
1164 
1165  if ( usePencils ) {
1166  myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
1167  myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
1168  myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks; // complex
1169 
1170 
1171  int pe = 0;
1172  int x,y,z;
1173 
1174  SortableResizeArray<int> zprocs(xBlocks*yBlocks);
1175  SortableResizeArray<int> yprocs(xBlocks*zBlocks);
1176  SortableResizeArray<int> xprocs(yBlocks*zBlocks);
1177 
1178  // decide which pes to use by bit reversal and patch use
1179  int i;
1180  int ncpus = CkNumPes();
1181  SortableResizeArray<int> patches, nopatches, pmeprocs;
1182  PatchMap *pmap = PatchMap::Object();
1183  for ( int icpu=0; icpu<ncpus; ++icpu ) {
1184  int ri = WorkDistrib::peDiffuseOrdering[icpu];
1185  if ( ri ) { // keep 0 for special case
1186  // pretend pe 1 has patches to avoid placing extra PME load on node
1187  if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
1188  else nopatches.add(ri);
1189  }
1190  }
1191 
1192 #if USE_RANDOM_TOPO
     // randomize both candidate lists via temporary plain arrays
1193  Random rand(CkMyPe());
1194  int *tmp = new int[patches.size()];
1195  int nn = patches.size();
1196  for (i=0;i<nn;i++) tmp[i] = patches[i];
1197  rand.reorder(tmp, nn);
1198  patches.resize(0);
1199  for (i=0;i<nn;i++) patches.add(tmp[i]);
1200  delete [] tmp;
1201  tmp = new int[nopatches.size()];
1202  nn = nopatches.size();
1203  for (i=0;i<nn;i++) tmp[i] = nopatches[i];
1204  rand.reorder(tmp, nn);
1205  nopatches.resize(0);
1206  for (i=0;i<nn;i++) nopatches.add(tmp[i]);
1207  delete [] tmp;
1208 #endif
1209 
1210  // only use zero if it eliminates overloading or has patches
1211  int useZero = 0;
1212  int npens = xBlocks*yBlocks;
1213  if ( npens % ncpus == 0 ) useZero = 1;
1214  if ( npens == nopatches.size() + 1 ) useZero = 1;
1215  npens += xBlocks*zBlocks;
1216  if ( npens % ncpus == 0 ) useZero = 1;
1217  if ( npens == nopatches.size() + 1 ) useZero = 1;
1218  npens += yBlocks*zBlocks;
1219  if ( npens % ncpus == 0 ) useZero = 1;
1220  if ( npens == nopatches.size() + 1 ) useZero = 1;
1221 
1222  // add nopatches then patches in reversed order
1223  for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
1224  if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1225  for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
1226  if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
1227 
     // assign pencils round-robin from the ordered candidate list
1228  int npes = pmeprocs.size();
1229  for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
1230  if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
1231 #if !USE_RANDOM_TOPO
1232  zprocs.sort();
1233 #endif
1234  for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
1235  if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
1236 #if !USE_RANDOM_TOPO
1237  yprocs.sort();
1238 #endif
1239  for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
1240  if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
1241 #if !USE_RANDOM_TOPO
1242  xprocs.sort();
1243 #endif
1244 
1245 #if USE_TOPO_SFC
     // reorder pencil PEs along a space-filling curve over the torus
1246  CmiLock(tmgr_lock);
1247  //{
1248  TopoManager tmgr;
1249  int xdim = tmgr.getDimNX();
1250  int ydim = tmgr.getDimNY();
1251  int zdim = tmgr.getDimNZ();
1252  int xdim1 = find_level_grid(xdim);
1253  int ydim1 = find_level_grid(ydim);
1254  int zdim1 = find_level_grid(zdim);
1255  if(CkMyPe() == 0)
1256  printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
1257 
1258  vector<Coord> result;
1259  SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
1260  sort_sfc(xprocs, tmgr, result);
1261  sort_sfc(yprocs, tmgr, result);
1262  sort_sfc(zprocs, tmgr, result);
1263  //}
1264  CmiUnlock(tmgr_lock);
1265 #endif
1266 
1267 
1268  if(CkMyPe() == 0){
1269  iout << iINFO << "PME Z PENCIL LOCATIONS:";
1270  for ( i=0; i<zprocs.size() && i<10; ++i ) {
1271 #if USE_TOPO_SFC
1272  int x,y,z,t;
1273  tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
1274  iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
1275 #else
1276  iout << " " << zprocs[i];
1277 #endif
1278  }
1279  if ( i < zprocs.size() ) iout << " ...";
1280  iout << "\n" << endi;
1281  }
1282 
1283  if (CkMyRank() == 0) {
1284  for (pe=0, x = 0; x < xBlocks; ++x)
1285  for (y = 0; y < yBlocks; ++y, ++pe ) {
1286  pencilPMEProcessors[zprocs[pe]] = 1;
1287  }
1288  }
1289 
1290  if(CkMyPe() == 0){
1291  iout << iINFO << "PME Y PENCIL LOCATIONS:";
1292  for ( i=0; i<yprocs.size() && i<10; ++i ) {
1293 #if USE_TOPO_SFC
1294  int x,y,z,t;
1295  tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
1296  iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
1297 #else
1298  iout << " " << yprocs[i];
1299 #endif
1300  }
1301  if ( i < yprocs.size() ) iout << " ...";
1302  iout << "\n" << endi;
1303  }
1304 
1305  if (CkMyRank() == 0) {
1306  for (pe=0, z = 0; z < zBlocks; ++z )
1307  for (x = 0; x < xBlocks; ++x, ++pe ) {
1308  pencilPMEProcessors[yprocs[pe]] = 1;
1309  }
1310  }
1311 
1312  if(CkMyPe() == 0){
1313  iout << iINFO << "PME X PENCIL LOCATIONS:";
1314  for ( i=0; i<xprocs.size() && i<10; ++i ) {
1315 #if USE_TOPO_SFC
1316  int x,y,z,t;
1317  tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
1318  iout << " " << xprocs[i] << "(" << x << " " << y << " " << z << ")";
1319 #else
1320  iout << " " << xprocs[i];
1321 #endif
1322  }
1323  if ( i < xprocs.size() ) iout << " ...";
1324  iout << "\n" << endi;
1325  }
1326 
1327  if (CkMyRank() == 0) {
1328  for (pe=0, y = 0; y < yBlocks; ++y )
1329  for (z = 0; z < zBlocks; ++z, ++pe ) {
1330  pencilPMEProcessors[xprocs[pe]] = 1;
1331  }
1332  }
1333 
1334 
1335  // creating the pencil arrays
1336  if ( CkMyPe() == 0 ){
1337 #if !USE_RANDOM_TOPO
1338  // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
1339  WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
1340  std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
1341  std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
1342 #endif
1343 #if 1
     // map-object-based chare array creation (the active branch);
     // layout choice for x/y pencils is controlled by simParams
1344  CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
1345  CProxy_PmePencilMap ym;
1346  if ( simParams->PMEPencilsYLayout )
1347  ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
1348  else
1349  ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
1350  CProxy_PmePencilMap xm;
1351  if ( simParams->PMEPencilsXLayout )
1352  xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
1353  else
1354  xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
1355  pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
1356  CkArrayOptions zo(xBlocks,yBlocks,1); zo.setMap(zm);
1357  CkArrayOptions yo(xBlocks,1,zBlocks); yo.setMap(ym);
1358  CkArrayOptions xo(1,yBlocks,zBlocks); xo.setMap(xm);
1359  zo.setAnytimeMigration(false); zo.setStaticInsertion(true);
1360  yo.setAnytimeMigration(false); yo.setStaticInsertion(true);
1361  xo.setAnytimeMigration(false); xo.setStaticInsertion(true);
1362  zPencil = CProxy_PmeZPencil::ckNew(zo); // (xBlocks,yBlocks,1);
1363  yPencil = CProxy_PmeYPencil::ckNew(yo); // (xBlocks,1,zBlocks);
1364  xPencil = CProxy_PmeXPencil::ckNew(xo); // (1,yBlocks,zBlocks);
1365 #else
     // disabled alternative: element-by-element dynamic insertion
1366  zPencil = CProxy_PmeZPencil::ckNew(); // (xBlocks,yBlocks,1);
1367  yPencil = CProxy_PmeYPencil::ckNew(); // (xBlocks,1,zBlocks);
1368  xPencil = CProxy_PmeXPencil::ckNew(); // (1,yBlocks,zBlocks);
1369 
1370  for (pe=0, x = 0; x < xBlocks; ++x)
1371  for (y = 0; y < yBlocks; ++y, ++pe ) {
1372  zPencil(x,y,0).insert(zprocs[pe]);
1373  }
1374  zPencil.doneInserting();
1375 
1376  for (pe=0, x = 0; x < xBlocks; ++x)
1377  for (z = 0; z < zBlocks; ++z, ++pe ) {
1378  yPencil(x,0,z).insert(yprocs[pe]);
1379  }
1380  yPencil.doneInserting();
1381 
1382 
1383  for (pe=0, y = 0; y < yBlocks; ++y )
1384  for (z = 0; z < zBlocks; ++z, ++pe ) {
1385  xPencil(0,y,z).insert(xprocs[pe]);
1386  }
1387  xPencil.doneInserting();
1388 #endif
1389 
     // broadcast proxies and initialization data to all pencils
1390  pmeProxy.recvArrays(xPencil,yPencil,zPencil);
1391  PmePencilInitMsgData msgdata;
1392  msgdata.grid = myGrid;
1393  msgdata.xBlocks = xBlocks;
1394  msgdata.yBlocks = yBlocks;
1395  msgdata.zBlocks = zBlocks;
1396  msgdata.xPencil = xPencil;
1397  msgdata.yPencil = yPencil;
1398  msgdata.zPencil = zPencil;
1399  msgdata.pmeProxy = pmeProxyDir;
1400  msgdata.pmeNodeProxy = pmeNodeProxy;
1401  msgdata.xm = xm;
1402  msgdata.ym = ym;
1403  msgdata.zm = zm;
1404  xPencil.init(new PmePencilInitMsg(msgdata));
1405  yPencil.init(new PmePencilInitMsg(msgdata));
1406  zPencil.init(new PmePencilInitMsg(msgdata));
1407  }
1408 
1409  return; // continue in initialize_pencils() at next startup stage
1410  }
1411 
1412 
     // --- slab decomposition path continues from here ---
     // assign x-slab and (post-transpose) y-slab ranges to each PE
1413  int pe;
1414  int nx = 0;
1415  for ( pe = 0; pe < numGridPes; ++pe ) {
1416  localInfo[pe].x_start = nx;
1417  nx += myGrid.block1;
1418  if ( nx > myGrid.K1 ) nx = myGrid.K1;
1419  localInfo[pe].nx = nx - localInfo[pe].x_start;
1420  }
1421  int ny = 0;
1422  for ( pe = 0; pe < numTransPes; ++pe ) {
1423  localInfo[pe].y_start_after_transpose = ny;
1424  ny += myGrid.block2;
1425  if ( ny > myGrid.K2 ) ny = myGrid.K2;
1426  localInfo[pe].ny_after_transpose =
1427  ny - localInfo[pe].y_start_after_transpose;
1428  }
1429 
1430  { // decide how many pes this node exchanges charges with
1431 
1432  PatchMap *patchMap = PatchMap::Object();
1433  Lattice lattice = simParams->lattice;
1434  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1435  BigReal cutoff = simParams->cutoff;
1436  BigReal patchdim = simParams->patchDimension;
1437  int numPatches = patchMap->numPatches();
1438  int numNodes = CkNumPes();
1439  int *source_flags = new int[numNodes];
1440  int node;
1441  for ( node=0; node<numNodes; ++node ) {
1442  source_flags[node] = 0;
1443  recipPeDest[node] = 0;
1444  }
1445 
1446  // // make sure that we don't get ahead of ourselves on this node
1447  // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
1448  // source_flags[CkMyPe()] = 1;
1449  // recipPeDest[myRecipPe] = 1;
1450  // }
1451 
     // walk every patch's x-extent (with periodic wrap) to find who
     // sends charges to this grid PE and where our patches send
1452  for ( int pid=0; pid < numPatches; ++pid ) {
1453  int pnode = patchMap->node(pid);
1454 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1455  if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
1456 #endif
1457  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1458  BigReal minx = patchMap->min_a(pid);
1459  BigReal maxx = patchMap->max_a(pid);
1460  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1461  // min1 (max1) is smallest (largest) grid line for this patch
1462  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1463  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1464  for ( int i=min1; i<=max1; ++i ) {
1465  int ix = i;
1466  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1467  while ( ix < 0 ) ix += myGrid.K1;
1468  // set source_flags[pnode] if this patch sends to our node
1469  if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
1470  ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
1471  source_flags[pnode] = 1;
1472  }
1473  // set dest_flags[] for node that our patch sends to
1474 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1475  if ( offload ) {
1476  if ( pnode == CkNodeFirst(CkMyNode()) ) {
1477  recipPeDest[ix / myGrid.block1] = 1;
1478  }
1479  } else
1480 #endif
1481  if ( pnode == CkMyPe() ) {
1482  recipPeDest[ix / myGrid.block1] = 1;
1483  }
1484  }
1485  }
1486 
1487  int numSourcesSamePhysicalNode = 0;
1488  numSources = 0;
1489  numDestRecipPes = 0;
1490  for ( node=0; node<numNodes; ++node ) {
1491  if ( source_flags[node] ) ++numSources;
1492  if ( recipPeDest[node] ) ++numDestRecipPes;
1493  if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
1494  }
1495 
1496 #if 0
1497  if ( numSources ) {
1498  CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
1499  CkMyPe(), numSourcesSamePhysicalNode, numSources);
1500  iout << iINFO << "PME " << CkMyPe() << " sources:";
1501  for ( node=0; node<numNodes; ++node ) {
1502  if ( source_flags[node] ) iout << " " << node;
1503  }
1504  iout << "\n" << endi;
1505  }
1506 #endif
1507 
1508  delete [] source_flags;
1509 
1510  // CkPrintf("PME on node %d has %d sources and %d destinations\n",
1511  // CkMyPe(), numSources, numDestRecipPes);
1512 
1513  } // decide how many pes this node exchanges charges with (end)
1514 
1515  ungrid_count = numDestRecipPes;
1516 
1517  sendTransBarrier_received = 0;
1518 
1519  if ( myGridPe < 0 && myTransPe < 0 ) return;
1520  // the following only for nodes doing reciprocal sum
1521 
1522  if ( myTransPe >= 0 ) {
1523  recipEvirPe = findRecipEvirPe();
1524  pmeProxy[recipEvirPe].addRecipEvirClient();
1525  }
1526 
1527  if ( myTransPe >= 0 ) {
1528  int k2_start = localInfo[myTransPe].y_start_after_transpose;
1529  int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
1530  #ifdef OPENATOM_VERSION
1531  if ( simParams->openatomOn ) {
1532  CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
1533  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
1534  } else {
1535  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1536  }
1537  #else // OPENATOM_VERSION
1538  myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
1539  #endif // OPENATOM_VERSION
1540  }
1541 
     // grid buffers sized for the larger of the grid-phase and trans-phase
     // layouts; kgrid aliases qgrid in the single-PE case
1542  int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
1543  int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
1544  if ( local_size < local_size_2 ) local_size = local_size_2;
1545  qgrid = new float[local_size*numGrids];
1546  if ( numGridPes > 1 || numTransPes > 1 ) {
1547  kgrid = new float[local_size*numGrids];
1548  } else {
1549  kgrid = qgrid;
1550  }
1551  qgrid_size = local_size;
1552 
1553  if ( myGridPe >= 0 ) {
1554  qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
1555  qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
1556  fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
1557  fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
1558  }
1559 
     // FFTW plan creation; fftw_plan_lock serializes the non-thread-safe
     // planner across ranks
1560  int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
1561 #ifdef NAMD_FFTW
1562  CmiLock(fftw_plan_lock);
1563 #ifdef NAMD_FFTW_3
1564  work = new fftwf_complex[n[0]];
1565  int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT : simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE ;
1566  if ( myGridPe >= 0 ) {
1567  forward_plan_yz=new fftwf_plan[numGrids];
1568  backward_plan_yz=new fftwf_plan[numGrids];
1569  }
1570  if ( myTransPe >= 0 ) {
1571  forward_plan_x=new fftwf_plan[numGrids];
1572  backward_plan_x=new fftwf_plan[numGrids];
1573  }
1574  /* need one plan per grid */
1575  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1576  if ( myGridPe >= 0 ) {
1577  for( int g=0; g<numGrids; g++)
1578  {
1579  forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1,
1580  localInfo[myGridPe].nx,
1581  qgrid + qgrid_size * g,
1582  NULL,
1583  1,
1584  myGrid.dim2 * myGrid.dim3,
1585  (fftwf_complex *)
1586  (qgrid + qgrid_size * g),
1587  NULL,
1588  1,
1589  myGrid.dim2 * (myGrid.dim3/2),
1590  fftwFlags);
1591  }
1592  }
1593  int zdim = myGrid.dim3;
1594  int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
1595  if ( ! CkMyPe() ) iout << " 2..." << endi;
1596  if ( myTransPe >= 0 ) {
1597  for( int g=0; g<numGrids; g++)
1598  {
1599 
1600  forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1601  (fftwf_complex *)
1602  (kgrid+qgrid_size*g),
1603  NULL,
1604  xStride,
1605  1,
1606  (fftwf_complex *)
1607  (kgrid+qgrid_size*g),
1608  NULL,
1609  xStride,
1610  1,
1611  FFTW_FORWARD,fftwFlags);
1612 
1613  }
1614  }
1615  if ( ! CkMyPe() ) iout << " 3..." << endi;
1616  if ( myTransPe >= 0 ) {
1617  for( int g=0; g<numGrids; g++)
1618  {
1619  backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
1620  (fftwf_complex *)
1621  (kgrid+qgrid_size*g),
1622  NULL,
1623  xStride,
1624  1,
1625  (fftwf_complex *)
1626  (kgrid+qgrid_size*g),
1627  NULL,
1628  xStride,
1629  1,
1630  FFTW_BACKWARD, fftwFlags);
1631 
1632  }
1633  }
1634  if ( ! CkMyPe() ) iout << " 4..." << endi;
1635  if ( myGridPe >= 0 ) {
1636  for( int g=0; g<numGrids; g++)
1637  {
1638  backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1,
1639  localInfo[myGridPe].nx,
1640  (fftwf_complex *)
1641  (qgrid + qgrid_size * g),
1642  NULL,
1643  1,
1644  myGrid.dim2*(myGrid.dim3/2),
1645  qgrid + qgrid_size * g,
1646  NULL,
1647  1,
1648  myGrid.dim2 * myGrid.dim3,
1649  fftwFlags);
1650  }
1651  }
1652  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1653 
1654 #else
     // legacy FFTW 2.x planner path
1655  work = new fftw_complex[n[0]];
1656 
1657  if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps. 1..." << endi;
1658  if ( myGridPe >= 0 ) {
1659  forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
1660  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1661  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1662  }
1663  if ( ! CkMyPe() ) iout << " 2..." << endi;
1664  if ( myTransPe >= 0 ) {
1665  forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
1666  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1667  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1668  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1669  }
1670  if ( ! CkMyPe() ) iout << " 3..." << endi;
1671  if ( myTransPe >= 0 ) {
1672  backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
1673  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1674  | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
1675  localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
1676  }
1677  if ( ! CkMyPe() ) iout << " 4..." << endi;
1678  if ( myGridPe >= 0 ) {
1679  backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
1680  ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
1681  | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
1682  }
1683  if ( ! CkMyPe() ) iout << " Done.\n" << endi;
1684 #endif
1685  CmiUnlock(fftw_plan_lock);
1686 #else
1687  NAMD_die("Sorry, FFTW must be compiled in to use PME.");
1688 #endif
1689 
     // final sanity check and message counters for the upcoming phases
1690  if ( myGridPe >= 0 && numSources == 0 )
1691  NAMD_bug("PME grid elements exist without sources.");
1692  grid_count = numSources;
1693  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
1694  trans_count = numGridPes;
1695 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:19
static CmiNodeLock fftw_plan_lock
Definition: ComputePme.C:420
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
static void sortPmePes(int *pmepes, int xdim, int ydim)
Definition: WorkDistrib.C:304
int numNodesWithPatches(void)
Definition: PatchMap.h:61
Vector a_r() const
Definition: Lattice.h:268
int dim3
Definition: PmeBase.h:19
CProxy_ComputePmeMgr pmeProxy
Definition: ComputePme.C:223
static PatchMap * Object()
Definition: PatchMap.h:27
BigReal min_a(int pid) const
Definition: PatchMap.h:91
int K2
Definition: PmeBase.h:18
CProxy_PmeZPencil zPencil
Definition: ComputePme.C:222
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:21
CProxy_PmeYPencil yPencil
Definition: ComputePme.C:221
CProxy_PmePencilMap zm
Definition: ComputePme.C:227
CProxy_PmePencilMap xm
Definition: ComputePme.C:225
if(ComputeNonbondedUtil::goMethod==2)
CProxy_NodePmeMgr pmeNodeProxy
Definition: ComputePme.C:224
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:21
Definition: Random.h:37
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:129
int block3
Definition: PmeBase.h:21
void generatePmePeList2(int *gridPeMap, int numGridPes, int *transPeMap, int numTransPes)
Definition: ComputePme.C:298
gridSize z
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
void NAMD_die(const char *err_msg)
Definition: common.C:85
static int findRecipEvirPe()
Definition: ComputePme.C:247
static int * peDiffuseOrdering
Definition: WorkDistrib.h:115
int ny_after_transpose
Definition: ComputePme.C:239
int getDeviceID()
Definition: DeviceCUDA.h:112
int add(const Elem &elem)
Definition: ResizeArray.h:97
BigReal max_a(int pid) const
Definition: PatchMap.h:92
BlockRadixSort::TempStorage sort
long long int64
Definition: common.h:34
#define simParams
Definition: Output.C:127
int K3
Definition: PmeBase.h:18
int numPatches(void) const
Definition: PatchMap.h:59
void resize(int i)
Definition: ResizeArray.h:84
int node(int pid) const
Definition: PatchMap.h:114
void cuda_errcheck(const char *msg)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
BigReal patchDimension
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
gridSize y
CProxy_PmePencilMap ym
Definition: ComputePme.C:226
int size(void) const
Definition: ResizeArray.h:127
gridSize x
bool one_device_per_node()
Definition: DeviceCUDA.C:402
char * pencilPMEProcessors
Definition: ComputePme.C:113
Vector a() const
Definition: Lattice.h:252
Vector unit(void) const
Definition: Vector.h:182
double BigReal
Definition: common.h:114
CProxy_PmeXPencil xPencil
Definition: ComputePme.C:220
int y_start_after_transpose
Definition: ComputePme.C:239
void ComputePmeMgr::initialize_computes ( )

Definition at line 2726 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, master_pe, NAMD_bug(), ComputePmeUtil::numGrids, PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, ReductionMgr::willSubmit(), and XCOPY.

2726  {
2727 
2728  noWorkCount = 0;
2729  doWorkCount = 0;
2730  ungridForcesCount = 0;
2731 
2733 
2735 
2736  strayChargeErrors = 0;
2737 
2738 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2739  PatchMap *patchMap = PatchMap::Object();
2740  int pe = master_pe = CkNodeFirst(CkMyNode());
2741  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
2742  if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
2743  if ( ! patchMap->numPatchesOnNode(pe) ) continue;
2744  if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
2745  if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
2747  && pe != deviceCUDA->getMasterPe() ) {
2748  master_pe = pe;
2749  }
2750  }
2751  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
2752  NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
2753  }
2754 
2755  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
2756  bool cudaFirst = 1;
2757  if ( offload ) {
2758  CmiLock(cuda_lock);
2759  cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
2760  }
2761 
2762  if ( cudaFirst ) {
2763  nodePmeMgr->master_pe = master_pe;
2764  nodePmeMgr->masterPmeMgr = masterPmeMgr;
2765  }
2766 #endif
2767 
2768  qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
2769  fsize = myGrid.K1 * myGrid.dim2;
2770  if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
2771 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2772  if ( ! offload )
2773 #endif
2774  {
2775  q_arr = new float*[fsize*numGrids];
2776  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2777  q_list = new float*[fsize*numGrids];
2778  memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
2779  q_count = 0;
2780  }
2781 
2782 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2783  if ( cudaFirst || ! offload ) {
2784 #endif
2785  f_arr = new char[fsize*numGrids];
2786  // memset to non-zero value has race condition on BlueGene/Q
2787  // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
2788  for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
2789 
2790  for ( int g=0; g<numGrids; ++g ) {
2791  char *f = f_arr + g*fsize;
2792  if ( usePencils ) {
2793  int K1 = myGrid.K1;
2794  int K2 = myGrid.K2;
2795  int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
2796  int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
2797  int dim2 = myGrid.dim2;
2798  for (int ap=0; ap<numPencilsActive; ++ap) {
2799  int ib = activePencils[ap].i;
2800  int jb = activePencils[ap].j;
2801  int ibegin = ib*block1;
2802  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
2803  int jbegin = jb*block2;
2804  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
2805  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
2806  for ( int i=ibegin; i<iend; ++i ) {
2807  for ( int j=jbegin; j<jend; ++j ) {
2808  f[i*dim2+j] = 0;
2809  }
2810  }
2811  }
2812  } else {
2813  int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
2814  bsize = block1 * myGrid.dim2 * myGrid.dim3;
2815  for (int pe=0; pe<numGridPes; pe++) {
2816  if ( ! recipPeDest[pe] ) continue;
2817  int start = pe * bsize;
2818  int len = bsize;
2819  if ( start >= qsize ) { start = 0; len = 0; }
2820  if ( start + len > qsize ) { len = qsize - start; }
2821  int zdim = myGrid.dim3;
2822  int fstart = start / zdim;
2823  int flen = len / zdim;
2824  memset(f + fstart, 0, flen*sizeof(char));
2825  // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
2826  }
2827  }
2828  }
2829 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2830  }
2831  if ( offload ) {
2832  cudaSetDevice(deviceCUDA->getDeviceID());
2833  if ( cudaFirst ) {
2834 
2835  int f_alloc_count = 0;
2836  for ( int n=fsize, i=0; i<n; ++i ) {
2837  if ( f_arr[i] == 0 ) {
2838  ++f_alloc_count;
2839  }
2840  }
2841  // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
2842 
2843  q_arr = new float*[fsize*numGrids];
2844  memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
2845 
2846  float **q_arr_dev_host = new float*[fsize];
2847  cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
2848 
2849  float **v_arr_dev_host = new float*[fsize];
2850  cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
2851 
2852  int q_stride = myGrid.K3+myGrid.order-1;
2853  q_data_size = f_alloc_count * q_stride * sizeof(float);
2854  ffz_size = (fsize + q_stride) * sizeof(int);
2855 
2856  // tack ffz onto end of q_data to allow merged transfer
2857  cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
2858  ffz_host = (int*)(((char*)q_data_host) + q_data_size);
2859  cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
2860  ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
2861  cudaMalloc((void**) &v_data_dev, q_data_size);
2862  cuda_errcheck("malloc grid data for pme");
2863  cudaMemset(q_data_dev, 0, q_data_size + ffz_size); // for first time
2864  cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
2865  cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
2866  cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
2867  cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
2868 
2869  f_alloc_count = 0;
2870  for ( int n=fsize, i=0; i<n; ++i ) {
2871  if ( f_arr[i] == 0 ) {
2872  q_arr[i] = q_data_host + f_alloc_count * q_stride;
2873  q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
2874  v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
2875  ++f_alloc_count;
2876  } else {
2877  q_arr[i] = 0;
2878  q_arr_dev_host[i] = 0;
2879  v_arr_dev_host[i] = 0;
2880  }
2881  }
2882 
2883  cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2884  cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
2885  delete [] q_arr_dev_host;
2886  delete [] v_arr_dev_host;
2887  delete [] f_arr;
2888  f_arr = new char[fsize + q_stride];
2889  fz_arr = f_arr + fsize;
2890  memset(f_arr, 0, fsize + q_stride);
2891  memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
2892 
2893  cuda_errcheck("initialize grid data for pme");
2894 
2895  cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
2896  cuda_errcheck("initialize bspline coefficients for pme");
2897 
2898 #define XCOPY(X) masterPmeMgr->X = X;
2899  XCOPY(bspline_coeffs_dev)
2900  XCOPY(bspline_dcoeffs_dev)
2901  XCOPY(q_arr)
2902  XCOPY(q_arr_dev)
2903  XCOPY(v_arr_dev)
2904  XCOPY(q_data_size)
2905  XCOPY(q_data_host)
2906  XCOPY(q_data_dev)
2907  XCOPY(v_data_dev)
2908  XCOPY(ffz_size)
2909  XCOPY(ffz_host)
2910  XCOPY(ffz_dev)
2911  XCOPY(f_arr)
2912  XCOPY(fz_arr)
2913 #undef XCOPY
2914  //CkPrintf("pe %d init first\n", CkMyPe());
2915  } else { // cudaFirst
2916  //CkPrintf("pe %d init later\n", CkMyPe());
2917 #define XCOPY(X) X = masterPmeMgr->X;
2918  XCOPY(bspline_coeffs_dev)
2919  XCOPY(bspline_dcoeffs_dev)
2920  XCOPY(q_arr)
2921  XCOPY(q_arr_dev)
2922  XCOPY(v_arr_dev)
2923  XCOPY(q_data_size)
2924  XCOPY(q_data_host)
2925  XCOPY(q_data_dev)
2926  XCOPY(v_data_dev)
2927  XCOPY(ffz_size)
2928  XCOPY(ffz_host)
2929  XCOPY(ffz_dev)
2930  XCOPY(f_arr)
2931  XCOPY(fz_arr)
2932 #undef XCOPY
2933  } // cudaFirst
2934  CmiUnlock(cuda_lock);
2935  } else // offload
2936 #endif // NAMD_CUDA
2937  {
2938  fz_arr = new char[myGrid.K3+myGrid.order-1];
2939  }
2940 
2941 #if 0 && USE_PERSISTENT
2942  recvGrid_handle = NULL;
2943 #endif
2944 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
static PatchMap * Object()
Definition: PatchMap.h:27
int K2
Definition: PmeBase.h:18
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:365
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:278
int order
Definition: PmeBase.h:20
int getMasterPe()
Definition: DeviceCUDA.h:105
void NAMD_bug(const char *err_msg)
Definition: common.C:129
int chargeGridSubmittedCount
Definition: ComputePme.C:450
#define XCOPY(X)
int getDeviceID()
Definition: DeviceCUDA.h:112
#define simParams
Definition: Output.C:127
int K3
Definition: PmeBase.h:18
void cuda_errcheck(const char *msg)
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
int numPatchesOnNode(int node)
Definition: PatchMap.h:60
int i
Definition: ComputePme.C:349
void cuda_init_bspline_coeffs(float **c, float **dc, int order)
int j
Definition: ComputePme.C:349
static CmiNodeLock cuda_lock
Definition: ComputePme.C:430
void ComputePmeMgr::initialize_pencils ( CkQdMsg *  msg)

Definition at line 1699 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, SimParameters::cutoff, deviceCUDA, DeviceCUDA::getMasterPe(), PmeGrid::K1, PmeGrid::K2, SimParameters::lattice, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, SimParameters::patchDimension, SimParameters::PMESendOrder, Random::reorder(), Node::simParameters, simParams, sort, and Vector::unit().

1699  {
1700  delete msg;
1701  if ( ! usePencils ) return;
1702 
1704 
1705  PatchMap *patchMap = PatchMap::Object();
1706  Lattice lattice = simParams->lattice;
1707  BigReal sysdima = lattice.a_r().unit() * lattice.a();
1708  BigReal sysdimb = lattice.b_r().unit() * lattice.b();
1709  BigReal cutoff = simParams->cutoff;
1710  BigReal patchdim = simParams->patchDimension;
1711  int numPatches = patchMap->numPatches();
1712 
1713  pencilActive = new char[xBlocks*yBlocks];
1714  for ( int i=0; i<xBlocks; ++i ) {
1715  for ( int j=0; j<yBlocks; ++j ) {
1716  pencilActive[i*yBlocks+j] = 0;
1717  }
1718  }
1719 
1720  for ( int pid=0; pid < numPatches; ++pid ) {
1721  int pnode = patchMap->node(pid);
1722 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1723  if ( offload ) {
1724  if ( CkNodeOf(pnode) != CkMyNode() ) continue;
1725  } else
1726 #endif
1727  if ( pnode != CkMyPe() ) continue;
1728 
1729  int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
1730  int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
1731 
1732  BigReal minx = patchMap->min_a(pid);
1733  BigReal maxx = patchMap->max_a(pid);
1734  BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
1735  // min1 (max1) is smallest (largest) grid line for this patch
1736  int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
1737  int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
1738 
1739  BigReal miny = patchMap->min_b(pid);
1740  BigReal maxy = patchMap->max_b(pid);
1741  BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
1742  // min2 (max2) is smallest (largest) grid line for this patch
1743  int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
1744  int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
1745 
1746  for ( int i=min1; i<=max1; ++i ) {
1747  int ix = i;
1748  while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
1749  while ( ix < 0 ) ix += myGrid.K1;
1750  for ( int j=min2; j<=max2; ++j ) {
1751  int jy = j;
1752  while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
1753  while ( jy < 0 ) jy += myGrid.K2;
1754  pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
1755  }
1756  }
1757  }
1758 
1759  numPencilsActive = 0;
1760  for ( int i=0; i<xBlocks; ++i ) {
1761  for ( int j=0; j<yBlocks; ++j ) {
1762  if ( pencilActive[i*yBlocks+j] ) {
1763  ++numPencilsActive;
1764 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
1765  if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
1766 #endif
1767  zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
1768  }
1769  }
1770  }
1771  activePencils = new ijpair[numPencilsActive];
1772  numPencilsActive = 0;
1773  for ( int i=0; i<xBlocks; ++i ) {
1774  for ( int j=0; j<yBlocks; ++j ) {
1775  if ( pencilActive[i*yBlocks+j] ) {
1776  activePencils[numPencilsActive++] = ijpair(i,j);
1777  }
1778  }
1779  }
1780  if ( simParams->PMESendOrder ) {
1781  std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
1782  } else {
1783  Random rand(CkMyPe());
1784  rand.reorder(activePencils,numPencilsActive);
1785  }
1786  //if ( numPencilsActive ) {
1787  // CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
1788  //}
1789 
1790  ungrid_count = numPencilsActive;
1791 }
static Node * Object()
Definition: Node.h:86
Vector a_r() const
Definition: Lattice.h:268
static PatchMap * Object()
Definition: PatchMap.h:27
BigReal min_a(int pid) const
Definition: PatchMap.h:91
int K2
Definition: PmeBase.h:18
SimParameters * simParameters
Definition: Node.h:178
int K1
Definition: PmeBase.h:18
int block1
Definition: PmeBase.h:21
int block2
Definition: PmeBase.h:21
Vector b_r() const
Definition: Lattice.h:269
BigReal min_b(int pid) const
Definition: PatchMap.h:93
Definition: Random.h:37
int order
Definition: PmeBase.h:20
int getMasterPe()
Definition: DeviceCUDA.h:105
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int numPatches
BigReal max_b(int pid) const
Definition: PatchMap.h:94
BigReal max_a(int pid) const
Definition: PatchMap.h:92
BlockRadixSort::TempStorage sort
#define simParams
Definition: Output.C:127
int numPatches(void) const
Definition: PatchMap.h:59
int node(int pid) const
Definition: PatchMap.h:114
Vector b() const
Definition: Lattice.h:253
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
BigReal patchDimension
Vector a() const
Definition: Lattice.h:252
Vector unit(void) const
Definition: Vector.h:182
double BigReal
Definition: common.h:114
void ComputePmeMgr::pollChargeGridReady ( )

Definition at line 3541 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

3541  {
3542 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3543  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
3545 #else
3546  NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
3547 #endif
3548 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2476
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:129
void cuda_check_pme_charges(void *arg, double walltime)
Definition: ComputePme.C:3463
void ComputePmeMgr::pollForcesReady ( )

Definition at line 2662 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

2662  {
2663 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2664  CcdCallBacksReset(0,CmiWallTimer()); // fix Charm++
2666 #else
2667  NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
2668 #endif
2669 }
#define CUDA_POLL(FN, ARG)
Definition: ComputePme.C:2476
void CcdCallBacksReset(void *ignored, double curWallTime)
void NAMD_bug(const char *err_msg)
Definition: common.C:129
void cuda_check_pme_forces(void *arg, double walltime)
Definition: ComputePme.C:2483
void ComputePmeMgr::procTrans ( PmeTransMsg msg)

Definition at line 2054 of file ComputePme.C.

References PmeGrid::dim3, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

2054  {
2055  // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
2056  if ( trans_count == numGridPes ) {
2057  lattice = msg->lattice;
2058  grid_sequence = msg->sequence;
2059  }
2060 
2061  if ( msg->nx ) {
2062  int zdim = myGrid.dim3;
2063  NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
2064  int first_pe = nodeInfo.pe_start;
2065  int last_pe = first_pe+nodeInfo.npe-1;
2066  int y_skip = localInfo[myTransPe].y_start_after_transpose
2067  - localInfo[first_pe].y_start_after_transpose;
2068  int ny_msg = localInfo[last_pe].y_start_after_transpose
2069  + localInfo[last_pe].ny_after_transpose
2070  - localInfo[first_pe].y_start_after_transpose;
2071  int ny = localInfo[myTransPe].ny_after_transpose;
2072  int x_start = msg->x_start;
2073  int nx = msg->nx;
2074  for ( int g=0; g<numGrids; ++g ) {
2075  CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
2076  (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
2077  nx*ny*zdim*sizeof(float));
2078  }
2079  }
2080 
2081  --trans_count;
2082 
2083  if ( trans_count == 0 ) {
2084  pmeProxyDir[CkMyPe()].gridCalc2();
2085  }
2086 }
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
float * qgrid
Definition: ComputePme.C:143
int ny_after_transpose
Definition: ComputePme.C:239
Lattice lattice
Definition: ComputePme.C:140
int y_start_after_transpose
Definition: ComputePme.C:239
void ComputePmeMgr::procUntrans ( PmeUntransMsg msg)

Definition at line 2308 of file ComputePme.C.

References PmeGrid::dim3, PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, x, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

2308  {
2309  // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
2310 
2311 #if CMK_BLUEGENEL
2312  CmiNetworkProgressAfter (0);
2313 #endif
2314 
2315  NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
2316  int first_pe = nodeInfo.pe_start;
2317  int g;
2318 
2319  if ( msg->ny ) {
2320  int zdim = myGrid.dim3;
2321  int last_pe = first_pe+nodeInfo.npe-1;
2322  int x_skip = localInfo[myGridPe].x_start
2323  - localInfo[first_pe].x_start;
2324  int nx_msg = localInfo[last_pe].x_start
2325  + localInfo[last_pe].nx
2326  - localInfo[first_pe].x_start;
2327  int nx = localInfo[myGridPe].nx;
2328  int y_start = msg->y_start;
2329  int ny = msg->ny;
2330  int slicelen = myGrid.K2 * zdim;
2331  int cpylen = ny * zdim;
2332  for ( g=0; g<numGrids; ++g ) {
2333  float *q = qgrid + qgrid_size * g + y_start * zdim;
2334  float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
2335  for ( int x = 0; x < nx; ++x ) {
2336  CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
2337  q += slicelen;
2338  qmsg += cpylen;
2339  }
2340  }
2341  }
2342 
2343  --untrans_count;
2344 
2345  if ( untrans_count == 0 ) {
2346  pmeProxyDir[CkMyPe()].gridCalc3();
2347  }
2348 }
float * qgrid
Definition: ComputePme.C:160
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
gridSize x
void ComputePmeMgr::recvAck ( PmeAckMsg msg)

Definition at line 2450 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

2450  {
2451  if ( msg ) delete msg;
2452 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2453  if ( offload ) {
2454  CmiLock(cuda_lock);
2455  if ( ungrid_count == 0 ) {
2456  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2457  }
2458  int uc = --ungrid_count;
2459  CmiUnlock(cuda_lock);
2460 
2461  if ( uc == 0 ) {
2462  pmeProxyDir[master_pe].ungridCalc();
2463  }
2464  return;
2465  }
2466 #endif
2467  --ungrid_count;
2468 
2469  if ( ungrid_count == 0 ) {
2470  pmeProxyDir[CkMyPe()].ungridCalc();
2471  }
2472 }
void NAMD_bug(const char *err_msg)
Definition: common.C:129
static CmiNodeLock cuda_lock
Definition: ComputePme.C:430
void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  x,
CProxy_PmeYPencil  y,
CProxy_PmeZPencil  z 
)

Definition at line 806 of file ComputePme.C.

References x, y, and z.

807  {
808  xPencil = x; yPencil = y; zPencil = z;
809 
810  if(CmiMyRank()==0)
811  {
812  pmeNodeProxy.ckLocalBranch()->xPencil=x;
813  pmeNodeProxy.ckLocalBranch()->yPencil=y;
814  pmeNodeProxy.ckLocalBranch()->zPencil=z;
815  }
816 }
gridSize z
gridSize y
gridSize x
void ComputePmeMgr::recvChargeGridReady ( )

Definition at line 3550 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

3550  {
3552 }
Lattice * saved_lattice
Definition: ComputePme.C:453
void chargeGridReady(Lattice &lattice, int sequence)
Definition: ComputePme.C:3554
int saved_sequence
Definition: ComputePme.C:454
void ComputePmeMgr::recvGrid ( PmeGridMsg msg)

Definition at line 1833 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

1833  {
1834  // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
1835  if ( grid_count == 0 ) {
1836  NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
1837  }
1838  if ( grid_count == numSources ) {
1839  lattice = msg->lattice;
1840  grid_sequence = msg->sequence;
1841  }
1842 
1843  int zdim = myGrid.dim3;
1844  int zlistlen = msg->zlistlen;
1845  int *zlist = msg->zlist;
1846  float *qmsg = msg->qgrid;
1847  for ( int g=0; g<numGrids; ++g ) {
1848  char *f = msg->fgrid + fgrid_len * g;
1849  float *q = qgrid + qgrid_size * g;
1850  for ( int i=0; i<fgrid_len; ++i ) {
1851  if ( f[i] ) {
1852  for ( int k=0; k<zlistlen; ++k ) {
1853  q[zlist[k]] += *(qmsg++);
1854  }
1855  }
1856  q += zdim;
1857  }
1858  }
1859 
1860  gridmsg_reuse[numSources-grid_count] = msg;
1861  --grid_count;
1862 
1863  if ( grid_count == 0 ) {
1864  pmeProxyDir[CkMyPe()].gridCalc1();
1865  if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
1866  }
1867 }
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:122
static int numGrids
Definition: ComputePme.h:32
Lattice lattice
Definition: ComputePme.C:124
void NAMD_bug(const char *err_msg)
Definition: common.C:129
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int zlistlen
Definition: ComputePme.C:127
char * fgrid
Definition: ComputePme.C:129
void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg msg)

Definition at line 3029 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), ComputePmeUtil::numGrids, pmeComputes, ResizeArray< T >::size(), and submitReductions().

3029  {
3030  if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
3031  for ( int g=0; g<numGrids; ++g ) {
3032  evir[g] += msg->evir[g];
3033  }
3034  delete msg;
3035  // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
3036  if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
3037 }
static int numGrids
Definition: ComputePme.h:32
PmeReduction * evir
Definition: ComputePme.C:173
void NAMD_bug(const char *err_msg)
Definition: common.C:129
void submitReductions()
Definition: ComputePme.C:4214
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:460
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg msg)

Definition at line 2036 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

2036  {
2037  procTrans(msg->msg);
2038  CmiLock(msg->lock);
2039  int count = --(*msg->count);
2040  CmiUnlock(msg->lock);
2041  if ( count == 0 ) {
2042  CmiDestroyLock(msg->lock);
2043  delete msg->count;
2044  delete msg->msg;
2045  }
2046  delete msg;
2047 }
PmeTransMsg * msg
Definition: ComputePme.C:149
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2054
CmiNodeLock lock
Definition: ComputePme.C:151
void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg msg)

Definition at line 2290 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

2290  {
2291  procUntrans(msg->msg);
2292  CmiLock(msg->lock);
2293  int count = --(*msg->count);
2294  CmiUnlock(msg->lock);
2295  if ( count == 0 ) {
2296  CmiDestroyLock(msg->lock);
2297  delete msg->count;
2298  delete msg->msg;
2299  }
2300  delete msg;
2301 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2308
CmiNodeLock lock
Definition: ComputePme.C:168
PmeUntransMsg * msg
Definition: ComputePme.C:166
void ComputePmeMgr::recvTrans ( PmeTransMsg msg)

Definition at line 2049 of file ComputePme.C.

References procTrans().

2049  {
2050  procTrans(msg);
2051  delete msg;
2052 }
void procTrans(PmeTransMsg *)
Definition: ComputePme.C:2054
void ComputePmeMgr::recvUngrid ( PmeGridMsg msg)

Definition at line 2435 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

2435  {
2436  // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
2437 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2438  if ( ! offload ) // would need lock
2439 #endif
2440  if ( ungrid_count == 0 ) {
2441  NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
2442  }
2443 
2444  if ( usePencils ) copyPencils(msg);
2445  else copyResults(msg);
2446  delete msg;
2447  recvAck(0);
2448 }
void recvAck(PmeAckMsg *)
Definition: ComputePme.C:2450
void NAMD_bug(const char *err_msg)
Definition: common.C:129
void copyPencils(PmeGridMsg *)
Definition: ComputePme.C:3800
void copyResults(PmeGridMsg *)
Definition: ComputePme.C:3992
void ComputePmeMgr::recvUntrans ( PmeUntransMsg msg)

Definition at line 2303 of file ComputePme.C.

References procUntrans().

2303  {
2304  procUntrans(msg);
2305  delete msg;
2306 }
void procUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2308
void ComputePmeMgr::sendChargeGridReady ( )

Definition at line 3527 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, pmeComputes, and ResizeArray< T >::size().

Referenced by cuda_check_pme_charges().

3527  {
3528  for ( int i=0; i<CkMyNodeSize(); ++i ) {
3529  ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
3530  int cs = mgr->pmeComputes.size();
3531  if ( cs ) {
3532  mgr->ungridForcesCount = cs;
3533  mgr->recipEvirCount = mgr->recipEvirClients;
3534  masterPmeMgr->chargeGridSubmittedCount++;
3535  }
3536  }
3537  pmeProxy[master_pe].recvChargeGridReady();
3538 }
int chargeGridSubmittedCount
Definition: ComputePme.C:450
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:460
int size(void) const
Definition: ResizeArray.h:127
void ComputePmeMgr::sendData ( Lattice lattice,
int  sequence 
)

Definition at line 3964 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

3964  {
3965 
3966  sendDataHelper_lattice = &lattice;
3967  sendDataHelper_sequence = sequence;
3968  sendDataHelper_sourcepe = CkMyPe();
3969  sendDataHelper_errors = strayChargeErrors;
3970  strayChargeErrors = 0;
3971 
3972 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3973  if ( offload ) {
3974  for ( int i=0; i < numGridPes; ++i ) {
3975  int pe = gridPeOrder[i]; // different order
3976  if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
3977 #if CMK_MULTICORE
3978  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3979  pmeProxy[gridPeMap[pe]].sendDataHelper(i);
3980 #else
3981  pmeNodeProxy[CkMyNode()].sendDataHelper(i);
3982 #endif
3983  }
3984  } else
3985 #endif
3986  {
3987  sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
3988  }
3989 
3990 }
int sendDataHelper_sequence
Definition: ComputePme.C:377
int sendDataHelper_sourcepe
Definition: ComputePme.C:378
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:376
int sendDataHelper_errors
Definition: ComputePme.C:379
void sendDataPart(int first, int last, Lattice &, int sequence, int sourcepe, int errors)
Definition: ComputePme.C:3842
void ComputePmeMgr::sendDataHelper ( int  iter)

Definition at line 3951 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

3951  {
3952  nodePmeMgr->sendDataHelper(iter);
3953 }
void sendDataHelper(int)
Definition: ComputePme.C:3955
void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice lattice,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3842 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

3842  {
3843 
3844  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3845 
3846  bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
3847 
3848  CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
3849  for (int j=first; j<=last; j++) {
3850  int pe = gridPeOrder[j]; // different order
3851  if ( ! recipPeDest[pe] && ! errors ) continue;
3852  int start = pe * bsize;
3853  int len = bsize;
3854  if ( start >= qsize ) { start = 0; len = 0; }
3855  if ( start + len > qsize ) { len = qsize - start; }
3856  int zdim = myGrid.dim3;
3857  int fstart = start / zdim;
3858  int flen = len / zdim;
3859  int fcount = 0;
3860  int i;
3861 
3862  int g;
3863  for ( g=0; g<numGrids; ++g ) {
3864  char *f = f_arr + fstart + g*fsize;
3865 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3866  if ( offload ) {
3867  int errcount = 0;
3868  for ( i=0; i<flen; ++i ) {
3869  f[i] = ffz_host[fstart+i];
3870  fcount += f[i];
3871  if ( ffz_host[fstart+i] & ~1 ) ++errcount;
3872  }
3873  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
3874  } else
3875 #endif
3876  for ( i=0; i<flen; ++i ) {
3877  fcount += f[i];
3878  }
3879  if ( ! recipPeDest[pe] ) {
3880  int errfound = 0;
3881  for ( i=0; i<flen; ++i ) {
3882  if ( f[i] == 3 ) {
3883  errfound = 1;
3884  break;
3885  }
3886  }
3887  if ( errfound ) {
3888  iout << iERROR << "Stray PME grid charges detected: "
3889  << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
3890  int iz = -1;
3891  for ( i=0; i<flen; ++i ) {
3892  if ( f[i] == 3 ) {
3893  f[i] = 2;
3894  int jz = (i+fstart)/myGrid.K2;
3895  if ( iz != jz ) { iout << " " << jz; iz = jz; }
3896  }
3897  }
3898  iout << "\n" << endi;
3899  }
3900  }
3901  }
3902 
3903 #ifdef NETWORK_PROGRESS
3904  CmiNetworkProgress();
3905 #endif
3906 
3907  if ( ! recipPeDest[pe] ) continue;
3908 
3909  int zlistlen = 0;
3910  for ( i=0; i<myGrid.K3; ++i ) {
3911  if ( fz_arr[i] ) ++zlistlen;
3912  }
3913 
3914  PmeGridMsg *msg = new (zlistlen, flen*numGrids,
3915  fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3916 
3917  msg->sourceNode = sourcepe;
3918  msg->lattice = lattice;
3919  msg->start = fstart;
3920  msg->len = flen;
3921  msg->zlistlen = zlistlen;
3922  int *zlist = msg->zlist;
3923  zlistlen = 0;
3924  for ( i=0; i<myGrid.K3; ++i ) {
3925  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3926  }
3927  float *qmsg = msg->qgrid;
3928  for ( g=0; g<numGrids; ++g ) {
3929  char *f = f_arr + fstart + g*fsize;
3930  CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
3931  float **q = q_arr + fstart + g*fsize;
3932  for ( i=0; i<flen; ++i ) {
3933  if ( f[i] ) {
3934  for (int h=0; h<myGrid.order-1; ++h) {
3935  q[i][h] += q[i][myGrid.K3+h];
3936  }
3937  for ( int k=0; k<zlistlen; ++k ) {
3938  *(qmsg++) = q[i][zlist[k]];
3939  }
3940  }
3941  }
3942  }
3943 
3944  msg->sequence = compute_sequence;
3945  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3946  pmeProxy[gridPeMap[pe]].recvGrid(msg);
3947  }
3948 
3949 }
int dim2
Definition: PmeBase.h:19
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:122
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:21
if(ComputeNonbondedUtil::goMethod==2)
Lattice lattice
Definition: ComputePme.C:124
#define iout
Definition: InfoStream.h:51
void recvGrid(PmeGridMsg *)
Definition: ComputePme.C:1833
int sourceNode
Definition: ComputePme.C:121
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:129
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int K3
Definition: PmeBase.h:18
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int zlistlen
Definition: ComputePme.C:127
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:129
void ComputePmeMgr::sendPencils ( Lattice lattice,
int  sequence 
)

Definition at line 3737 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), ijpair::i, iERROR(), iout, ijpair::j, PmeGrid::K1, PmeGrid::K2, ComputePmeUtil::numGrids, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

3737  {
3738 
3739  sendDataHelper_lattice = &lattice;
3740  sendDataHelper_sequence = sequence;
3741  sendDataHelper_sourcepe = CkMyPe();
3742 
3743 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3744  if ( offload ) {
3745  for ( int ap=0; ap < numPencilsActive; ++ap ) {
3746 #if CMK_MULTICORE
3747  // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
3748  int ib = activePencils[ap].i;
3749  int jb = activePencils[ap].j;
3750  int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
3751  pmeProxy[destproc].sendPencilsHelper(ap);
3752 #else
3753  pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
3754 #endif
3755  }
3756  } else
3757 #endif
3758  {
3759  sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
3760  }
3761 
3762  if ( strayChargeErrors ) {
3763  strayChargeErrors = 0;
3764  iout << iERROR << "Stray PME grid charges detected: "
3765  << CkMyPe() << " sending to (x,y)";
3766  int K1 = myGrid.K1;
3767  int K2 = myGrid.K2;
3768  int dim2 = myGrid.dim2;
3769  int block1 = myGrid.block1;
3770  int block2 = myGrid.block2;
3771  for (int ib=0; ib<xBlocks; ++ib) {
3772  for (int jb=0; jb<yBlocks; ++jb) {
3773  int ibegin = ib*block1;
3774  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3775  int jbegin = jb*block2;
3776  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3777  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3778 
3779  for ( int g=0; g<numGrids; ++g ) {
3780  char *f = f_arr + g*fsize;
3781  if ( ! pencilActive[ib*yBlocks+jb] ) {
3782  for ( int i=ibegin; i<iend; ++i ) {
3783  for ( int j=jbegin; j<jend; ++j ) {
3784  if ( f[i*dim2+j] == 3 ) {
3785  f[i*dim2+j] = 2;
3786  iout << " (" << i << "," << j << ")";
3787  }
3788  }
3789  }
3790  }
3791  }
3792  }
3793  }
3794  iout << "\n" << endi;
3795  }
3796 
3797 }
int dim2
Definition: PmeBase.h:19
CProxy_PmePencilMap zm
Definition: ComputePme.C:640
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
int block1
Definition: PmeBase.h:21
#define iout
Definition: InfoStream.h:51
int block2
Definition: PmeBase.h:21
int sendDataHelper_sequence
Definition: ComputePme.C:377
int sendDataHelper_sourcepe
Definition: ComputePme.C:378
Lattice * sendDataHelper_lattice
Definition: ComputePme.C:376
int i
Definition: ComputePme.C:349
std::ostream & iERROR(std::ostream &s)
Definition: InfoStream.C:83
int j
Definition: ComputePme.C:349
void sendPencilsPart(int first, int last, Lattice &, int sequence, int sourcepe)
Definition: ComputePme.C:3582
void ComputePmeMgr::sendPencilsHelper ( int  iter)

Definition at line 3724 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

3724  {
3725  nodePmeMgr->sendPencilsHelper(iter);
3726 }
void sendPencilsHelper(int)
Definition: ComputePme.C:3728
void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice lattice,
int  sequence,
int  sourcepe 
)

Definition at line 3582 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, ijpair::i, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), ComputePmeUtil::numGrids, PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

3582  {
3583 
3584  // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
3585 
3586 #if 0 && USE_PERSISTENT
3587  if (recvGrid_handle== NULL) setup_recvgrid_persistent();
3588 #endif
3589  int K1 = myGrid.K1;
3590  int K2 = myGrid.K2;
3591  int dim2 = myGrid.dim2;
3592  int dim3 = myGrid.dim3;
3593  int block1 = myGrid.block1;
3594  int block2 = myGrid.block2;
3595 
3596  // int savedMessages = 0;
3597  NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
3598 
3599  for (int ap=first; ap<=last; ++ap) {
3600  int ib = activePencils[ap].i;
3601  int jb = activePencils[ap].j;
3602  int ibegin = ib*block1;
3603  int iend = ibegin + block1; if ( iend > K1 ) iend = K1;
3604  int jbegin = jb*block2;
3605  int jend = jbegin + block2; if ( jend > K2 ) jend = K2;
3606  int flen = numGrids * (iend - ibegin) * (jend - jbegin);
3607 
3608  int fcount = 0;
3609  for ( int g=0; g<numGrids; ++g ) {
3610  char *f = f_arr + g*fsize;
3611 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
3612  if ( offload ) {
3613  int errcount = 0;
3614  for ( int i=ibegin; i<iend; ++i ) {
3615  for ( int j=jbegin; j<jend; ++j ) {
3616  int k = i*dim2+j;
3617  f[k] = ffz_host[k];
3618  fcount += f[k];
3619  if ( ffz_host[k] & ~1 ) ++errcount;
3620  }
3621  }
3622  if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
3623  } else
3624 #endif
3625  for ( int i=ibegin; i<iend; ++i ) {
3626  for ( int j=jbegin; j<jend; ++j ) {
3627  fcount += f[i*dim2+j];
3628  }
3629  }
3630  }
3631 
3632 #ifdef NETWORK_PROGRESS
3633  CmiNetworkProgress();
3634 #endif
3635 
3636  if ( ! pencilActive[ib*yBlocks+jb] )
3637  NAMD_bug("PME activePencils list inconsistent");
3638 
3639  int zlistlen = 0;
3640  for ( int i=0; i<myGrid.K3; ++i ) {
3641  if ( fz_arr[i] ) ++zlistlen;
3642  }
3643 
3644  int hd = ( fcount? 1 : 0 ); // has data?
3645  // if ( ! hd ) ++savedMessages;
3646 
3647 
3648  PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
3649  hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
3650  msg->sourceNode = sourcepe;
3651  msg->hasData = hd;
3652  msg->lattice = lattice;
3653  if ( hd ) {
3654 #if 0
3655  msg->start = fstart;
3656  msg->len = flen;
3657 #else
3658  msg->start = -1; // obsolete?
3659  msg->len = -1; // obsolete?
3660 #endif
3661  msg->zlistlen = zlistlen;
3662  int *zlist = msg->zlist;
3663  zlistlen = 0;
3664  for ( int i=0; i<myGrid.K3; ++i ) {
3665  if ( fz_arr[i] ) zlist[zlistlen++] = i;
3666  }
3667  char *fmsg = msg->fgrid;
3668  float *qmsg = msg->qgrid;
3669  for ( int g=0; g<numGrids; ++g ) {
3670  char *f = f_arr + g*fsize;
3671  float **q = q_arr + g*fsize;
3672  for ( int i=ibegin; i<iend; ++i ) {
3673  for ( int j=jbegin; j<jend; ++j ) {
3674  *(fmsg++) = f[i*dim2+j];
3675  if( f[i*dim2+j] ) {
3676  for (int h=0; h<myGrid.order-1; ++h) {
3677  q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
3678  }
3679  for ( int k=0; k<zlistlen; ++k ) {
3680  *(qmsg++) = q[i*dim2+j][zlist[k]];
3681  }
3682  }
3683  }
3684  }
3685  }
3686  }
3687 
3688  msg->sequence = compute_sequence;
3689  SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
3690  CmiEnableUrgentSend(1);
3691 #if USE_NODE_PAR_RECEIVE
3692  msg->destElem=CkArrayIndex3D(ib,jb,0);
3693  CProxy_PmePencilMap lzm = npMgr->zm;
3694  int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
3695  int destnode = CmiNodeOf(destproc);
3696 
3697 #if 0
3698  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3699 #endif
3700  pmeNodeProxy[destnode].recvZGrid(msg);
3701 #if 0
3702  CmiUsePersistentHandle(NULL, 0);
3703 #endif
3704 #else
3705 #if 0
3706  CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
3707 #endif
3708  zPencil(ib,jb,0).recvGrid(msg);
3709 #if 0
3710  CmiUsePersistentHandle(NULL, 0);
3711 #endif
3712 #endif
3713  CmiEnableUrgentSend(0);
3714  }
3715 
3716 
3717  // if ( savedMessages ) {
3718  // CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
3719  // }
3720 
3721 }
int dim2
Definition: PmeBase.h:19
CProxy_PmePencilMap zm
Definition: ComputePme.C:640
int dim3
Definition: PmeBase.h:19
int sequence
Definition: ComputePme.C:122
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
int block1
Definition: PmeBase.h:21
Lattice lattice
Definition: ComputePme.C:124
int block2
Definition: PmeBase.h:21
int sourceNode
Definition: ComputePme.C:121
#define PRIORITY_SIZE
Definition: Priorities.h:13
int order
Definition: PmeBase.h:20
void NAMD_bug(const char *err_msg)
Definition: common.C:129
CkArrayIndex3D destElem
Definition: ComputePme.C:131
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int K3
Definition: PmeBase.h:18
#define PME_GRID_PRIORITY
Definition: Priorities.h:30
int i
Definition: ComputePme.C:349
int zlistlen
Definition: ComputePme.C:127
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:129
int j
Definition: ComputePme.C:349
void ComputePmeMgr::sendTrans ( void  )

Definition at line 1945 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

1945  {
1946 
1947  untrans_count = numTransPes;
1948 
1949 #if CMK_SMP && USE_CKLOOP
1950  int useCkLoop = Node::Object()->simParameters->useCkLoop;
1951  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
1952  CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
1953  } else
1954 #endif
1955  {
1956  sendTransSubset(0, numTransNodes-1);
1957  }
1958 
1959 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
static void PmeSlabSendTrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:1940
#define CKLOOP_CTRL_PME_SENDTRANS
Definition: SimParameters.h:95
void sendTransSubset(int first, int last)
Definition: ComputePme.C:1961
void ComputePmeMgr::sendTransBarrier ( void  )

Definition at line 1930 of file ComputePme.C.

1930  {
1931  sendTransBarrier_received += 1;
1932  // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
1933  if ( sendTransBarrier_received < numGridPes ) return;
1934  sendTransBarrier_received = 0;
1935  for ( int i=0; i<numGridPes; ++i ) {
1936  pmeProxyDir[gridPeMap[i]].sendTrans();
1937  }
1938 }
void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1961 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), PmeGrid::K2, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, x, PmeTransMsg::x_start, LocalPmeInfo::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendTrans(), and sendTrans().

1961  {
1962  // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
1963 
1964  // send data for transpose
1965  int zdim = myGrid.dim3;
1966  int nx = localInfo[myGridPe].nx;
1967  int x_start = localInfo[myGridPe].x_start;
1968  int slicelen = myGrid.K2 * zdim;
1969 
1970  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
1971 
1972 #if CMK_BLUEGENEL
1973  CmiNetworkProgressAfter (0);
1974 #endif
1975 
1976  for (int j=first; j<=last; j++) {
1977  int node = transNodeOrder[j]; // different order on each node
1978  int pe = transNodeInfo[node].pe_start;
1979  int npe = transNodeInfo[node].npe;
1980  int totlen = 0;
1981  if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
1982  LocalPmeInfo &li = localInfo[pe];
1983  int cpylen = li.ny_after_transpose * zdim;
1984  totlen += cpylen;
1985  }
1986  PmeTransMsg *newmsg = new (nx * totlen * numGrids,
1987  PRIORITY_SIZE) PmeTransMsg;
1988  newmsg->sourceNode = myGridPe;
1989  newmsg->lattice = lattice;
1990  newmsg->x_start = x_start;
1991  newmsg->nx = nx;
1992  for ( int g=0; g<numGrids; ++g ) {
1993  float *qmsg = newmsg->qgrid + nx * totlen * g;
1994  pe = transNodeInfo[node].pe_start;
1995  for (int i=0; i<npe; ++i, ++pe) {
1996  LocalPmeInfo &li = localInfo[pe];
1997  int cpylen = li.ny_after_transpose * zdim;
1998  if ( node == myTransNode ) {
1999  ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
2000  qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
2001  }
2002  float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
2003  for ( int x = 0; x < nx; ++x ) {
2004  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2005  q += slicelen;
2006  qmsg += cpylen;
2007  }
2008  }
2009  }
2010  newmsg->sequence = grid_sequence;
2011  SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
2012  if ( node == myTransNode ) newmsg->nx = 0;
2013  if ( npe > 1 ) {
2014  if ( node == myTransNode ) fwdSharedTrans(newmsg);
2015  else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
2016  } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
2017  }
2018 }
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
if(ComputeNonbondedUtil::goMethod==2)
float * qgrid
Definition: ComputePme.C:143
void fwdSharedTrans(PmeTransMsg *)
Definition: ComputePme.C:2020
#define PRIORITY_SIZE
Definition: Priorities.h:13
int sourceNode
Definition: ComputePme.C:137
#define PME_TRANS_PRIORITY
Definition: Priorities.h:31
int ny_after_transpose
Definition: ComputePme.C:239
Lattice lattice
Definition: ComputePme.C:140
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
gridSize x
int y_start_after_transpose
Definition: ComputePme.C:239
void ComputePmeMgr::sendUngrid ( void  )

Definition at line 2375 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

2375  {
2376 
2377 #if CMK_SMP && USE_CKLOOP
2378  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2379  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
2380  CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
2381  } else
2382 #endif
2383  {
2384  sendUngridSubset(0, numSources-1);
2385  }
2386 
2387  grid_count = numSources;
2388  memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
2389 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
static int numGrids
Definition: ComputePme.h:32
static void PmeSlabSendUngrid(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2370
#define CKLOOP_CTRL_PME_SENDUNTRANS
Definition: SimParameters.h:98
void sendUngridSubset(int first, int last)
Definition: ComputePme.C:2391
void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2391 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::len, ComputePmeUtil::numGrids, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, PmeGridMsg::qgrid, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

2391  {
2392 
2393 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2394  const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
2395 #else
2396  const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
2397 #endif
2398 
2399  for ( int j=first; j<=last; ++j ) {
2400  // int msglen = qgrid_len;
2401  PmeGridMsg *newmsg = gridmsg_reuse[j];
2402  int pe = newmsg->sourceNode;
2403  int zdim = myGrid.dim3;
2404  int flen = newmsg->len;
2405  int fstart = newmsg->start;
2406  int zlistlen = newmsg->zlistlen;
2407  int *zlist = newmsg->zlist;
2408  float *qmsg = newmsg->qgrid;
2409  for ( int g=0; g<numGrids; ++g ) {
2410  char *f = newmsg->fgrid + fgrid_len * g;
2411  float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
2412  for ( int i=0; i<flen; ++i ) {
2413  if ( f[i] ) {
2414  for ( int k=0; k<zlistlen; ++k ) {
2415  *(qmsg++) = q[zlist[k]];
2416  }
2417  }
2418  q += zdim;
2419  }
2420  }
2421  newmsg->sourceNode = myGridPe;
2422 
2423  SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
2424  CmiEnableUrgentSend(1);
2425 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2426  if ( offload ) {
2427  pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
2428  } else
2429 #endif
2430  pmeProxyDir[pe].recvUngrid(newmsg);
2431  CmiEnableUrgentSend(0);
2432  }
2433 }
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
int dim3
Definition: PmeBase.h:19
static int numGrids
Definition: ComputePme.h:32
#define PME_OFFLOAD_UNGRID_PRIORITY
Definition: Priorities.h:42
int sourceNode
Definition: ComputePme.C:121
float * qgrid
Definition: ComputePme.C:130
int * zlist
Definition: ComputePme.C:128
int zlistlen
Definition: ComputePme.C:127
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
char * fgrid
Definition: ComputePme.C:129
void ComputePmeMgr::sendUntrans ( void  )

Definition at line 2189 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, ComputePmeUtil::numGrids, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

2189  {
2190 
2191  trans_count = numGridPes;
2192 
2193  { // send energy and virial
2194  PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
2195  for ( int g=0; g<numGrids; ++g ) {
2196  newmsg->evir[g] = recip_evir2[g];
2197  }
2198  SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
2199  CmiEnableUrgentSend(1);
2200  pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
2201  CmiEnableUrgentSend(0);
2202  }
2203 
2204 #if CMK_SMP && USE_CKLOOP
2205  int useCkLoop = Node::Object()->simParameters->useCkLoop;
2206  if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
2207  CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
2208  } else
2209 #endif
2210  {
2211  sendUntransSubset(0, numGridNodes-1);
2212  }
2213 
2214 }
static Node * Object()
Definition: Node.h:86
#define PME_UNGRID_PRIORITY
Definition: Priorities.h:74
SimParameters * simParameters
Definition: Node.h:178
static int numGrids
Definition: ComputePme.h:32
void sendUntransSubset(int first, int last)
Definition: ComputePme.C:2216
PmeReduction * evir
Definition: ComputePme.C:173
#define CKLOOP_CTRL_PME_SENDUNTRANS
Definition: SimParameters.h:98
#define PRIORITY_SIZE
Definition: Priorities.h:13
void recvRecipEvir(PmeEvirMsg *)
Definition: ComputePme.C:3029
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
static void PmeSlabSendUntrans(int first, int last, void *result, int paraNum, void *param)
Definition: ComputePme.C:2184
void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2216 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), PmeGrid::K2, NodePmeInfo::npe, ComputePmeUtil::numGrids, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, x, LocalPmeInfo::x_start, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

2216  {
2217 
2218  int zdim = myGrid.dim3;
2219  int y_start = localInfo[myTransPe].y_start_after_transpose;
2220  int ny = localInfo[myTransPe].ny_after_transpose;
2221  int slicelen = myGrid.K2 * zdim;
2222 
2223  ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
2224 
2225 #if CMK_BLUEGENEL
2226  CmiNetworkProgressAfter (0);
2227 #endif
2228 
2229  // send data for reverse transpose
2230  for (int j=first; j<=last; j++) {
2231  int node = gridNodeOrder[j]; // different order on each node
2232  int pe = gridNodeInfo[node].pe_start;
2233  int npe = gridNodeInfo[node].npe;
2234  int totlen = 0;
2235  if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
2236  LocalPmeInfo &li = localInfo[pe];
2237  int cpylen = li.nx * zdim;
2238  totlen += cpylen;
2239  }
2240  PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
2241  newmsg->sourceNode = myTransPe;
2242  newmsg->y_start = y_start;
2243  newmsg->ny = ny;
2244  for ( int g=0; g<numGrids; ++g ) {
2245  float *qmsg = newmsg->qgrid + ny * totlen * g;
2246  pe = gridNodeInfo[node].pe_start;
2247  for (int i=0; i<npe; ++i, ++pe) {
2248  LocalPmeInfo &li = localInfo[pe];
2249  if ( node == myGridNode ) {
2250  ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
2251  qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
2252  float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
2253  int cpylen = ny * zdim;
2254  for ( int x = 0; x < li.nx; ++x ) {
2255  CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
2256  q += cpylen;
2257  qmsg += slicelen;
2258  }
2259  } else {
2260  CmiMemcpy((void*)qmsg,
2261  (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
2262  li.nx*ny*zdim*sizeof(float));
2263  qmsg += li.nx*ny*zdim;
2264  }
2265  }
2266  }
2267  SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
2268  if ( node == myGridNode ) newmsg->ny = 0;
2269  if ( npe > 1 ) {
2270  if ( node == myGridNode ) fwdSharedUntrans(newmsg);
2271  else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
2272  } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
2273  }
2274 }
float * qgrid
Definition: ComputePme.C:160
int dim3
Definition: PmeBase.h:19
int K2
Definition: PmeBase.h:18
static int numGrids
Definition: ComputePme.h:32
if(ComputeNonbondedUtil::goMethod==2)
#define PRIORITY_SIZE
Definition: Priorities.h:13
void fwdSharedUntrans(PmeUntransMsg *)
Definition: ComputePme.C:2276
int ny_after_transpose
Definition: ComputePme.C:239
#define PME_UNTRANS_PRIORITY
Definition: Priorities.h:33
#define SET_PRIORITY(MSG, SEQ, PRIO)
Definition: Priorities.h:18
gridSize x
int y_start_after_transpose
Definition: ComputePme.C:239
void ComputePmeMgr::submitReductions ( )

Definition at line 4214 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchFepOn, ComputePmeUtil::alchOn, ComputePmeUtil::alchThermIntOn, SimParameters::getElecLambda(), SubmitReduction::item(), ComputePmeUtil::lesFactor, ComputePmeUtil::lesOn, WorkDistrib::messageEnqueueWork(), NAMD_bug(), ComputePmeUtil::numGrids, Node::Object(), ComputePmeUtil::pairOn, REDUCTION_ELECT_ENERGY_PME_TI_1, REDUCTION_ELECT_ENERGY_PME_TI_2, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_STRAY_CHARGE_ERRORS, ResizeArray< T >::resize(), Node::simParameters, ResizeArray< T >::size(), and SubmitReduction::submit().

Referenced by ComputePme::doWork(), and recvRecipEvir().

4214  {
4215 
4217 
4218  for ( int g=0; g<numGrids; ++g ) {
4219  float scale = 1.;
4220  if (alchOn) {
4221  BigReal elecLambdaUp, elecLambdaDown;
4222  // alchLambda set on each step in ComputePme::ungridForces()
4223  if ( alchLambda < 0 || alchLambda > 1 ) {
4224  NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
4225  }
4226  elecLambdaUp = simParams->getElecLambda(alchLambda);
4227  elecLambdaDown = simParams->getElecLambda(1-alchLambda);
4228  if ( g == 0 ) scale = elecLambdaUp;
4229  else if ( g == 1 ) scale = elecLambdaDown;
4230  else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4231  if (alchDecouple) {
4232  if ( g == 2 ) scale = 1-elecLambdaUp;
4233  else if ( g == 3 ) scale = 1-elecLambdaDown;
4234  else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
4235  }
4236  } else if ( lesOn ) {
4237  scale = 1.0 / lesFactor;
4238  } else if ( pairOn ) {
4239  scale = ( g == 0 ? 1. : -1. );
4240  }
4241  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
4242  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
4243  reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
4244  reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
4245  reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
4246  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
4247  reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
4248  reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
4249  reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
4250  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
4251 
4252  float scale2 = 0.;
4253 
4254  // why is this declared/defined again here?
4255  SimParameters *simParams = Node::Object()->simParameters;
4256 
4257  if (alchFepOn) {
4258  BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
4259  elecLambda2Up = simParams->getElecLambda(alchLambda2);
4260  elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
4261  if ( g == 0 ) scale2 = elecLambda2Up;
4262  else if ( g == 1 ) scale2 = elecLambda2Down;
4263  else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4264  if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
4265  else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
4266  else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
4267  }
4268  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
4269 
4270  if (alchThermIntOn) {
4271 
4272  // no decoupling:
4273  // part. 1 <-> all of system except partition 2: g[0] - g[2]
4274  // (interactions between all atoms [partition 0 OR partition 1],
4275  // minus all [within partition 0])
4276  // U = elecLambdaUp * (U[0] - U[2])
4277  // dU/dl = U[0] - U[2];
4278 
4279  // part. 2 <-> all of system except partition 1: g[1] - g[2]
4280  // (interactions between all atoms [partition 0 OR partition 2],
4281  // minus all [within partition 0])
4282  // U = elecLambdaDown * (U[1] - U[2])
4283  // dU/dl = U[1] - U[2];
4284 
4285  // alchDecouple:
4286  // part. 1 <-> part. 0: g[0] - g[2] - g[4]
4287  // (interactions between all atoms [partition 0 OR partition 1]
4288  // minus all [within partition 1] minus all [within partition 0]
4289  // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
4290  // dU/dl = U[0] - U[2] - U[4];
4291 
4292  // part. 2 <-> part. 0: g[1] - g[3] - g[4]
4293  // (interactions between all atoms [partition 0 OR partition 2]
4294  // minus all [within partition 2] minus all [within partition 0]
4295  // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
4296  // dU/dl = U[1] - U[3] - U[4];
4297 
4298 
4299  if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
4300  if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
4301  if (!alchDecouple) {
4302  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4303  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4304  }
4305  else { // alchDecouple
4306  if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4307  if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4308  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
4309  if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
4310  }
4311  }
4312  }
4313 
4314  alchLambda = -1.; // illegal value to catch if not updated
4315 
4316  reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
4317  reduction->submit();
4318 
4319  for ( int i=0; i<heldComputes.size(); ++i ) {
4320  WorkDistrib::messageEnqueueWork(heldComputes[i]);
4321  }
4322  heldComputes.resize(0);
4323 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:178
BigReal & item(int i)
Definition: ReductionMgr.h:312
static int numGrids
Definition: ComputePme.h:32
static Bool alchOn
Definition: ComputePme.h:33
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2732
void NAMD_bug(const char *err_msg)
Definition: common.C:129
static Bool alchDecouple
Definition: ComputePme.h:36
static int lesFactor
Definition: ComputePme.h:39
#define simParams
Definition: Output.C:127
static Bool pairOn
Definition: ComputePme.h:40
static Bool lesOn
Definition: ComputePme.h:38
void resize(int i)
Definition: ResizeArray.h:84
void submit(void)
Definition: ReductionMgr.h:323
int size(void) const
Definition: ResizeArray.h:127
static Bool alchFepOn
Definition: ComputePme.h:34
BigReal getElecLambda(const BigReal)
double BigReal
Definition: common.h:114
static Bool alchThermIntOn
Definition: ComputePme.h:35
void ComputePmeMgr::ungridCalc ( void  )

Definition at line 2525 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), PmeGrid::order, pmeComputes, ResizeArray< T >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

2525  {
2526  // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
2527 
2528  ungridForcesCount = pmeComputes.size();
2529 
2530 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2531  if ( offload ) {
2532  //CmiLock(cuda_lock);
2533  cudaSetDevice(deviceCUDA->getDeviceID());
2534 
2535  if ( this == masterPmeMgr ) {
2536  double before = CmiWallTimer();
2537  cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
2538  cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
2539  // try to make the unspecified launch failures go away
2540  cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
2541  cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
2542  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2543 
2544  const int myrank = CkMyRank();
2545  for ( int i=0; i<CkMyNodeSize(); ++i ) {
2546  if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
2547  nodePmeMgr->mgrObjects[i]->ungridCalc();
2548  }
2549  }
2550  if ( ! pmeComputes.size() ) return;
2551  }
2552 
2553  if ( ! end_forces ) {
2554  int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
2555  end_forces = new cudaEvent_t[n];
2556  for ( int i=0; i<n; ++i ) {
2557  cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
2558  }
2559  }
2560 
2561  const int pcsz = pmeComputes.size();
2562  if ( ! afn_host ) {
2563  cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
2564  cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
2565  cuda_errcheck("malloc params for pme");
2566  }
2567  int totn = 0;
2568  for ( int i=0; i<pcsz; ++i ) {
2569  int n = pmeComputes[i]->numGridAtoms[0];
2570  totn += n;
2571  }
2572  if ( totn > f_data_mgr_alloc ) {
2573  if ( f_data_mgr_alloc ) {
2574  CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
2575  cudaFree(f_data_mgr_dev);
2576  cudaFreeHost(f_data_mgr_host);
2577  }
2578  f_data_mgr_alloc = 1.2 * (totn + 100);
2579  cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
2580  cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
2581  cuda_errcheck("malloc forces for pme");
2582  }
2583  // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
2584  float *f_dev = f_data_mgr_dev;
2585  float *f_host = f_data_mgr_host;
2586  for ( int i=0; i<pcsz; ++i ) {
2587  int n = pmeComputes[i]->numGridAtoms[0];
2588  pmeComputes[i]->f_data_dev = f_dev;
2589  pmeComputes[i]->f_data_host = f_host;
2590  afn_host[3*i ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
2591  afn_host[3*i+1] = f_dev;
2592  afn_host[3*i+2] = f_dev + n; // avoid type conversion issues
2593  f_dev += 3*n;
2594  f_host += 3*n;
2595  }
2596  //CmiLock(cuda_lock);
2597  double before = CmiWallTimer();
2598  cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
2599  cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
2600  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2601  cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
2602  cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
2603  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2604 
2605  for ( int i=0; i<pcsz; ++i ) {
2606  // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
2607  if ( i%EVENT_STRIDE == 0 ) {
2608  int dimy = pcsz - i;
2609  if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
2610  int maxn = 0;
2611  int subtotn = 0;
2612  for ( int j=0; j<dimy; ++j ) {
2613  int n = pmeComputes[i+j]->numGridAtoms[0];
2614  subtotn += n;
2615  if ( n > maxn ) maxn = n;
2616  }
2617  // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
2618  before = CmiWallTimer();
2619  cuda_pme_forces(
2620  bspline_coeffs_dev,
2621  v_arr_dev, afn_dev+3*i, dimy, maxn, /*
2622  pmeComputes[i]->a_data_dev,
2623  pmeComputes[i]->f_data_dev,
2624  n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
2625  streams[stream]);
2626  cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
2627  traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
2628  before = CmiWallTimer();
2629  cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
2630  cudaMemcpyDeviceToHost, streams[stream]);
2631  cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
2632  traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
2633  cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
2634  cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
2635  traceUserEvent(CUDA_EVENT_ID_PME_TICK);
2636  }
2637  // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
2638  }
2639  //CmiUnlock(cuda_lock);
2640  } else
2641 #endif // NAMD_CUDA
2642  {
2643  for ( int i=0; i<pmeComputes.size(); ++i ) {
2644  WorkDistrib::messageEnqueueWork(pmeComputes[i]);
2645  // pmeComputes[i]->ungridForces();
2646  }
2647  }
2648  // submitReductions(); // must follow all ungridForces()
2649 
2650 #if defined(NAMD_CUDA) || defined(NAMD_HIP)
2651  if ( offload ) {
2652  forces_time = CmiWallTimer();
2653  forces_count = ungridForcesCount;
2654  forces_done_count = 0;
2655  pmeProxy[this_pe].pollForcesReady();
2656  }
2657 #endif
2658 
2659  ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
2660 }
double forces_time
Definition: ComputePme.C:437
float * a_data_dev
Definition: ComputePme.C:425
#define EVENT_STRIDE
Definition: ComputePme.C:2477
int K2
Definition: PmeBase.h:18
int K1
Definition: PmeBase.h:18
#define CUDA_EVENT_ID_PME_COPY
static void messageEnqueueWork(Compute *)
Definition: WorkDistrib.C:2732
float * f_data_host
Definition: ComputePme.C:426
int order
Definition: PmeBase.h:20
#define CUDA_EVENT_ID_PME_TICK
float * f_data_dev
Definition: ComputePme.C:427
void ungridCalc(void)
Definition: ComputePme.C:2525
int getDeviceID()
Definition: DeviceCUDA.h:112
int K3
Definition: PmeBase.h:18
void cuda_errcheck(const char *msg)
ResizeArray< ComputePme * > pmeComputes
Definition: ComputePme.C:460
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:22
cudaEvent_t * end_forces
Definition: ComputePme.C:433
#define CUDA_EVENT_ID_PME_KERNEL
int size(void) const
Definition: ResizeArray.h:127
int forces_done_count
Definition: ComputePme.C:435

Friends And Related Function Documentation

friend class ComputePme
friend

Definition at line 363 of file ComputePme.C.

friend class NodePmeMgr
friend

Definition at line 364 of file ComputePme.C.

Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 425 of file ComputePme.C.

Referenced by cuda_submit_charges(), ComputePme::doWork(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 424 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ComputePme::doWork().

int ComputePmeMgr::chargeGridSubmittedCount
double ComputePmeMgr::charges_time

Definition at line 436 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 438 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 439 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 429 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ComputePme::doWork().

int ComputePmeMgr::cuda_atoms_count
bool ComputePmeMgr::cuda_busy
static

Definition at line 448 of file ComputePme.C.

Referenced by ComputePme::doWork().

CmiNodeLock ComputePmeMgr::cuda_lock
static

Definition at line 430 of file ComputePme.C.

Referenced by ComputePmeMgr(), ComputePme::doWork(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque
static

Definition at line 447 of file ComputePme.C.

Referenced by ComputePme::doWork().

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 432 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 427 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 426 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock
static
int ComputePmeMgr::forces_count

Definition at line 434 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 435 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 437 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe
ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes
CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 421 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 453 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence
int ComputePmeMgr::sendDataHelper_errors

Definition at line 379 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice
int ComputePmeMgr::sendDataHelper_sequence
int ComputePmeMgr::sendDataHelper_sourcepe
int ComputePmeMgr::this_pe

Definition at line 441 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: