ComputePmeMgr Class Reference

List of all members.

Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 344 of file ComputePme.C.


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 701 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00701                              : pmeProxy(thisgroup), 
00702                                  pmeProxyDir(thisgroup) {
00703 
00704   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00705   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00706   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00707 
00708   pmeNodeProxy.ckLocalBranch()->initialize();
00709 
00710   if ( CmiMyRank() == 0 ) {
00711     fftw_plan_lock = CmiCreateLock();
00712   }
00713   pmemgr_lock = CmiCreateLock();
00714 
00715   myKSpace = 0;
00716   kgrid = 0;
00717   work = 0;
00718   grid_count = 0;
00719   trans_count = 0;
00720   untrans_count = 0;
00721   ungrid_count = 0;
00722   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00723   useBarrier = 0;
00724   sendTransBarrier_received = 0;
00725   usePencils = 0;
00726 
00727 #ifdef NAMD_CUDA
00728  // offload has not been set so this happens on every run
00729   if ( CmiMyRank() == 0 ) {
00730     cuda_lock = CmiCreateLock();
00731   }
00732 
00733 #if CUDA_VERSION >= 5050
00734   int leastPriority, greatestPriority;
00735   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00736   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00737   //if ( CkMyNode() == 0 ) {
00738   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00739   //}
00740 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00741 #else
00742 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00743 #endif
00744 
00745   stream = 0;
00746   for ( int i=0; i<NUM_STREAMS; ++i ) {
00747 #if 1
00748     CUDA_STREAM_CREATE(&streams[i]);
00749     cuda_errcheck("cudaStreamCreate");
00750 #else
00751   streams[i] = 0;  // XXXX Testing!!!
00752 #endif
00753   }
00754 
00755   this_pe = CkMyPe();
00756  
00757   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00758   end_forces = 0;
00759   check_charges_count = 0;
00760   check_forces_count = 0;
00761   chargeGridSubmittedCount = 0;
00762 
00763   cuda_atoms_count = 0;
00764   cuda_atoms_alloc = 0;
00765 
00766   f_data_mgr_alloc = 0;
00767   f_data_mgr_host = 0;
00768   f_data_mgr_dev = 0;
00769   afn_host = 0;
00770   afn_dev = 0;
00771 
00772 #define CUDA_EVENT_ID_PME_CHARGES 80
00773 #define CUDA_EVENT_ID_PME_FORCES 81
00774 #define CUDA_EVENT_ID_PME_TICK 82
00775 #define CUDA_EVENT_ID_PME_COPY 83
00776 #define CUDA_EVENT_ID_PME_KERNEL 84
00777   if ( 0 == CkMyPe() ) {
00778     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00779     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00780     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00781     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00782     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00783   }
00784 #endif
00785   recipEvirCount = 0;
00786   recipEvirClients = 0;
00787   recipEvirPe = -999;
00788 }
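
The constructor's CUDA_STREAM_CREATE macro picks the highest-priority stream the device offers when the toolkit supports stream priorities, and falls back to a plain stream otherwise. Below is a minimal standalone sketch of that pattern, assuming the CUDA toolkit is installed; makePmeStream is an illustrative helper, not part of NAMD.

  #include <cuda.h>
  #include <cuda_runtime.h>
  #include <cstdio>

  // Illustrative helper (not NAMD code): create a stream at the highest
  // priority the device supports, falling back to a default-priority stream
  // when the toolkit predates stream priorities, mirroring CUDA_STREAM_CREATE.
  static cudaError_t makePmeStream(cudaStream_t *s) {
  #if CUDA_VERSION >= 5050
    int leastPriority, greatestPriority;
    cudaError_t err = cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
    if (err != cudaSuccess) return err;
    return cudaStreamCreateWithPriority(s, cudaStreamDefault, greatestPriority);
  #else
    return cudaStreamCreate(s);
  #endif
  }

  int main() {
    cudaStream_t stream;
    if (makePmeStream(&stream) == cudaSuccess) {
      std::printf("created high-priority PME stream\n");
      cudaStreamDestroy(stream);
    }
    return 0;
  }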

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1807 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01807                               {
01808 
01809   if ( CmiMyRank() == 0 ) {
01810     CmiDestroyLock(fftw_plan_lock);
01811   }
01812   CmiDestroyLock(pmemgr_lock);
01813 
01814   delete myKSpace;
01815   delete [] localInfo;
01816   delete [] gridNodeInfo;
01817   delete [] transNodeInfo;
01818   delete [] gridPeMap;
01819   delete [] transPeMap;
01820   delete [] recipPeDest;
01821   delete [] gridPeOrder;
01822   delete [] gridNodeOrder;
01823   delete [] transNodeOrder;
01824   delete [] qgrid;
01825   if ( kgrid != qgrid ) delete [] kgrid;
01826   delete [] work;
01827   delete [] gridmsg_reuse;
01828 
01829  if ( ! offload ) {
01830   for (int i=0; i<q_count; ++i) {
01831     delete [] q_list[i];
01832   }
01833   delete [] q_list;
01834   delete [] fz_arr;
01835  }
01836   delete [] f_arr;
01837   delete [] q_arr;
01838 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1801 of file ComputePme.C.

01801                                                  {
01802   if ( ! usePencils ) return;
01803   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01804 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3042 of file ComputePme.C.

03042                                        {
03043   ++recipEvirClients;
03044 }

void ComputePmeMgr::chargeGridReady ( Lattice & lattice, int sequence )

Definition at line 3559 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03559                                                                   {
03560 
03561 #ifdef NAMD_CUDA
03562  if ( offload ) {
03563   int errcount = 0;
03564   int q_stride = myGrid.K3+myGrid.order-1;
03565   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03566     f_arr[j] = ffz_host[j];
03567     if ( ffz_host[j] & ~1 ) ++errcount;
03568   }
03569   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03570  }
03571 #endif
03572   recipEvirCount = recipEvirClients;
03573   ungridForcesCount = pmeComputes.size();
03574 
03575   for (int j=0; j<myGrid.order-1; ++j) {
03576     fz_arr[j] |= fz_arr[myGrid.K3+j];
03577   }
03578 
03579   if ( usePencils ) {
03580     sendPencils(lattice,sequence);
03581   } else {
03582     sendData(lattice,sequence);
03583   }
03584 }
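
The j < order-1 loop folds the flags of the padding planes (indices K3 .. K3+order-2, written when B-spline charge spreading wraps past the periodic boundary) back onto the first planes of the grid before the data is sent. A minimal sketch of that fold on a single padded axis, with made-up sizes:

  #include <cstdio>

  int main() {
    const int K3 = 8, order = 4;          // illustrative sizes only
    char fz[K3 + order - 1] = {0};        // per-plane flags, padded by order-1

    fz[2] = 1;                            // a plane touched directly
    fz[K3 + 1] = 1;                       // a contribution that wrapped past K3

    // Same fold as chargeGridReady(): padding plane K3+j belongs to plane j.
    for (int j = 0; j < order - 1; ++j) fz[j] |= fz[K3 + j];

    for (int j = 0; j < K3; ++j) std::printf("%d", fz[j]);
    std::printf("\n");                    // plane 1 is now flagged as well
    return 0;
  }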

void ComputePmeMgr::chargeGridSubmitted ( Lattice & lattice, int sequence )

Definition at line 3495 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03495                                                                       {
03496   saved_lattice = &lattice;
03497   saved_sequence = sequence;
03498 
03499   // cudaDeviceSynchronize();  //  XXXX TESTING
03500   //int q_stride = myGrid.K3+myGrid.order-1;
03501   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03502   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03503   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03504   //  }
03505   //}
03506   //CmiLock(cuda_lock);
03507 
03508  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03509   double before = CmiWallTimer();
03510   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03511   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03512   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03513                         cudaMemcpyDeviceToHost, streams[stream]);
03514   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03515   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03516   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03517   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03518   //CmiUnlock(cuda_lock);
03519   // cudaDeviceSynchronize();  //  XXXX TESTING
03520   // cuda_errcheck("after memcpy grid to host");
03521 
03522   SimParameters *simParams = Node::Object()->simParameters;
03523   if ( ! simParams->useCUDA2 ) {
03524     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03525     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03526   }
03527 
03528   pmeProxy[master_pe].pollChargeGridReady();
03529  }
03530 }
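
The test on --(masterPmeMgr->chargeGridSubmittedCount) makes the last PE on the node to submit its charges the one that issues the single device-to-host copy, records end_charges, and clears the device grid for the next step. A hedged sketch of that last-one-in pattern in plain C++ (std::atomic and threads stand in for the node-shared counter and PEs; doCopy is a placeholder):

  #include <atomic>
  #include <cstdio>
  #include <thread>
  #include <vector>

  std::atomic<int> chargeGridSubmittedCount{0};

  void doCopy() { std::printf("last submitter starts the grid copy\n"); }  // placeholder

  void submit(int /*pe*/) {
    // ... per-PE charge spreading would happen here ...
    if (--chargeGridSubmittedCount == 0) doCopy();   // only the final submitter proceeds
  }

  int main() {
    const int pesOnNode = 4;                 // illustrative only
    chargeGridSubmittedCount = pesOnNode;    // armed once per step
    std::vector<std::thread> t;
    for (int pe = 0; pe < pesOnNode; ++pe) t.emplace_back(submit, pe);
    for (auto &th : t) th.join();
    return 0;
  }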

void ComputePmeMgr::copyPencils ( PmeGridMsg * msg ) 

Definition at line 3805 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03805                                                {
03806 
03807   int K1 = myGrid.K1;
03808   int K2 = myGrid.K2;
03809   int dim2 = myGrid.dim2;
03810   int dim3 = myGrid.dim3;
03811   int block1 = myGrid.block1;
03812   int block2 = myGrid.block2;
03813 
03814   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03815   int ib = msg->sourceNode / yBlocks;
03816   int jb = msg->sourceNode % yBlocks;
03817 
03818   int ibegin = ib*block1;
03819   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03820   int jbegin = jb*block2;
03821   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03822 
03823   int zlistlen = msg->zlistlen;
03824   int *zlist = msg->zlist;
03825   float *qmsg = msg->qgrid;
03826   int g;
03827   for ( g=0; g<numGrids; ++g ) {
03828     char *f = f_arr + g*fsize;
03829     float **q = q_arr + g*fsize;
03830     for ( int i=ibegin; i<iend; ++i ) {
03831      for ( int j=jbegin; j<jend; ++j ) {
03832       if( f[i*dim2+j] ) {
03833         f[i*dim2+j] = 0;
03834         for ( int k=0; k<zlistlen; ++k ) {
03835           q[i*dim2+j][zlist[k]] = *(qmsg++);
03836         }
03837         for (int h=0; h<myGrid.order-1; ++h) {
03838           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03839         }
03840       }
03841      }
03842     }
03843   }
03844 }
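
copyPencils() recovers the sending z-pencil's (x,y) block indices from the flat sourceNode value and clamps the final block at the grid edge. A small sketch of just that index arithmetic, with illustrative sizes:

  #include <algorithm>
  #include <cstdio>

  int main() {
    const int K1 = 65, K2 = 65;                   // illustrative grid sizes
    const int xBlocks = 4, yBlocks = 4;
    const int block1 = (K1 + xBlocks - 1) / xBlocks;
    const int block2 = (K2 + yBlocks - 1) / yBlocks;

    int sourceNode = 14;                          // flat index = ib*yBlocks + jb
    int ib = sourceNode / yBlocks;
    int jb = sourceNode % yBlocks;

    int ibegin = ib * block1, iend = std::min(ibegin + block1, K1);
    int jbegin = jb * block2, jend = std::min(jbegin + block2, K2);
    std::printf("pencil (%d,%d) covers i=[%d,%d) j=[%d,%d)\n",
                ib, jb, ibegin, iend, jbegin, jend);
    return 0;
  }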

void ComputePmeMgr::copyResults ( PmeGridMsg * msg ) 

Definition at line 3997 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03997                                                {
03998 
03999   int zdim = myGrid.dim3;
04000   int flen = msg->len;
04001   int fstart = msg->start;
04002   int zlistlen = msg->zlistlen;
04003   int *zlist = msg->zlist;
04004   float *qmsg = msg->qgrid;
04005   int g;
04006   for ( g=0; g<numGrids; ++g ) {
04007     char *f = msg->fgrid + g*flen;
04008     float **q = q_arr + fstart + g*fsize;
04009     for ( int i=0; i<flen; ++i ) {
04010       if ( f[i] ) {
04011         f[i] = 0;
04012         for ( int k=0; k<zlistlen; ++k ) {
04013           q[i][zlist[k]] = *(qmsg++);
04014         }
04015         for (int h=0; h<myGrid.order-1; ++h) {
04016           q[i][myGrid.K3+h] = q[i][h];
04017         }
04018       }
04019     }
04020   }
04021 }
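
The inner loops scatter the packed message back into the padded z-rows: zlist carries only the z indices that hold data, and the first order-1 entries of each row are then mirrored into the K3 .. K3+order-2 padding used for wrap-around interpolation. A single-row sketch with made-up sizes:

  #include <cstdio>

  int main() {
    const int K3 = 8, order = 4;
    float q[K3 + order - 1] = {0};                // one padded grid row

    const int zlistlen = 3;
    const int zlist[zlistlen] = {1, 4, 6};        // z planes present in the message
    const float qmsg[zlistlen] = {0.5f, 1.5f, 2.5f};

    for (int k = 0; k < zlistlen; ++k) q[zlist[k]] = qmsg[k];   // sparse scatter
    for (int h = 0; h < order - 1; ++h) q[K3 + h] = q[h];       // replicate wrap padding

    for (int k = 0; k < K3 + order - 1; ++k) std::printf("%.1f ", q[k]);
    std::printf("\n");
    return 0;
  }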

void ComputePmeMgr::cuda_submit_charges ( Lattice & lattice, int sequence )

Definition at line 3442 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03442                                                                       {
03443 
03444     int n = cuda_atoms_count;
03445     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03446     cuda_atoms_count = 0;
03447 
03448     const double before = CmiWallTimer();
03449     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03450                           cudaMemcpyHostToDevice, streams[stream]);
03451     const double after = CmiWallTimer();
03452 
03453     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03454 
03455     cuda_pme_charges(
03456       bspline_coeffs_dev,
03457       q_arr_dev, ffz_dev, ffz_dev + fsize,
03458       a_data_dev, n,
03459       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03460       streams[stream]);
03461     const double after2 = CmiWallTimer();
03462 
03463     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03464 
03465     masterPmeMgr->charges_time = before;
03466     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03467     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03468 }

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg * msg ) 

Definition at line 2027 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02027                                                    {
02028   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02029   int pe = transNodeInfo[myTransNode].pe_start;
02030   int npe = transNodeInfo[myTransNode].npe;
02031   CmiNodeLock lock = CmiCreateLock();
02032   int *count = new int; *count = npe;
02033   for (int i=0; i<npe; ++i, ++pe) {
02034     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02035     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02036     shmsg->msg = msg;
02037     shmsg->count = count;
02038     shmsg->lock = lock;
02039     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02040   }
02041 }
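
The shared message is handed to every PE in the node together with a heap-allocated counter and a lock, so that the last recvSharedTrans() to run can free it. A hedged plain-C++ sketch of that manual reference count (Msg and consume are stand-ins; a std::mutex replaces the CmiNodeLock):

  #include <cstdio>
  #include <mutex>
  #include <thread>
  #include <vector>

  struct Msg { int sequence = 7; };      // stand-in for PmeTransMsg

  // Each consumer uses the shared message; the last one deletes it.
  void consume(Msg *msg, int *count, std::mutex *lock) {
    std::printf("processing shared msg, sequence %d\n", msg->sequence);
    bool last;
    { std::lock_guard<std::mutex> g(*lock); last = (--*count == 0); }
    if (last) { delete msg; delete count; std::printf("last consumer freed msg\n"); }
  }

  int main() {
    const int npe = 3;                   // illustrative node size
    Msg *msg = new Msg;
    int *count = new int(npe);
    static std::mutex lock;              // outlives all consumers
    std::vector<std::thread> t;
    for (int i = 0; i < npe; ++i) t.emplace_back(consume, msg, count, &lock);
    for (auto &th : t) th.join();
    return 0;
  }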

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg * msg ) 

Definition at line 2283 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02283                                                        {
02284   int pe = gridNodeInfo[myGridNode].pe_start;
02285   int npe = gridNodeInfo[myGridNode].npe;
02286   CmiNodeLock lock = CmiCreateLock();
02287   int *count = new int; *count = npe;
02288   for (int i=0; i<npe; ++i, ++pe) {
02289     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02290     shmsg->msg = msg;
02291     shmsg->count = count;
02292     shmsg->lock = lock;
02293     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02294   }
02295 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1919 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by recvGrid().

01919                                   {
01920   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01921 
01922 #ifdef NAMD_FFTW
01923   for ( int g=0; g<numGrids; ++g ) {
01924 #ifdef NAMD_FFTW_3
01925     fftwf_execute(forward_plan_yz[g]);
01926 #else
01927     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01928         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01929 #endif
01930 
01931   }
01932 #endif
01933 
01934   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01935 }
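
The y-z transform runs in place on real data, which is why the grid's z rows are padded to dim3 = 2*(K3/2 + 1) floats (set in initialize() below). A minimal single-precision FFTW3 sketch of one in-place real-to-complex row, assuming FFTW3 is installed and linked with -lfftw3f:

  #include <fftw3.h>
  #include <cstdio>

  int main() {
    const int K3 = 16;                       // illustrative z size
    const int dim3 = 2 * (K3 / 2 + 1);       // padded length for in-place r2c

    float *row = (float *) fftwf_malloc(sizeof(float) * dim3);
    for (int k = 0; k < K3; ++k) row[k] = (k == 1) ? 1.0f : 0.0f;  // simple test input
    for (int k = K3; k < dim3; ++k) row[k] = 0.0f;

    // In-place transform: the complex output reuses the same padded buffer.
    fftwf_plan p = fftwf_plan_dft_r2c_1d(K3, row, (fftwf_complex *) row, FFTW_ESTIMATE);
    fftwf_execute(p);

    std::printf("first complex bin: %g + %gi\n", row[0], row[1]);
    fftwf_destroy_plan(p);
    fftwf_free(row);
    return 0;
  }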

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2095 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02095                                   {
02096   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02097 
02098 #if CMK_BLUEGENEL
02099   CmiNetworkProgressAfter (0);
02100 #endif
02101 
02102   int zdim = myGrid.dim3;
02103   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02104   int ny = localInfo[myTransPe].ny_after_transpose;
02105 
02106   for ( int g=0; g<numGrids; ++g ) {
02107     // finish forward FFT (x dimension)
02108 #ifdef NAMD_FFTW
02109 #ifdef NAMD_FFTW_3
02110     fftwf_execute(forward_plan_x[g]);
02111 #else
02112     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02113         ny * zdim / 2, 1, work, 1, 0);
02114 #endif
02115 #endif
02116   }
02117 
02118 #ifdef OPENATOM_VERSION
02119     if ( ! simParams -> openatomOn ) { 
02120 #endif // OPENATOM_VERSION
02121       gridCalc2R();
02122 #ifdef OPENATOM_VERSION
02123     } else {
02124       gridCalc2Moa();
02125     }
02126 #endif // OPENATOM_VERSION
02127 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2155 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02155                                    {
02156 
02157   int useCkLoop = 0;
02158 #if CMK_SMP && USE_CKLOOP
02159   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02160        && CkNumPes() >= 2 * numTransPes ) {
02161     useCkLoop = 1;
02162   }
02163 #endif
02164 
02165   int zdim = myGrid.dim3;
02166   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02167   int ny = localInfo[myTransPe].ny_after_transpose;
02168 
02169   for ( int g=0; g<numGrids; ++g ) {
02170     // reciprocal space portion of PME
02171     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02172     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02173                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02174     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02175 
02176     // start backward FFT (x dimension)
02177 
02178 #ifdef NAMD_FFTW
02179 #ifdef NAMD_FFTW_3
02180     fftwf_execute(backward_plan_x[g]);
02181 #else
02182     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02183         ny * zdim / 2, 1, work, 1, 0);
02184 #endif
02185 #endif
02186   }
02187   
02188   pmeProxyDir[CkMyPe()].sendUntrans();
02189 }

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2357 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02357                                   {
02358   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02359 
02360   // finish backward FFT
02361 #ifdef NAMD_FFTW
02362 
02363   for ( int g=0; g<numGrids; ++g ) {
02364 #ifdef NAMD_FFTW_3
02365     fftwf_execute(backward_plan_yz[g]);
02366 #else
02367     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02368         (fftw_complex *) (qgrid + qgrid_size * g),
02369         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02370 #endif
02371   }
02372 
02373 #endif
02374 
02375   pmeProxyDir[CkMyPe()].sendUngrid();
02376 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 853 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00853                                            {
00854   delete msg;
00855 
00856   localInfo = new LocalPmeInfo[CkNumPes()];
00857   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00858   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00859   gridPeMap = new int[CkNumPes()];
00860   transPeMap = new int[CkNumPes()];
00861   recipPeDest = new int[CkNumPes()];
00862   gridPeOrder = new int[CkNumPes()];
00863   gridNodeOrder = new int[CkNumNodes()];
00864   transNodeOrder = new int[CkNumNodes()];
00865 
00866   if (CkMyRank() == 0) {
00867     pencilPMEProcessors = new char [CkNumPes()];
00868     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00869   }
00870 
00871   SimParameters *simParams = Node::Object()->simParameters;
00872   PatchMap *patchMap = PatchMap::Object();
00873 
00874   offload = simParams->PMEOffload;
00875 #ifdef NAMD_CUDA
00876   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00877     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00878   }
00879   if ( offload ) {
00880     int dev;
00881     cudaGetDevice(&dev);
00882     cuda_errcheck("in cudaGetDevice");
00883     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00884     cudaDeviceProp deviceProp;
00885     cudaGetDeviceProperties(&deviceProp, dev);
00886     cuda_errcheck("in cudaGetDeviceProperties");
00887     if ( deviceProp.major < 2 )
00888       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00889   }
00890 #endif
00891 
00892   alchLambda = -1.;  // illegal value to catch if not updated
00893 
00894   alchOn = simParams->alchOn;
00895   alchFepOn = simParams->alchFepOn;
00896   alchThermIntOn = simParams->alchThermIntOn;
00897   alchDecouple = alchOn && simParams->alchDecouple;
00898   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
00899   if (alchOn) {
00900     numGrids = 2;
00901     if (alchDecouple) numGrids += 2;
00902     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00903   }
00904   else numGrids = 1;
00905   lesOn = simParams->lesOn;
00906   useBarrier = simParams->PMEBarrier;
00907   if ( lesOn ) {
00908     lesFactor = simParams->lesFactor;
00909     numGrids = lesFactor;
00910   }
00911   selfOn = 0;
00912   pairOn = simParams->pairInteractionOn;
00913   if ( pairOn ) {
00914     selfOn = simParams->pairInteractionSelf;
00915     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00916     numGrids = selfOn ? 1 : 3;
00917   }
00918 
00919   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00920   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00921   else {
00922     int nrps = simParams->PMEProcessors;
00923     if ( nrps <= 0 ) nrps = CkNumPes();
00924     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00925     int dimx = simParams->PMEGridSizeX;
00926     int dimy = simParams->PMEGridSizeY;
00927     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00928     if ( maxslabs > nrps ) maxslabs = nrps;
00929     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00930                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00931     if ( maxpencils > nrps ) maxpencils = nrps;
00932     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00933     else usePencils = 0;
00934   }
00935 
00936   if ( usePencils ) {
00937     int nrps = simParams->PMEProcessors;
00938     if ( nrps <= 0 ) nrps = CkNumPes();
00939     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00940     if ( simParams->PMEPencils > 1 &&
00941          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00942       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00943     } else {
00944       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00945                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00946       if ( nb2 > nrps ) nb2 = nrps;
00947       if ( nb2 < 1 ) nb2 = 1;
00948       int nb = (int) sqrt((float)nb2);
00949       if ( nb < 1 ) nb = 1;
00950       xBlocks = zBlocks = nb;
00951       yBlocks = nb2 / nb;
00952     }
00953 
00954     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00955     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00956     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00957 
00958     int dimx = simParams->PMEGridSizeX;
00959     int bx = 1 + ( dimx - 1 ) / xBlocks;
00960     xBlocks = 1 + ( dimx - 1 ) / bx;
00961 
00962     int dimy = simParams->PMEGridSizeY;
00963     int by = 1 + ( dimy - 1 ) / yBlocks;
00964     yBlocks = 1 + ( dimy - 1 ) / by;
00965 
00966     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00967     int bz = 1 + ( dimz - 1 ) / zBlocks;
00968     zBlocks = 1 + ( dimz - 1 ) / bz;
00969 
00970     if ( xBlocks * yBlocks > CkNumPes() ) {
00971       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00972     }
00973     if ( xBlocks * zBlocks > CkNumPes() ) {
00974       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00975     }
00976     if ( yBlocks * zBlocks > CkNumPes() ) {
00977       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00978     }
00979 
00980     if ( ! CkMyPe() ) {
00981       iout << iINFO << "PME using " << xBlocks << " x " <<
00982         yBlocks << " x " << zBlocks <<
00983         " pencil grid for FFT and reciprocal sum.\n" << endi;
00984     }
00985   } else { // usePencils
00986 
00987   {  // decide how many pes to use for reciprocal sum
00988 
00989     // rules based on work available
00990     int minslices = simParams->PMEMinSlices;
00991     int dimx = simParams->PMEGridSizeX;
00992     int nrpx = ( dimx + minslices - 1 ) / minslices;
00993     int dimy = simParams->PMEGridSizeY;
00994     int nrpy = ( dimy + minslices - 1 ) / minslices;
00995 
00996     // rules based on processors available
00997     int nrpp = CkNumPes();
00998     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00999     if ( nrpp < nrpx ) nrpx = nrpp;
01000     if ( nrpp < nrpy ) nrpy = nrpp;
01001 
01002     // user override
01003     int nrps = simParams->PMEProcessors;
01004     if ( nrps > CkNumPes() ) nrps = CkNumPes();
01005     if ( nrps > 0 ) nrpx = nrps;
01006     if ( nrps > 0 ) nrpy = nrps;
01007 
01008     // make sure there aren't any totally empty processors
01009     int bx = ( dimx + nrpx - 1 ) / nrpx;
01010     nrpx = ( dimx + bx - 1 ) / bx;
01011     int by = ( dimy + nrpy - 1 ) / nrpy;
01012     nrpy = ( dimy + by - 1 ) / by;
01013     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01014       NAMD_bug("Error in selecting number of PME processors.");
01015     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01016       NAMD_bug("Error in selecting number of PME processors.");
01017 
01018     numGridPes = nrpx;
01019     numTransPes = nrpy;
01020   }
01021   if ( ! CkMyPe() ) {
01022     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01023       " processors for FFT and reciprocal sum.\n" << endi;
01024   }
01025 
01026   int sum_npes = numTransPes + numGridPes;
01027   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01028 
01029 #if 0 // USE_TOPOMAP
01030   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01031   PatchMap * pmap = PatchMap::Object();
01032   
01033   int patch_pes = pmap->numNodesWithPatches();
01034   TopoManager tmgr;
01035   if(tmgr.hasMultipleProcsPerNode())
01036     patch_pes *= 2;
01037 
01038   bool done = false;
01039   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01040     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01041     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01042   }
01043   else 
01044     if(CkNumPes() > 2 *max_npes + patch_pes) {
01045       done = generateBGLORBPmePeList(transPeMap, max_npes);
01046       gridPeMap = transPeMap;
01047     }
01048 
01049   if (!done)
01050 #endif
01051     {
01052       //generatePmePeList(transPeMap, max_npes);
01053       //gridPeMap = transPeMap;
01054       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01055     }
01056   
01057   if ( ! CkMyPe() ) {
01058     iout << iINFO << "PME GRID LOCATIONS:";
01059     int i;
01060     for ( i=0; i<numGridPes && i<10; ++i ) {
01061       iout << " " << gridPeMap[i];
01062     }
01063     if ( i < numGridPes ) iout << " ...";
01064     iout << "\n" << endi;
01065     iout << iINFO << "PME TRANS LOCATIONS:";
01066     for ( i=0; i<numTransPes && i<10; ++i ) {
01067       iout << " " << transPeMap[i];
01068     }
01069     if ( i < numTransPes ) iout << " ...";
01070     iout << "\n" << endi;
01071   }
01072 
01073   // sort based on nodes and physical nodes
01074   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01075 
01076   myGridPe = -1;
01077   myGridNode = -1;
01078   int i = 0;
01079   int node = -1;
01080   int real_node = -1;
01081   for ( i=0; i<numGridPes; ++i ) {
01082     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01083     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01084     int real_node_i = CkNodeOf(gridPeMap[i]);
01085     if ( real_node_i == real_node ) {
01086       gridNodeInfo[node].npe += 1;
01087     } else {
01088       real_node = real_node_i;
01089       ++node;
01090       gridNodeInfo[node].real_node = real_node;
01091       gridNodeInfo[node].pe_start = i;
01092       gridNodeInfo[node].npe = 1;
01093     }
01094     if ( CkMyNode() == real_node_i ) myGridNode = node;
01095   }
01096   numGridNodes = node + 1;
01097   myTransPe = -1;
01098   myTransNode = -1;
01099   node = -1;
01100   real_node = -1;
01101   for ( i=0; i<numTransPes; ++i ) {
01102     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01103     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01104     int real_node_i = CkNodeOf(transPeMap[i]);
01105     if ( real_node_i == real_node ) {
01106       transNodeInfo[node].npe += 1;
01107     } else {
01108       real_node = real_node_i;
01109       ++node;
01110       transNodeInfo[node].real_node = real_node;
01111       transNodeInfo[node].pe_start = i;
01112       transNodeInfo[node].npe = 1;
01113     }
01114     if ( CkMyNode() == real_node_i ) myTransNode = node;
01115   }
01116   numTransNodes = node + 1;
01117 
01118   if ( ! CkMyPe() ) {
01119     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01120          << numTransNodes << " TRANS NODES\n" << endi;
01121   }
01122 
01123   { // generate random orderings for grid and trans messages
01124     int i;
01125     for ( i = 0; i < numGridPes; ++i ) {
01126       gridPeOrder[i] = i;
01127     }
01128     Random rand(CkMyPe());
01129     if ( myGridPe < 0 ) {
01130       rand.reorder(gridPeOrder,numGridPes);
01131     } else {  // self last
01132       gridPeOrder[myGridPe] = numGridPes-1;
01133       gridPeOrder[numGridPes-1] = myGridPe;
01134       rand.reorder(gridPeOrder,numGridPes-1);
01135     } 
01136     for ( i = 0; i < numGridNodes; ++i ) {
01137       gridNodeOrder[i] = i;
01138     }
01139     if ( myGridNode < 0 ) {
01140       rand.reorder(gridNodeOrder,numGridNodes);
01141     } else {  // self last
01142       gridNodeOrder[myGridNode] = numGridNodes-1;
01143       gridNodeOrder[numGridNodes-1] = myGridNode;
01144       rand.reorder(gridNodeOrder,numGridNodes-1);
01145     }
01146     for ( i = 0; i < numTransNodes; ++i ) {
01147       transNodeOrder[i] = i;
01148     }
01149     if ( myTransNode < 0 ) {
01150       rand.reorder(transNodeOrder,numTransNodes);
01151     } else {  // self last
01152       transNodeOrder[myTransNode] = numTransNodes-1;
01153       transNodeOrder[numTransNodes-1] = myTransNode;
01154       rand.reorder(transNodeOrder,numTransNodes-1);
01155     }
01156   }
01157   
01158   } // ! usePencils
01159 
01160   myGrid.K1 = simParams->PMEGridSizeX;
01161   myGrid.K2 = simParams->PMEGridSizeY;
01162   myGrid.K3 = simParams->PMEGridSizeZ;
01163   myGrid.order = simParams->PMEInterpOrder;
01164   myGrid.dim2 = myGrid.K2;
01165   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01166 
01167   if ( ! usePencils ) {
01168     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01169     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01170     myGrid.block3 = myGrid.dim3 / 2;  // complex
01171   }
01172 
01173   if ( usePencils ) {
01174     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01175     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01176     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01177 
01178 
01179       int pe = 0;
01180       int x,y,z;
01181 
01182                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01183                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01184                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01185       
01186                 // decide which pes to use by bit reversal and patch use
01187                 int i;
01188                 int ncpus = CkNumPes();
01189                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01190                 PatchMap *pmap = PatchMap::Object();
01191                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01192                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01193                         if ( ri ) { // keep 0 for special case
01194                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01195                                 else nopatches.add(ri);
01196                         }
01197                 }
01198 
01199 #if USE_RANDOM_TOPO
01200             Random rand(CkMyPe());
01201             int *tmp = new int[patches.size()];
01202             int nn = patches.size();
01203             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01204             rand.reorder(tmp, nn);
01205             patches.resize(0);
01206             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01207             delete [] tmp;
01208             tmp = new int[nopatches.size()];
01209             nn = nopatches.size();
01210             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01211             rand.reorder(tmp, nn);
01212             nopatches.resize(0);
01213             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01214             delete [] tmp;
01215 #endif
01216 
01217                 // only use zero if it eliminates overloading or has patches
01218                 int useZero = 0;
01219                 int npens = xBlocks*yBlocks;
01220                 if ( npens % ncpus == 0 ) useZero = 1;
01221                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01222                 npens += xBlocks*zBlocks;
01223                 if ( npens % ncpus == 0 ) useZero = 1;
01224                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01225                 npens += yBlocks*zBlocks;
01226                 if ( npens % ncpus == 0 ) useZero = 1;
01227                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01228 
01229                 // add nopatches then patches in reversed order
01230                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01231                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01232                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01233                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01234   
01235                 int npes = pmeprocs.size();
01236                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01237                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01238 #if !USE_RANDOM_TOPO
01239                 zprocs.sort();
01240 #endif
01241                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01242                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01243 #if !USE_RANDOM_TOPO
01244                 yprocs.sort();
01245 #endif
01246       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01247       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01248 #if !USE_RANDOM_TOPO
01249       xprocs.sort();
01250 #endif
01251 
01252 #if USE_TOPO_SFC
01253   CmiLock(tmgr_lock);
01254   //{
01255   TopoManager tmgr;
01256   int xdim = tmgr.getDimNX();
01257   int ydim = tmgr.getDimNY();
01258   int zdim = tmgr.getDimNZ();
01259   int xdim1 = find_level_grid(xdim);
01260   int ydim1 = find_level_grid(ydim);
01261   int zdim1 = find_level_grid(zdim);
01262   if(CkMyPe() == 0)
01263       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01264 
01265   vector<Coord> result;
01266   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01267   sort_sfc(xprocs, tmgr, result);
01268   sort_sfc(yprocs, tmgr, result);
01269   sort_sfc(zprocs, tmgr, result);
01270   //}
01271   CmiUnlock(tmgr_lock);
01272 #endif
01273 
01274 
01275                 if(CkMyPe() == 0){  
01276               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01277           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01278 #if USE_TOPO_SFC
01279               int x,y,z,t;
01280               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01281               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01282 #else
01283               iout << " " << zprocs[i];
01284 #endif
01285           }
01286           if ( i < zprocs.size() ) iout << " ...";
01287               iout << "\n" << endi;
01288                 }
01289 
01290     if (CkMyRank() == 0) {
01291       for (pe=0, x = 0; x < xBlocks; ++x)
01292         for (y = 0; y < yBlocks; ++y, ++pe ) {
01293           pencilPMEProcessors[zprocs[pe]] = 1;
01294         }
01295     }
01296      
01297                 if(CkMyPe() == 0){  
01298               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01299           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01300 #if USE_TOPO_SFC
01301               int x,y,z,t;
01302               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01303               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01304 #else
01305               iout << " " << yprocs[i];
01306 #endif
01307           }
01308           if ( i < yprocs.size() ) iout << " ...";
01309               iout << "\n" << endi;
01310                 }
01311 
01312     if (CkMyRank() == 0) {
01313       for (pe=0, z = 0; z < zBlocks; ++z )
01314         for (x = 0; x < xBlocks; ++x, ++pe ) {
01315           pencilPMEProcessors[yprocs[pe]] = 1;
01316         }
01317     }
01318     
01319                 if(CkMyPe() == 0){  
01320                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01321                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01322 #if USE_TOPO_SFC
01323                 int x,y,z,t;
01324                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01325                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01326 #else
01327                 iout << " " << xprocs[i];
01328 #endif
01329             }
01330                 if ( i < xprocs.size() ) iout << " ...";
01331                 iout << "\n" << endi;
01332                 }
01333 
01334     if (CkMyRank() == 0) {
01335       for (pe=0, y = 0; y < yBlocks; ++y )      
01336         for (z = 0; z < zBlocks; ++z, ++pe ) {
01337           pencilPMEProcessors[xprocs[pe]] = 1;
01338         }
01339     }
01340         
01341 
01342         // creating the pencil arrays
01343         if ( CkMyPe() == 0 ){
01344 #if !USE_RANDOM_TOPO
01345         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01346         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01347         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01348         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01349 #endif
01350 #if 1
01351         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01352         CProxy_PmePencilMap ym;
01353         if ( simParams->PMEPencilsYLayout )
01354           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01355         else
01356           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01357         CProxy_PmePencilMap xm;
01358         if ( simParams->PMEPencilsXLayout )
01359           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01360         else
01361           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01362         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01363         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01364         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01365         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01366         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01367         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01368         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01369         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01370         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01371         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01372 #else
01373         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01374         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01375         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01376 
01377                 for (pe=0, x = 0; x < xBlocks; ++x)
01378                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01379                                 zPencil(x,y,0).insert(zprocs[pe]);
01380                         }
01381         zPencil.doneInserting();
01382 
01383                 for (pe=0, x = 0; x < xBlocks; ++x)
01384                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01385                                 yPencil(x,0,z).insert(yprocs[pe]);
01386                         }
01387         yPencil.doneInserting();
01388 
01389 
01390                 for (pe=0, y = 0; y < yBlocks; ++y )    
01391                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01392                                 xPencil(0,y,z).insert(xprocs[pe]);
01393                         }
01394                 xPencil.doneInserting();     
01395 #endif
01396 
01397                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01398                 PmePencilInitMsgData msgdata;
01399                 msgdata.grid = myGrid;
01400                 msgdata.xBlocks = xBlocks;
01401                 msgdata.yBlocks = yBlocks;
01402                 msgdata.zBlocks = zBlocks;
01403                 msgdata.xPencil = xPencil;
01404                 msgdata.yPencil = yPencil;
01405                 msgdata.zPencil = zPencil;
01406                 msgdata.pmeProxy = pmeProxyDir;
01407         msgdata.pmeNodeProxy = pmeNodeProxy;
01408         msgdata.xm = xm;
01409         msgdata.ym = ym;
01410         msgdata.zm = zm;
01411                 xPencil.init(new PmePencilInitMsg(msgdata));
01412                 yPencil.init(new PmePencilInitMsg(msgdata));
01413                 zPencil.init(new PmePencilInitMsg(msgdata));
01414         }
01415 
01416     return;  // continue in initialize_pencils() at next startup stage
01417   }
01418 
01419 
01420   int pe;
01421   int nx = 0;
01422   for ( pe = 0; pe < numGridPes; ++pe ) {
01423     localInfo[pe].x_start = nx;
01424     nx += myGrid.block1;
01425     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01426     localInfo[pe].nx = nx - localInfo[pe].x_start;
01427   }
01428   int ny = 0;
01429   for ( pe = 0; pe < numTransPes; ++pe ) {
01430     localInfo[pe].y_start_after_transpose = ny;
01431     ny += myGrid.block2;
01432     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01433     localInfo[pe].ny_after_transpose =
01434                         ny - localInfo[pe].y_start_after_transpose;
01435   }
01436 
01437   {  // decide how many pes this node exchanges charges with
01438 
01439   PatchMap *patchMap = PatchMap::Object();
01440   Lattice lattice = simParams->lattice;
01441   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01442   BigReal cutoff = simParams->cutoff;
01443   BigReal patchdim = simParams->patchDimension;
01444   int numPatches = patchMap->numPatches();
01445   int numNodes = CkNumPes();
01446   int *source_flags = new int[numNodes];
01447   int node;
01448   for ( node=0; node<numNodes; ++node ) {
01449     source_flags[node] = 0;
01450     recipPeDest[node] = 0;
01451   }
01452 
01453   // // make sure that we don't get ahead of ourselves on this node
01454   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01455   //   source_flags[CkMyPe()] = 1;
01456   //   recipPeDest[myRecipPe] = 1;
01457   // }
01458 
01459   for ( int pid=0; pid < numPatches; ++pid ) {
01460     int pnode = patchMap->node(pid);
01461 #ifdef NAMD_CUDA
01462     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01463 #endif
01464     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01465     BigReal minx = patchMap->min_a(pid);
01466     BigReal maxx = patchMap->max_a(pid);
01467     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01468     // min1 (max1) is smallest (largest) grid line for this patch
01469     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01470     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01471     for ( int i=min1; i<=max1; ++i ) {
01472       int ix = i;
01473       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01474       while ( ix < 0 ) ix += myGrid.K1;
01475       // set source_flags[pnode] if this patch sends to our node
01476       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01477            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01478         source_flags[pnode] = 1;
01479       }
01480       // set dest_flags[] for node that our patch sends to
01481 #ifdef NAMD_CUDA
01482       if ( offload ) {
01483         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01484           recipPeDest[ix / myGrid.block1] = 1;
01485         }
01486       } else
01487 #endif
01488       if ( pnode == CkMyPe() ) {
01489         recipPeDest[ix / myGrid.block1] = 1;
01490       }
01491     }
01492   }
01493 
01494   int numSourcesSamePhysicalNode = 0;
01495   numSources = 0;
01496   numDestRecipPes = 0;
01497   for ( node=0; node<numNodes; ++node ) {
01498     if ( source_flags[node] ) ++numSources;
01499     if ( recipPeDest[node] ) ++numDestRecipPes;
01500     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01501   }
01502 
01503 #if 0
01504   if ( numSources ) {
01505     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01506             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01507     iout << iINFO << "PME " << CkMyPe() << " sources:";
01508     for ( node=0; node<numNodes; ++node ) {
01509       if ( source_flags[node] ) iout << " " << node;
01510     }
01511     iout << "\n" << endi;
01512   }
01513 #endif
01514 
01515   delete [] source_flags;
01516 
01517   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01518   //           CkMyPe(), numSources, numDestRecipPes);
01519 
01520   }  // decide how many pes this node exchanges charges with (end)
01521 
01522   ungrid_count = numDestRecipPes;
01523 
01524   sendTransBarrier_received = 0;
01525 
01526   if ( myGridPe < 0 && myTransPe < 0 ) return;
01527   // the following only for nodes doing reciprocal sum
01528 
01529   if ( myTransPe >= 0 ) {
01530     recipEvirPe = findRecipEvirPe();
01531     pmeProxy[recipEvirPe].addRecipEvirClient();
01532   }
01533 
01534   if ( myTransPe >= 0 ) {
01535       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01536       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01537       #ifdef OPENATOM_VERSION
01538       if ( simParams->openatomOn ) { 
01539         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01540         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01541       } else {
01542         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01543       }
01544       #else  // OPENATOM_VERSION
01545       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01546       #endif // OPENATOM_VERSION
01547   }
01548 
01549   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01550   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01551   if ( local_size < local_size_2 ) local_size = local_size_2;
01552   qgrid = new float[local_size*numGrids];
01553   if ( numGridPes > 1 || numTransPes > 1 ) {
01554     kgrid = new float[local_size*numGrids];
01555   } else {
01556     kgrid = qgrid;
01557   }
01558   qgrid_size = local_size;
01559 
01560   if ( myGridPe >= 0 ) {
01561   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01562   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01563   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01564   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01565   }
01566 
01567   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01568 #ifdef NAMD_FFTW
01569   CmiLock(fftw_plan_lock);
01570 #ifdef NAMD_FFTW_3
01571   work = new fftwf_complex[n[0]];
01572   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01573   if ( myGridPe >= 0 ) {
01574     forward_plan_yz=new fftwf_plan[numGrids];
01575     backward_plan_yz=new fftwf_plan[numGrids];
01576   }
01577   if ( myTransPe >= 0 ) {
01578     forward_plan_x=new fftwf_plan[numGrids];
01579     backward_plan_x=new fftwf_plan[numGrids];
01580   }
01581   /* need one plan per grid */
01582   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01583   if ( myGridPe >= 0 ) {
01584     for( int g=0; g<numGrids; g++)
01585       {
01586         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01587                                                      localInfo[myGridPe].nx,
01588                                                      qgrid + qgrid_size * g,
01589                                                      NULL,
01590                                                      1,
01591                                                      myGrid.dim2 * myGrid.dim3,
01592                                                      (fftwf_complex *) 
01593                                                      (qgrid + qgrid_size * g),
01594                                                      NULL,
01595                                                      1,
01596                                                      myGrid.dim2 * (myGrid.dim3/2),
01597                                                      fftwFlags);
01598       }
01599   }
01600   int zdim = myGrid.dim3;
01601   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01602   if ( ! CkMyPe() ) iout << " 2..." << endi;
01603   if ( myTransPe >= 0 ) {
01604     for( int g=0; g<numGrids; g++)
01605       {
01606 
01607         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01608                                                 (fftwf_complex *)
01609                                                 (kgrid+qgrid_size*g),
01610                                                 NULL,
01611                                                 xStride,
01612                                                 1,
01613                                                 (fftwf_complex *)
01614                                                 (kgrid+qgrid_size*g),
01615                                                 NULL,
01616                                                 xStride,
01617                                                 1,
01618                                                 FFTW_FORWARD,fftwFlags);
01619         
01620       }
01621   }
01622   if ( ! CkMyPe() ) iout << " 3..." << endi;
01623   if ( myTransPe >= 0 ) {
01624     for( int g=0; g<numGrids; g++)
01625       {
01626         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01627                                                  (fftwf_complex *)
01628                                                  (kgrid+qgrid_size*g),
01629                                                  NULL,
01630                                                  xStride,
01631                                                  1,
01632                                                  (fftwf_complex *)
01633                                                  (kgrid+qgrid_size*g),
01634                                                  NULL,
01635                                                  xStride,
01636                                                  1,
01637                                                  FFTW_BACKWARD, fftwFlags);
01638 
01639       }
01640   }
01641   if ( ! CkMyPe() ) iout << " 4..." << endi;
01642   if ( myGridPe >= 0 ) {
01643     for( int g=0; g<numGrids; g++)
01644       {
01645         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01646                                                       localInfo[myGridPe].nx,
01647                                                       (fftwf_complex *)
01648                                                       (qgrid + qgrid_size * g),
01649                                                       NULL,
01650                                                       1,
01651                                                       myGrid.dim2*(myGrid.dim3/2),
01652                                                       qgrid + qgrid_size * g,
01653                                                       NULL,
01654                                                       1,
01655                                                       myGrid.dim2 * myGrid.dim3,
01656                                                       fftwFlags);
01657       }
01658   }
01659   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01660 
01661 #else
01662   work = new fftw_complex[n[0]];
01663 
01664   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01665   if ( myGridPe >= 0 ) {
01666   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01669   }
01670   if ( ! CkMyPe() ) iout << " 2..." << endi;
01671   if ( myTransPe >= 0 ) {
01672       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01673         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01674         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01675         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01676   }
01677   if ( ! CkMyPe() ) iout << " 3..." << endi;
01678   if ( myTransPe >= 0 ) {
01679   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01680         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01681         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01682         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01683   }
01684   if ( ! CkMyPe() ) iout << " 4..." << endi;
01685   if ( myGridPe >= 0 ) {
01686   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01687         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01688         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01689   }
01690   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01691 #endif
01692   CmiUnlock(fftw_plan_lock);
01693 #else
01694   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01695 #endif
01696 
01697   if ( myGridPe >= 0 && numSources == 0 )
01698                 NAMD_bug("PME grid elements exist without sources.");
01699   grid_count = numSources;
01700   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01701   trans_count = numGridPes;
01702 }
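
The x-direction plans above use FFTW's advanced interface, where the howmany/stride/dist arguments describe how the 1-D transforms are interleaved in the transposed k-grid: howmany = xStride transforms, element stride xStride, distance 1 between consecutive transforms. Below is a minimal standalone sketch of that calling convention with made-up sizes and FFTW_ESTIMATE in place of fftwFlags; it is an illustration only, not code from ComputePme.C.

#include <fftw3.h>
#include <cstring>

int main() {
  const int n[1] = { 64 };   // transform length (plays the role of K1)
  const int howmany = 128;   // number of interleaved 1-D transforms (xStride above)
  fftwf_complex *data = (fftwf_complex *) fftwf_malloc(
      sizeof(fftwf_complex) * n[0] * howmany);
  std::memset(data, 0, sizeof(fftwf_complex) * n[0] * howmany);
  // Element i of transform j lives at data[i*howmany + j]:
  // stride = howmany between elements, dist = 1 between transforms,
  // the same layout the forward_plan_x / backward_plan_x calls describe.
  fftwf_plan p = fftwf_plan_many_dft(1, n, howmany,
                                     data, NULL, howmany, 1,
                                     data, NULL, howmany, 1,
                                     FFTW_FORWARD, FFTW_ESTIMATE);
  fftwf_execute(p);          // in-place, strided, batched transform
  fftwf_destroy_plan(p);
  fftwf_free(data);
  return 0;
}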

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2743 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02743                                         {
02744 
02745   noWorkCount = 0;
02746   doWorkCount = 0;
02747   ungridForcesCount = 0;
02748 
02749   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02750 
02751   SimParameters *simParams = Node::Object()->simParameters;
02752 
02753   strayChargeErrors = 0;
02754 
02755 #ifdef NAMD_CUDA
02756  PatchMap *patchMap = PatchMap::Object();
02757  int pe = master_pe = CkNodeFirst(CkMyNode());
02758  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02759     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02760     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02761     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02762     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02763     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02764         && pe != deviceCUDA->getMasterPe() ) {
02765       master_pe = pe;
02766     }
02767  }
02768  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02769    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02770  }
02771 
02772  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02773  bool cudaFirst = 1;
02774  if ( offload ) {
02775   CmiLock(cuda_lock);
02776   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02777  }
02778 
02779  if ( cudaFirst ) {
02780   nodePmeMgr->master_pe = master_pe;
02781   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02782  }
02783 #endif
02784 
02785   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02786   fsize = myGrid.K1 * myGrid.dim2;
02787   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02788 #ifdef NAMD_CUDA
02789  if ( ! offload )
02790 #endif
02791  {
02792   q_arr = new float*[fsize*numGrids];
02793   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02794   q_list = new float*[fsize*numGrids];
02795   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02796   q_count = 0;
02797  }
02798 
02799 #ifdef NAMD_CUDA
02800  if ( cudaFirst || ! offload ) {
02801 #endif
02802   f_arr = new char[fsize*numGrids];
02803   // memset to non-zero value has race condition on BlueGene/Q
02804   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02805   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02806 
02807   for ( int g=0; g<numGrids; ++g ) {
02808     char *f = f_arr + g*fsize;
02809     if ( usePencils ) {
02810       int K1 = myGrid.K1;
02811       int K2 = myGrid.K2;
02812       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02813       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02814       int dim2 = myGrid.dim2;
02815       for (int ap=0; ap<numPencilsActive; ++ap) {
02816         int ib = activePencils[ap].i;
02817         int jb = activePencils[ap].j;
02818         int ibegin = ib*block1;
02819         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02820         int jbegin = jb*block2;
02821         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02822         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02823         for ( int i=ibegin; i<iend; ++i ) {
02824           for ( int j=jbegin; j<jend; ++j ) {
02825             f[i*dim2+j] = 0;
02826           }
02827         }
02828       }
02829     } else {
02830       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02831       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02832       for (int pe=0; pe<numGridPes; pe++) {
02833         if ( ! recipPeDest[pe] ) continue;
02834         int start = pe * bsize;
02835         int len = bsize;
02836         if ( start >= qsize ) { start = 0; len = 0; }
02837         if ( start + len > qsize ) { len = qsize - start; }
02838         int zdim = myGrid.dim3;
02839         int fstart = start / zdim;
02840         int flen = len / zdim;
02841         memset(f + fstart, 0, flen*sizeof(char));
02842         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02843       }
02844     }
02845   }
02846 #ifdef NAMD_CUDA
02847  }
02848  if ( offload ) {
02849  cudaSetDevice(deviceCUDA->getDeviceID());
02850  if ( cudaFirst ) {
02851 
02852   int f_alloc_count = 0;
02853   for ( int n=fsize, i=0; i<n; ++i ) {
02854     if ( f_arr[i] == 0 ) {
02855       ++f_alloc_count;
02856     }
02857   }
02858   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02859 
02860   q_arr = new float*[fsize*numGrids];
02861   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02862 
02863   float **q_arr_dev_host = new float*[fsize];
02864   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02865 
02866   float **v_arr_dev_host = new float*[fsize];
02867   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02868 
02869   int q_stride = myGrid.K3+myGrid.order-1;
02870   q_data_size = f_alloc_count * q_stride * sizeof(float);
02871   ffz_size = (fsize + q_stride) * sizeof(int);
02872 
02873   // tack ffz onto end of q_data to allow merged transfer
02874   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02875   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02876   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02877   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02878   cudaMalloc((void**) &v_data_dev, q_data_size);
02879   cuda_errcheck("malloc grid data for pme");
02880   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02881   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02882   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02883   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02884   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02885 
02886   f_alloc_count = 0;
02887   for ( int n=fsize, i=0; i<n; ++i ) {
02888     if ( f_arr[i] == 0 ) {
02889       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02890       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02891       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02892       ++f_alloc_count;
02893     } else {
02894       q_arr[i] = 0;
02895       q_arr_dev_host[i] = 0;
02896       v_arr_dev_host[i] = 0;
02897     }
02898   }
02899 
02900   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02901   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02902   delete [] q_arr_dev_host;
02903   delete [] v_arr_dev_host;
02904   delete [] f_arr;
02905   f_arr = new char[fsize + q_stride];
02906   fz_arr = f_arr + fsize;
02907   memset(f_arr, 0, fsize + q_stride);
02908   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02909 
02910   cuda_errcheck("initialize grid data for pme");
02911 
02912   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02913   cuda_errcheck("initialize bspline coefficients for pme");
02914 
02915 #define XCOPY(X) masterPmeMgr->X = X;
02916   XCOPY(bspline_coeffs_dev)
02917   XCOPY(bspline_dcoeffs_dev)
02918   XCOPY(q_arr)
02919   XCOPY(q_arr_dev)
02920   XCOPY(v_arr_dev)
02921   XCOPY(q_data_size)
02922   XCOPY(q_data_host)
02923   XCOPY(q_data_dev)
02924   XCOPY(v_data_dev)
02925   XCOPY(ffz_size)
02926   XCOPY(ffz_host)
02927   XCOPY(ffz_dev)
02928   XCOPY(f_arr)
02929   XCOPY(fz_arr)
02930 #undef XCOPY
02931   //CkPrintf("pe %d init first\n", CkMyPe());
02932  } else { // cudaFirst
02933   //CkPrintf("pe %d init later\n", CkMyPe());
02934 #define XCOPY(X) X = masterPmeMgr->X;
02935   XCOPY(bspline_coeffs_dev)
02936   XCOPY(bspline_dcoeffs_dev)
02937   XCOPY(q_arr)
02938   XCOPY(q_arr_dev)
02939   XCOPY(v_arr_dev)
02940   XCOPY(q_data_size)
02941   XCOPY(q_data_host)
02942   XCOPY(q_data_dev)
02943   XCOPY(v_data_dev)
02944   XCOPY(ffz_size)
02945   XCOPY(ffz_host)
02946   XCOPY(ffz_dev)
02947   XCOPY(f_arr)
02948   XCOPY(fz_arr)
02949 #undef XCOPY
02950  } // cudaFirst
02951   CmiUnlock(cuda_lock);
02952  } else // offload
02953 #endif // NAMD_CUDA
02954  {
02955   fz_arr = new char[myGrid.K3+myGrid.order-1];
02956  }
02957 
02958 #if 0 && USE_PERSISTENT
02959   recvGrid_handle = NULL;
02960 #endif
02961 }
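
The pencil and slab bookkeeping in initialize_computes() repeatedly uses the same ceiling-division decomposition, block = (K + nBlocks - 1) / nBlocks, with the final block clamped to the grid size. A small self-contained sketch of that arithmetic (illustrative only, not part of ComputePme.C):

#include <algorithm>
#include <cstdio>

// Partition K grid lines into nBlocks roughly equal blocks, the way the
// f_arr / pencil ranges are computed above.
void print_blocks(int K, int nBlocks) {
  const int block = (K + nBlocks - 1) / nBlocks;   // ceiling division
  for (int b = 0; b < nBlocks; ++b) {
    const int begin = b * block;
    const int end = std::min(begin + block, K);    // clamp the final block
    if (begin >= end) continue;                    // empty trailing block
    std::printf("block %d: [%d,%d)\n", b, begin, end);
  }
}

int main() { print_blocks(108, 8); return 0; }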

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1706 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01706                                                    {
01707   delete msg;
01708   if ( ! usePencils ) return;
01709 
01710   SimParameters *simParams = Node::Object()->simParameters;
01711 
01712   PatchMap *patchMap = PatchMap::Object();
01713   Lattice lattice = simParams->lattice;
01714   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01715   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01716   BigReal cutoff = simParams->cutoff;
01717   BigReal patchdim = simParams->patchDimension;
01718   int numPatches = patchMap->numPatches();
01719 
01720   pencilActive = new char[xBlocks*yBlocks];
01721   for ( int i=0; i<xBlocks; ++i ) {
01722     for ( int j=0; j<yBlocks; ++j ) {
01723       pencilActive[i*yBlocks+j] = 0;
01724     }
01725   }
01726 
01727   for ( int pid=0; pid < numPatches; ++pid ) {
01728     int pnode = patchMap->node(pid);
01729 #ifdef NAMD_CUDA
01730     if ( offload ) {
01731       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01732     } else
01733 #endif
01734     if ( pnode != CkMyPe() ) continue;
01735 
01736     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01737     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01738 
01739     BigReal minx = patchMap->min_a(pid);
01740     BigReal maxx = patchMap->max_a(pid);
01741     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01742     // min1 (max1) is smallest (largest) grid line for this patch
01743     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01744     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01745 
01746     BigReal miny = patchMap->min_b(pid);
01747     BigReal maxy = patchMap->max_b(pid);
01748     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01749     // min2 (max2) is smallest (largest) grid line for this patch
01750     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01751     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01752 
01753     for ( int i=min1; i<=max1; ++i ) {
01754       int ix = i;
01755       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01756       while ( ix < 0 ) ix += myGrid.K1;
01757       for ( int j=min2; j<=max2; ++j ) {
01758         int jy = j;
01759         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01760         while ( jy < 0 ) jy += myGrid.K2;
01761         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01762       }
01763     }
01764   }
01765 
01766   numPencilsActive = 0;
01767   for ( int i=0; i<xBlocks; ++i ) {
01768     for ( int j=0; j<yBlocks; ++j ) {
01769       if ( pencilActive[i*yBlocks+j] ) {
01770         ++numPencilsActive;
01771 #ifdef NAMD_CUDA
01772         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01773 #endif
01774         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01775       }
01776     }
01777   }
01778   activePencils = new ijpair[numPencilsActive];
01779   numPencilsActive = 0;
01780   for ( int i=0; i<xBlocks; ++i ) {
01781     for ( int j=0; j<yBlocks; ++j ) {
01782       if ( pencilActive[i*yBlocks+j] ) {
01783         activePencils[numPencilsActive++] = ijpair(i,j);
01784       }
01785     }
01786   }
01787   if ( simParams->PMESendOrder ) {
01788     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01789   } else {
01790     Random rand(CkMyPe());
01791     rand.reorder(activePencils,numPencilsActive);
01792   }
01793   //if ( numPencilsActive ) {
01794   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01795   //}
01796 
01797   ungrid_count = numPencilsActive;
01798 }
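
initialize_pencils() marks a pencil active when any grid line touched by a local patch, after periodic wrapping, falls inside that pencil's (x,y) block. A simplified sketch of the wrap-and-mark step with hypothetical values (block1/block2 play the same role as myGrid.block1/block2):

#include <vector>
#include <cstdio>

// Mark pencils covered by grid lines [min1,max1] x [min2,max2], wrapping
// indices periodically into [0,K) as the loop above does.
int main() {
  const int K1 = 64, K2 = 64, xBlocks = 4, yBlocks = 4;
  const int block1 = (K1 + xBlocks - 1) / xBlocks;
  const int block2 = (K2 + yBlocks - 1) / yBlocks;
  std::vector<char> pencilActive(xBlocks * yBlocks, 0);

  const int min1 = -3, max1 = 10, min2 = 60, max2 = 70;  // may exceed [0,K)
  for (int i = min1; i <= max1; ++i) {
    int ix = i;
    while (ix >= K1) ix -= K1;   // periodic wrap, as in the code above
    while (ix < 0)  ix += K1;
    for (int j = min2; j <= max2; ++j) {
      int jy = j;
      while (jy >= K2) jy -= K2;
      while (jy < 0)  jy += K2;
      pencilActive[(ix / block1) * yBlocks + (jy / block2)] = 1;
    }
  }
  for (int p = 0; p < xBlocks * yBlocks; ++p)
    if (pencilActive[p])
      std::printf("pencil (%d,%d) active\n", p / yBlocks, p % yBlocks);
  return 0;
}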

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3546 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03546                                         {
03547 #ifdef NAMD_CUDA
03548   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03549   CUDA_POLL(cuda_check_pme_charges,this);
03550 #else
03551   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03552 #endif
03553 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2656 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02656                                     {
02657 #ifdef NAMD_CUDA
02658   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02659   CUDA_POLL(cuda_check_pme_forces,this);
02660 #else
02661   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02662 #endif
02663 }
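
Both poll routines re-arm a callback that checks whether asynchronous GPU work has finished instead of blocking the PE. Reduced to plain CUDA runtime calls, and leaving out the Charm++ CUDA_POLL / CcdCallBacksReset machinery itself, the idea looks roughly like this (a sketch, not the actual cuda_check_pme_forces implementation):

#include <cuda_runtime.h>

// Non-blocking completion check: callers re-schedule this test
// (as CUDA_POLL does) rather than waiting on the event.
bool forces_ready(cudaEvent_t done) {
  cudaError_t err = cudaEventQuery(done);
  if (err == cudaSuccess) return true;         // GPU work finished
  if (err == cudaErrorNotReady) return false;  // still running: poll again later
  // any other return code is a real error; the real code calls cuda_errcheck()
  return false;
}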

void ComputePmeMgr::procTrans ( PmeTransMsg *  ) 

Definition at line 2061 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02061                                               {
02062   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02063   if ( trans_count == numGridPes ) {
02064     lattice = msg->lattice;
02065     grid_sequence = msg->sequence;
02066   }
02067 
02068  if ( msg->nx ) {
02069   int zdim = myGrid.dim3;
02070   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02071   int first_pe = nodeInfo.pe_start;
02072   int last_pe = first_pe+nodeInfo.npe-1;
02073   int y_skip = localInfo[myTransPe].y_start_after_transpose
02074              - localInfo[first_pe].y_start_after_transpose;
02075   int ny_msg = localInfo[last_pe].y_start_after_transpose
02076              + localInfo[last_pe].ny_after_transpose
02077              - localInfo[first_pe].y_start_after_transpose;
02078   int ny = localInfo[myTransPe].ny_after_transpose;
02079   int x_start = msg->x_start;
02080   int nx = msg->nx;
02081   for ( int g=0; g<numGrids; ++g ) {
02082     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02083         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02084         nx*ny*zdim*sizeof(float));
02085   }
02086  }
02087 
02088   --trans_count;
02089 
02090   if ( trans_count == 0 ) {
02091     pmeProxyDir[CkMyPe()].gridCalc2();
02092   }
02093 }
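
procTrans() shows the counting pattern used throughout this manager: a counter is preset to the number of expected contributions, each arrival decrements it, and the PE advances to the next phase exactly once, when the counter reaches zero. A stripped-down sketch of the idea (no Charm++ messaging, single-threaded, illustrative only):

#include <cstdio>

// Minimal stand-in for the trans_count / grid_count style bookkeeping.
struct PhaseCounter {
  int remaining;
  explicit PhaseCounter(int expected) : remaining(expected) {}
  // Call once per received contribution; returns true on the last one.
  bool arrive() { return --remaining == 0; }
};

int main() {
  PhaseCounter trans(3);           // e.g. trans_count = numGridPes
  for (int src = 0; src < 3; ++src)
    if (trans.arrive())
      std::printf("all contributions in: start gridCalc2()\n");
  return 0;
}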

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  ) 

Definition at line 2315 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02315                                                   {
02316   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02317 
02318 #if CMK_BLUEGENEL
02319   CmiNetworkProgressAfter (0);
02320 #endif
02321 
02322   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02323   int first_pe = nodeInfo.pe_start;
02324   int g;
02325 
02326  if ( msg->ny ) {
02327   int zdim = myGrid.dim3;
02328   int last_pe = first_pe+nodeInfo.npe-1;
02329   int x_skip = localInfo[myGridPe].x_start
02330              - localInfo[first_pe].x_start;
02331   int nx_msg = localInfo[last_pe].x_start
02332              + localInfo[last_pe].nx
02333              - localInfo[first_pe].x_start;
02334   int nx = localInfo[myGridPe].nx;
02335   int y_start = msg->y_start;
02336   int ny = msg->ny;
02337   int slicelen = myGrid.K2 * zdim;
02338   int cpylen = ny * zdim;
02339   for ( g=0; g<numGrids; ++g ) {
02340     float *q = qgrid + qgrid_size * g + y_start * zdim;
02341     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02342     for ( int x = 0; x < nx; ++x ) {
02343       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02344       q += slicelen;
02345       qmsg += cpylen;
02346     }
02347   }
02348  }
02349 
02350   --untrans_count;
02351 
02352   if ( untrans_count == 0 ) {
02353     pmeProxyDir[CkMyPe()].gridCalc3();
02354   }
02355 }

void ComputePmeMgr::recvAck ( PmeAckMsg *  ) 

Definition at line 2458 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02458                                           {
02459   if ( msg ) delete msg;
02460 #ifdef NAMD_CUDA
02461   if ( offload ) {
02462     CmiLock(cuda_lock);
02463     if ( ungrid_count == 0 ) {
02464       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02465     }
02466     int uc = --ungrid_count;
02467     CmiUnlock(cuda_lock);
02468 
02469     if ( uc == 0 ) {
02470       pmeProxyDir[master_pe].ungridCalc();
02471     }
02472     return;
02473   }
02474 #endif
02475   --ungrid_count;
02476 
02477   if ( ungrid_count == 0 ) {
02478     pmeProxyDir[CkMyPe()].ungridCalc();
02479   }
02480 }

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 791 of file ComputePme.C.

00792                                                                        {
00793   xPencil = x;  yPencil = y;  zPencil = z;
00794   
00795     if(CmiMyRank()==0)
00796     {
00797       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00798       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00799       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00800     }
00801 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3555 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03555                                         {
03556   chargeGridReady(*saved_lattice,saved_sequence);
03557 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  ) 

Definition at line 1840 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01840                                             {
01841   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01842   if ( grid_count == 0 ) {
01843     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01844   }
01845   if ( grid_count == numSources ) {
01846     lattice = msg->lattice;
01847     grid_sequence = msg->sequence;
01848   }
01849 
01850   int zdim = myGrid.dim3;
01851   int zlistlen = msg->zlistlen;
01852   int *zlist = msg->zlist;
01853   float *qmsg = msg->qgrid;
01854   for ( int g=0; g<numGrids; ++g ) {
01855     char *f = msg->fgrid + fgrid_len * g;
01856     float *q = qgrid + qgrid_size * g;
01857     for ( int i=0; i<fgrid_len; ++i ) {
01858       if ( f[i] ) {
01859         for ( int k=0; k<zlistlen; ++k ) {
01860           q[zlist[k]] += *(qmsg++);
01861         }
01862       }
01863       q += zdim;
01864     }
01865   }
01866 
01867   gridmsg_reuse[numSources-grid_count] = msg;
01868   --grid_count;
01869 
01870   if ( grid_count == 0 ) {
01871     pmeProxyDir[CkMyPe()].gridCalc1();
01872     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01873   }
01874 }
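
recvGrid() unpacks a sparse message: only (x,y) columns whose fgrid flag is set carry data, and within such a column only the z indices listed in zlist are present. A compact sketch of that unpacking with hypothetical sizes (not ComputePme.C code):

#include <vector>
#include <cstdio>

// Accumulate a sparsely packed grid message into dense zdim-long columns,
// mirroring the recvGrid() loop above.
int main() {
  const int ncols = 4, zdim = 8;
  std::vector<float> qgrid(ncols * zdim, 0.f);   // destination grid slab
  std::vector<char>  fgrid = {1, 0, 1, 0};       // which columns carry data
  std::vector<int>   zlist = {1, 4, 6};          // populated z indices
  std::vector<float> qmsg  = {1.f, 2.f, 3.f,     // packed values, column 0
                              4.f, 5.f, 6.f};    // packed values, column 2

  const float *p = qmsg.data();
  for (int i = 0; i < ncols; ++i) {
    if (fgrid[i])
      for (size_t k = 0; k < zlist.size(); ++k)
        qgrid[i * zdim + zlist[k]] += *(p++);    // scatter-add packed values
  }
  std::printf("qgrid[2*zdim+4] = %g\n", qgrid[2 * zdim + 4]);  // prints 5
  return 0;
}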

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  ) 

Definition at line 3046 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03046                                                  {
03047   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03048   for ( int g=0; g<numGrids; ++g ) {
03049     evir[g] += msg->evir[g];
03050   }
03051   delete msg;
03052   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03053   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03054 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  ) 

Definition at line 2043 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02043                                                           {
02044   procTrans(msg->msg);
02045   CmiLock(msg->lock);
02046   int count = --(*msg->count);
02047   CmiUnlock(msg->lock);
02048   if ( count == 0 ) {
02049     CmiDestroyLock(msg->lock);
02050     delete msg->count;
02051     delete msg->msg;
02052   }
02053   delete msg;
02054 }
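
recvSharedTrans() lets several ranks on a node consume one PmeTransMsg: each consumer processes the payload, decrements a shared count under a lock, and the last one frees the lock, the counter, and the message. The same ownership pattern, sketched with standard C++ primitives in place of CmiNodeLock (illustration only):

#include <mutex>
#include <cstdio>

struct Payload { int data; };

// Shared wrapper: 'count' consumers each call consume(); the last caller
// cleans up the shared state, as in recvSharedTrans().
struct SharedMsg {
  Payload *msg;
  int *count;
  std::mutex *lock;
};

void consume(SharedMsg *s) {
  std::printf("processing payload %d\n", s->msg->data);  // procTrans() stand-in
  s->lock->lock();
  int remaining = --(*s->count);
  s->lock->unlock();
  if (remaining == 0) {          // last consumer frees the shared pieces
    delete s->lock;
    delete s->count;
    delete s->msg;
  }
  delete s;                      // each consumer owns its own wrapper
}

int main() {
  auto *lock = new std::mutex;
  auto *count = new int(2);
  auto *msg = new Payload{42};
  consume(new SharedMsg{msg, count, lock});
  consume(new SharedMsg{msg, count, lock});
  return 0;
}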

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  ) 

Definition at line 2297 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02297                                                               {
02298   procUntrans(msg->msg);
02299   CmiLock(msg->lock);
02300   int count = --(*msg->count);
02301   CmiUnlock(msg->lock);
02302   if ( count == 0 ) {
02303     CmiDestroyLock(msg->lock);
02304     delete msg->count;
02305     delete msg->msg;
02306   }
02307   delete msg;
02308 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  ) 

Definition at line 2056 of file ComputePme.C.

References procTrans().

02056                                               {
02057   procTrans(msg);
02058   delete msg;
02059 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  ) 

Definition at line 2443 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02443                                               {
02444   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02445 #ifdef NAMD_CUDA
02446   if ( ! offload )  // would need lock
02447 #endif
02448   if ( ungrid_count == 0 ) {
02449     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02450   }
02451 
02452   if ( usePencils ) copyPencils(msg);
02453   else copyResults(msg);
02454   delete msg;
02455   recvAck(0);
02456 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  ) 

Definition at line 2310 of file ComputePme.C.

References procUntrans().

02310                                                   {
02311   procUntrans(msg);
02312   delete msg;
02313 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3532 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03532                                         {
03533   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03534     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03535     int cs = mgr->pmeComputes.size();
03536     if ( cs ) {
03537       mgr->ungridForcesCount = cs;
03538       mgr->recipEvirCount = mgr->recipEvirClients;
03539       masterPmeMgr->chargeGridSubmittedCount++;
03540     }
03541   }
03542   pmeProxy[master_pe].recvChargeGridReady();
03543 }

void ComputePmeMgr::sendData ( Lattice & ,
int  sequence 
)

Definition at line 3969 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03969                                                            {
03970 
03971   sendDataHelper_lattice = &lattice;
03972   sendDataHelper_sequence = sequence;
03973   sendDataHelper_sourcepe = CkMyPe();
03974   sendDataHelper_errors = strayChargeErrors;
03975   strayChargeErrors = 0;
03976 
03977 #ifdef NAMD_CUDA
03978   if ( offload ) {
03979     for ( int i=0; i < numGridPes; ++i ) {
03980       int pe = gridPeOrder[i];  // different order
03981       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03982 #if CMK_MULTICORE
03983       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03984       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03985 #else
03986       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03987 #endif
03988     }
03989   } else
03990 #endif
03991   {
03992     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03993   }
03994  
03995 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3956 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03956                                            {
03957   nodePmeMgr->sendDataHelper(iter);
03958 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3847 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03847                                                                                                               {
03848 
03849   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03850 
03851   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03852 
03853   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03854   for (int j=first; j<=last; j++) {
03855     int pe = gridPeOrder[j];  // different order
03856     if ( ! recipPeDest[pe] && ! errors ) continue;
03857     int start = pe * bsize;
03858     int len = bsize;
03859     if ( start >= qsize ) { start = 0; len = 0; }
03860     if ( start + len > qsize ) { len = qsize - start; }
03861     int zdim = myGrid.dim3;
03862     int fstart = start / zdim;
03863     int flen = len / zdim;
03864     int fcount = 0;
03865     int i;
03866 
03867     int g;
03868     for ( g=0; g<numGrids; ++g ) {
03869       char *f = f_arr + fstart + g*fsize;
03870 #ifdef NAMD_CUDA
03871      if ( offload ) {
03872       int errcount = 0;
03873       for ( i=0; i<flen; ++i ) {
03874         f[i] = ffz_host[fstart+i];
03875         fcount += f[i];
03876         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03877       }
03878       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03879      } else
03880 #endif
03881       for ( i=0; i<flen; ++i ) {
03882         fcount += f[i];
03883       }
03884       if ( ! recipPeDest[pe] ) {
03885         int errfound = 0;
03886         for ( i=0; i<flen; ++i ) {
03887           if ( f[i] == 3 ) {
03888             errfound = 1;
03889             break;
03890           }
03891         }
03892         if ( errfound ) {
03893           iout << iERROR << "Stray PME grid charges detected: "
03894                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03895           int iz = -1;
03896           for ( i=0; i<flen; ++i ) {
03897             if ( f[i] == 3 ) {
03898               f[i] = 2;
03899               int jz = (i+fstart)/myGrid.K2;
03900               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03901             }
03902           }
03903           iout << "\n" << endi;
03904         }
03905       }
03906     }
03907 
03908 #ifdef NETWORK_PROGRESS
03909     CmiNetworkProgress();
03910 #endif
03911 
03912     if ( ! recipPeDest[pe] ) continue;
03913 
03914     int zlistlen = 0;
03915     for ( i=0; i<myGrid.K3; ++i ) {
03916       if ( fz_arr[i] ) ++zlistlen;
03917     }
03918 
03919     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03920                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03921 
03922     msg->sourceNode = sourcepe;
03923     msg->lattice = lattice;
03924     msg->start = fstart;
03925     msg->len = flen;
03926     msg->zlistlen = zlistlen;
03927     int *zlist = msg->zlist;
03928     zlistlen = 0;
03929     for ( i=0; i<myGrid.K3; ++i ) {
03930       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03931     }
03932     float *qmsg = msg->qgrid;
03933     for ( g=0; g<numGrids; ++g ) {
03934       char *f = f_arr + fstart + g*fsize;
03935       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03936       float **q = q_arr + fstart + g*fsize;
03937       for ( i=0; i<flen; ++i ) {
03938         if ( f[i] ) {
03939           for (int h=0; h<myGrid.order-1; ++h) {
03940             q[i][h] += q[i][myGrid.K3+h];
03941           }
03942           for ( int k=0; k<zlistlen; ++k ) {
03943             *(qmsg++) = q[i][zlist[k]];
03944           }
03945         }
03946       }
03947     }
03948 
03949     msg->sequence = compute_sequence;
03950     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03951     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03952   }
03953 
03954 }
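
Before packing, sendDataPart() folds the periodic overhang of each charge column back onto its start: charges spread by the order-p B-spline past z = K3 land in the extra order-1 slots at the end of the padded column and are added to its first order-1 entries. A tiny sketch of that fold with hypothetical sizes:

#include <vector>
#include <cstdio>

// Fold the (order-1) wrap-around entries of a padded charge column back
// onto its beginning, as done just before packing in sendDataPart().
int main() {
  const int K3 = 8, order = 4;
  std::vector<float> q(K3 + order - 1, 0.f);   // padded column: K3 + order - 1
  q[8] = 1.f; q[9] = 2.f; q[10] = 3.f;         // charge spread past the boundary

  for (int h = 0; h < order - 1; ++h)
    q[h] += q[K3 + h];                         // periodic wrap: add overhang to start

  std::printf("q[0..2] = %g %g %g\n", q[0], q[1], q[2]);  // 1 2 3
  return 0;
}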

void ComputePmeMgr::sendPencils ( Lattice & ,
int  sequence 
)

Definition at line 3742 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03742                                                               {
03743 
03744   sendDataHelper_lattice = &lattice;
03745   sendDataHelper_sequence = sequence;
03746   sendDataHelper_sourcepe = CkMyPe();
03747 
03748 #ifdef NAMD_CUDA
03749   if ( offload ) {
03750     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03751 #if CMK_MULTICORE
03752       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03753       int ib = activePencils[ap].i;
03754       int jb = activePencils[ap].j;
03755       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03756       pmeProxy[destproc].sendPencilsHelper(ap);
03757 #else
03758       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03759 #endif
03760     }
03761   } else
03762 #endif
03763   {
03764     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03765   }
03766 
03767   if ( strayChargeErrors ) {
03768    strayChargeErrors = 0;
03769    iout << iERROR << "Stray PME grid charges detected: "
03770         << CkMyPe() << " sending to (x,y)";
03771    int K1 = myGrid.K1;
03772    int K2 = myGrid.K2;
03773    int dim2 = myGrid.dim2;
03774    int block1 = myGrid.block1;
03775    int block2 = myGrid.block2;
03776    for (int ib=0; ib<xBlocks; ++ib) {
03777     for (int jb=0; jb<yBlocks; ++jb) {
03778      int ibegin = ib*block1;
03779      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03780      int jbegin = jb*block2;
03781      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03782      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03783 
03784      for ( int g=0; g<numGrids; ++g ) {
03785        char *f = f_arr + g*fsize;
03786        if ( ! pencilActive[ib*yBlocks+jb] ) {
03787            for ( int i=ibegin; i<iend; ++i ) {
03788             for ( int j=jbegin; j<jend; ++j ) {
03789              if ( f[i*dim2+j] == 3 ) {
03790                f[i*dim2+j] = 2;
03791                iout << " (" << i << "," << j << ")";
03792              }
03793             }
03794            }
03795        }
03796      }
03797     }
03798    }
03799    iout << "\n" << endi;
03800   }
03801  
03802 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3729 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03729                                               {
03730   nodePmeMgr->sendPencilsHelper(iter);
03731 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe 
)

Definition at line 3587 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03587                                                                                                      {
03588 
03589   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03590 
03591 #if 0 && USE_PERSISTENT
03592     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03593 #endif
03594   int K1 = myGrid.K1;
03595   int K2 = myGrid.K2;
03596   int dim2 = myGrid.dim2;
03597   int dim3 = myGrid.dim3;
03598   int block1 = myGrid.block1;
03599   int block2 = myGrid.block2;
03600 
03601   // int savedMessages = 0;
03602   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03603 
03604   for (int ap=first; ap<=last; ++ap) {
03605     int ib = activePencils[ap].i;
03606     int jb = activePencils[ap].j;
03607     int ibegin = ib*block1;
03608     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03609     int jbegin = jb*block2;
03610     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03611     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03612 
03613     int fcount = 0;
03614     for ( int g=0; g<numGrids; ++g ) {
03615       char *f = f_arr + g*fsize;
03616 #ifdef NAMD_CUDA
03617      if ( offload ) {
03618       int errcount = 0;
03619       for ( int i=ibegin; i<iend; ++i ) {
03620        for ( int j=jbegin; j<jend; ++j ) {
03621         int k = i*dim2+j;
03622         f[k] = ffz_host[k];
03623         fcount += f[k];
03624         if ( ffz_host[k] & ~1 ) ++errcount;
03625        }
03626       }
03627       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03628      } else
03629 #endif
03630       for ( int i=ibegin; i<iend; ++i ) {
03631        for ( int j=jbegin; j<jend; ++j ) {
03632         fcount += f[i*dim2+j];
03633        }
03634       }
03635     }
03636 
03637 #ifdef NETWORK_PROGRESS
03638     CmiNetworkProgress();
03639 #endif
03640 
03641     if ( ! pencilActive[ib*yBlocks+jb] )
03642       NAMD_bug("PME activePencils list inconsistent");
03643 
03644     int zlistlen = 0;
03645     for ( int i=0; i<myGrid.K3; ++i ) {
03646       if ( fz_arr[i] ) ++zlistlen;
03647     }
03648 
03649     int hd = ( fcount? 1 : 0 );  // has data?
03650     // if ( ! hd ) ++savedMessages;
03651 
03652     
03653     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03654         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03655     msg->sourceNode = sourcepe;
03656     msg->hasData = hd;
03657     msg->lattice = lattice;
03658    if ( hd ) {
03659 #if 0
03660     msg->start = fstart;
03661     msg->len = flen;
03662 #else
03663     msg->start = -1;   // obsolete?
03664     msg->len = -1;   // obsolete?
03665 #endif
03666     msg->zlistlen = zlistlen;
03667     int *zlist = msg->zlist;
03668     zlistlen = 0;
03669     for ( int i=0; i<myGrid.K3; ++i ) {
03670       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03671     }
03672     char *fmsg = msg->fgrid;
03673     float *qmsg = msg->qgrid;
03674     for ( int g=0; g<numGrids; ++g ) {
03675       char *f = f_arr + g*fsize;
03676       float **q = q_arr + g*fsize;
03677       for ( int i=ibegin; i<iend; ++i ) {
03678        for ( int j=jbegin; j<jend; ++j ) {
03679         *(fmsg++) = f[i*dim2+j];
03680         if( f[i*dim2+j] ) {
03681           for (int h=0; h<myGrid.order-1; ++h) {
03682             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03683           }
03684           for ( int k=0; k<zlistlen; ++k ) {
03685             *(qmsg++) = q[i*dim2+j][zlist[k]];
03686           }
03687         }
03688        }
03689       }
03690     }
03691    }
03692 
03693     msg->sequence = compute_sequence;
03694     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03695     CmiEnableUrgentSend(1);
03696 #if USE_NODE_PAR_RECEIVE
03697     msg->destElem=CkArrayIndex3D(ib,jb,0);
03698     CProxy_PmePencilMap lzm = npMgr->zm;
03699     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03700     int destnode = CmiNodeOf(destproc);
03701     
03702 #if  0 
03703     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03704 #endif
03705     pmeNodeProxy[destnode].recvZGrid(msg);
03706 #if 0 
03707     CmiUsePersistentHandle(NULL, 0);
03708 #endif
03709 #else
03710 #if 0 
03711     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03712 #endif
03713     zPencil(ib,jb,0).recvGrid(msg);
03714 #if 0 
03715     CmiUsePersistentHandle(NULL, 0);
03716 #endif
03717 #endif
03718     CmiEnableUrgentSend(0);
03719   }
03720 
03721 
03722   // if ( savedMessages ) {
03723   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03724   // }
03725 
03726 }
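
sendPencilsPart() avoids shipping empty pencils: if no flag in the pencil's (x,y) block is set (fcount == 0), hasData is 0 and the message is allocated with zero-length payload arrays, carrying only the header. A rough sketch of that sizing decision, using plain vectors in place of the Charm++ varsize message (illustrative only):

#include <vector>
#include <cstdio>

// Stand-in for the hd*zlistlen / hd*flen / hd*fcount*zlistlen allocation:
// an empty pencil still sends a header, but no payload.
struct PencilMsg {
  bool hasData;
  std::vector<int>   zlist;   // hd * zlistlen entries
  std::vector<char>  fgrid;   // hd * flen entries
  std::vector<float> qgrid;   // hd * fcount * zlistlen entries
};

PencilMsg make_msg(int fcount, int flen, int zlistlen) {
  const int hd = (fcount ? 1 : 0);
  PencilMsg m;
  m.hasData = hd;
  m.zlist.resize(hd * zlistlen);
  m.fgrid.resize(hd * flen);
  m.qgrid.resize(hd * fcount * zlistlen);
  return m;
}

int main() {
  PencilMsg empty = make_msg(0, 16, 5);
  PencilMsg full  = make_msg(3, 16, 5);
  std::printf("empty payload: %zu floats, full payload: %zu floats\n",
              empty.qgrid.size(), full.qgrid.size());   // 0 and 15
  return 0;
}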

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1952 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01952                                   {
01953 
01954   untrans_count = numTransPes;
01955 
01956 #if     CMK_SMP && USE_CKLOOP
01957   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01958   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01959     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01960   } else
01961 #endif
01962   {
01963     sendTransSubset(0, numTransNodes-1);
01964   }
01965 
01966 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1937 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01937                                          {
01938   sendTransBarrier_received += 1;
01939   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01940   if ( sendTransBarrier_received < numGridPes ) return;
01941   sendTransBarrier_received = 0;
01942   for ( int i=0; i<numGridPes; ++i ) {
01943     pmeProxyDir[gridPeMap[i]].sendTrans();
01944   }
01945 }
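
sendTransBarrier() is a count-up barrier on PE 0: every grid PE reports in, and only when the last report arrives does PE 0 reset the counter and tell all grid PEs to proceed with sendTrans(). The control flow, reduced to a single-threaded sketch:

#include <cstdio>

// Count-up barrier: release the group only after all participants arrive.
struct TransBarrier {
  int received = 0;
  int expected;
  explicit TransBarrier(int n) : expected(n) {}
  // Returns true when the caller is the last arrival and may release the group.
  bool arrive() {
    if (++received < expected) return false;
    received = 0;              // reset for the next timestep
    return true;
  }
};

int main() {
  TransBarrier barrier(4);     // numGridPes participants
  for (int pe = 0; pe < 4; ++pe)
    if (barrier.arrive())
      std::printf("last arrival: broadcast sendTrans()\n");
  return 0;
}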

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1968 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01968                                                        {
01969   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01970 
01971   // send data for transpose
01972   int zdim = myGrid.dim3;
01973   int nx = localInfo[myGridPe].nx;
01974   int x_start = localInfo[myGridPe].x_start;
01975   int slicelen = myGrid.K2 * zdim;
01976 
01977   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01978 
01979 #if CMK_BLUEGENEL
01980   CmiNetworkProgressAfter (0);
01981 #endif
01982 
01983   for (int j=first; j<=last; j++) {
01984     int node = transNodeOrder[j];  // different order on each node
01985     int pe = transNodeInfo[node].pe_start;
01986     int npe = transNodeInfo[node].npe;
01987     int totlen = 0;
01988     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01989       LocalPmeInfo &li = localInfo[pe];
01990       int cpylen = li.ny_after_transpose * zdim;
01991       totlen += cpylen;
01992     }
01993     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01994                                 PRIORITY_SIZE) PmeTransMsg;
01995     newmsg->sourceNode = myGridPe;
01996     newmsg->lattice = lattice;
01997     newmsg->x_start = x_start;
01998     newmsg->nx = nx;
01999     for ( int g=0; g<numGrids; ++g ) {
02000       float *qmsg = newmsg->qgrid + nx * totlen * g;
02001       pe = transNodeInfo[node].pe_start;
02002       for (int i=0; i<npe; ++i, ++pe) {
02003         LocalPmeInfo &li = localInfo[pe];
02004         int cpylen = li.ny_after_transpose * zdim;
02005         if ( node == myTransNode ) {
02006           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
02007           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
02008         }
02009         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
02010         for ( int x = 0; x < nx; ++x ) {
02011           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02012           q += slicelen;
02013           qmsg += cpylen;
02014         }
02015       }
02016     }
02017     newmsg->sequence = grid_sequence;
02018     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02019     if ( node == myTransNode ) newmsg->nx = 0;
02020     if ( npe > 1 ) {
02021       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02022       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02023     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02024   }
02025 }

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2383 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02383                                    {
02384 
02385 #if     CMK_SMP && USE_CKLOOP
02386   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02387   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02388     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02389   } else
02390 #endif
02391   {
02392     sendUngridSubset(0, numSources-1);
02393   }
02394 
02395   grid_count = numSources;
02396   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02397 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2399 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02399                                                         {
02400 
02401 #ifdef NAMD_CUDA
02402   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02403 #else
02404   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02405 #endif
02406 
02407   for ( int j=first; j<=last; ++j ) {
02408     // int msglen = qgrid_len;
02409     PmeGridMsg *newmsg = gridmsg_reuse[j];
02410     int pe = newmsg->sourceNode;
02411     int zdim = myGrid.dim3;
02412     int flen = newmsg->len;
02413     int fstart = newmsg->start;
02414     int zlistlen = newmsg->zlistlen;
02415     int *zlist = newmsg->zlist;
02416     float *qmsg = newmsg->qgrid;
02417     for ( int g=0; g<numGrids; ++g ) {
02418       char *f = newmsg->fgrid + fgrid_len * g;
02419       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02420       for ( int i=0; i<flen; ++i ) {
02421         if ( f[i] ) {
02422           for ( int k=0; k<zlistlen; ++k ) {
02423             *(qmsg++) = q[zlist[k]];
02424           }
02425         }
02426         q += zdim;
02427       }
02428     }
02429     newmsg->sourceNode = myGridPe;
02430 
02431     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02432     CmiEnableUrgentSend(1);
02433 #ifdef NAMD_CUDA
02434     if ( offload ) {
02435       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02436     } else
02437 #endif
02438     pmeProxyDir[pe].recvUngrid(newmsg);
02439     CmiEnableUrgentSend(0);
02440   }
02441 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2196 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02196                                     {
02197 
02198   trans_count = numGridPes;
02199 
02200   { // send energy and virial
02201     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02202     for ( int g=0; g<numGrids; ++g ) {
02203       newmsg->evir[g] = recip_evir2[g];
02204     }
02205     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02206     CmiEnableUrgentSend(1);
02207     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02208     CmiEnableUrgentSend(0);
02209   }
02210 
02211 #if     CMK_SMP && USE_CKLOOP
02212   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02213   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02214     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02215   } else
02216 #endif
02217   {
02218     sendUntransSubset(0, numGridNodes-1);
02219   }
02220 
02221 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2223 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02223                                                          {
02224 
02225   int zdim = myGrid.dim3;
02226   int y_start = localInfo[myTransPe].y_start_after_transpose;
02227   int ny = localInfo[myTransPe].ny_after_transpose;
02228   int slicelen = myGrid.K2 * zdim;
02229 
02230   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02231 
02232 #if CMK_BLUEGENEL
02233   CmiNetworkProgressAfter (0);
02234 #endif
02235 
02236   // send data for reverse transpose
02237   for (int j=first; j<=last; j++) {
02238     int node = gridNodeOrder[j];  // different order on each node
02239     int pe = gridNodeInfo[node].pe_start;
02240     int npe = gridNodeInfo[node].npe;
02241     int totlen = 0;
02242     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02243       LocalPmeInfo &li = localInfo[pe];
02244       int cpylen = li.nx * zdim;
02245       totlen += cpylen;
02246     }
02247     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02248     newmsg->sourceNode = myTransPe;
02249     newmsg->y_start = y_start;
02250     newmsg->ny = ny;
02251     for ( int g=0; g<numGrids; ++g ) {
02252       float *qmsg = newmsg->qgrid + ny * totlen * g;
02253       pe = gridNodeInfo[node].pe_start;
02254       for (int i=0; i<npe; ++i, ++pe) {
02255         LocalPmeInfo &li = localInfo[pe];
02256         if ( node == myGridNode ) {
02257           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02258           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02259           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02260           int cpylen = ny * zdim;
02261           for ( int x = 0; x < li.nx; ++x ) {
02262             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02263             q += cpylen;
02264             qmsg += slicelen;
02265           }
02266         } else {
02267           CmiMemcpy((void*)qmsg,
02268                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02269                 li.nx*ny*zdim*sizeof(float));
02270           qmsg += li.nx*ny*zdim;
02271         }
02272       }
02273     }
02274     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02275     if ( node == myGridNode ) newmsg->ny = 0;
02276     if ( npe > 1 ) {
02277       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02278       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02279     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02280   }
02281 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4229 of file ComputePme.C.

References NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04229                                      {
04230 
04231     SimParameters *simParams = Node::Object()->simParameters;
04232 
04233     for ( int g=0; g<numGrids; ++g ) {
04234       float scale = 1.;
04235       if (alchOn) {
04236         BigReal elecLambdaUp, elecLambdaDown;
04237         if( simParams->alchFepWhamOn ) {
04238           if( simParams->alchFepElecOn ) {
04239             elecLambdaUp = simParams->alchElecLambda;
04240             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04241           }
04242           else {
04243             elecLambdaUp = 0.0;
04244             elecLambdaDown = 1.0;
04245           }
04246         }
04247         else {
04248           // alchLambda set on each step in ComputePme::ungridForces()
04249           if ( alchLambda < 0 || alchLambda > 1 ) {
04250             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04251           }
04252           elecLambdaUp = simParams->getElecLambda(alchLambda);
04253           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04254         }
04255         if ( g == 0 ) scale = elecLambdaUp;
04256         else if ( g == 1 ) scale = elecLambdaDown;
04257         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04258         if (alchDecouple) {
04259           if ( g == 2 ) scale = 1-elecLambdaUp;
04260           else if ( g == 3 ) scale = 1-elecLambdaDown;
04261           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04262         }
04263       } else if ( lesOn ) {
04264         scale = 1.0 / lesFactor;
04265       } else if ( pairOn ) {
04266         scale = ( g == 0 ? 1. : -1. );
04267       }
04268       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04269       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04270       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04271       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04272       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04273       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04274       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04275       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04276       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04277       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04278 
04279       float scale2 = 0.;
04280 
04281       // why is this declared/defined again here?
04282       SimParameters *simParams = Node::Object()->simParameters;
04283 
04284       if (alchFepOn) {
04285         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04286         if(simParams->alchFepWhamOn) {
04287           if(simParams->alchFepElecOn) {
04288             elecLambda2Up = simParams->alchElecLambda;
04289             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04290           }
04291           else {
04292             elecLambda2Up = 0.0;
04293             elecLambda2Down =  1.0;
04294           }
04295         }
04296         else {
04297           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04298           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04299         }
04300         
04301         if ( g == 0 ) scale2 = elecLambda2Up;
04302         else if ( g == 1 ) scale2 = elecLambda2Down;
04303         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04304         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04305         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04306         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04307       }
04308       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04309         if( g==0 )      scale2 = scale + 1.0;
04310         else if( g==1 ) scale2 = scale - 1.0;
04311         else if( g==2 ) scale2 = scale - 1.0;
04312         else if( g==3 ) scale2 = scale + 1.0;
04313       }
04314       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04315       
04316       if (alchThermIntOn) {
04317         
04318         // no decoupling:
04319         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04320         // (interactions between all atoms [partition 0 OR partition 1], 
04321         // minus all [within partition 0])
04322         // U = elecLambdaUp * (U[0] - U[2])
04323         // dU/dl = U[0] - U[2];
04324         
04325         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04326         // (interactions between all atoms [partition 0 OR partition 2], 
04327         // minus all [within partition 0])
04328         // U = elecLambdaDown * (U[1] - U[2])
04329         // dU/dl = U[1] - U[2];
04330 
04331         // alchDecouple:
04332         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04333         // (interactions between all atoms [partition 0 OR partition 1]
04334         // minus all [within partition 1] minus all [within partition 0]
04335         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04336         // dU/dl = U[0] - U[2] - U[4];
04337 
04338         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04339         // (interactions between all atoms [partition 0 OR partition 2]
04340         // minus all [within partition 2] minus all [within partition 0]
04341         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04342         // dU/dl = U[1] - U[3] - U[4];
04343         
04344         
04345         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04346         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04347         if (!alchDecouple) {
04348           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04349           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04350         }
04351         else {  // alchDecouple
04352           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04353           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04354           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04355           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04356         }
04357       }
04358     }
04359 
04360     alchLambda = -1.;  // illegal value to catch if not updated
04361 
04362     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04363     reduction->submit();
04364 
04365   for ( int i=0; i<heldComputes.size(); ++i ) {
04366     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04367   }
04368   heldComputes.resize(0);
04369 }
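
The nine REDUCTION_VIRIAL_SLOW_* accumulations at the top of the listing expand the six unique components stored in evir[g][1..6] into a symmetric 3x3 tensor. Restated as an equation (a summary of the code above, not additional code from ComputePme.C), with e_k = evir[g][k] and s = scale:

    V_{\text{slow}} \mathrel{+}= s \begin{pmatrix} e_1 & e_2 & e_3 \\ e_2 & e_4 & e_5 \\ e_3 & e_5 & e_6 \end{pmatrix}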
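
The thermodynamic-integration comments in the listing can be condensed into formulas. Writing U_g for the energy accumulated on grid g (evir[g][0]), and lambda_up, lambda_down for elecLambdaUp and elecLambdaDown, the two alchemical contributions are (a restatement of the comments, not code from ComputePme.C):

without alchDecouple:

    U_{\text{up}}   = \lambda_{\text{up}}\,(U_0 - U_2),   \qquad \partial U_{\text{up}}/\partial\lambda_{\text{up}}     = U_0 - U_2
    U_{\text{down}} = \lambda_{\text{down}}\,(U_1 - U_2), \qquad \partial U_{\text{down}}/\partial\lambda_{\text{down}} = U_1 - U_2

with alchDecouple:

    U_{\text{up}}   = \lambda_{\text{up}}\,(U_0 - U_4) + (1-\lambda_{\text{up}})\,U_2,   \qquad \partial U_{\text{up}}/\partial\lambda_{\text{up}}     = U_0 - U_2 - U_4
    U_{\text{down}} = \lambda_{\text{down}}\,(U_1 - U_4) + (1-\lambda_{\text{down}})\,U_3, \qquad \partial U_{\text{down}}/\partial\lambda_{\text{down}} = U_1 - U_3 - U_4

These derivatives are exactly what the alchThermIntOn branch accumulates into REDUCTION_ELECT_ENERGY_PME_TI_1 and REDUCTION_ELECT_ENERGY_PME_TI_2.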

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2527 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02527                                    {
02528   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02529 
02530   ungridForcesCount = pmeComputes.size();
02531 
02532 #ifdef NAMD_CUDA
02533  if ( offload ) {
02534   //CmiLock(cuda_lock);
02535   cudaSetDevice(deviceCUDA->getDeviceID());
02536 
02537   if ( this == masterPmeMgr ) {
02538     double before = CmiWallTimer();
02539     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02540     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02541     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02542 
02543     const int myrank = CkMyRank();
02544     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02545       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02546         nodePmeMgr->mgrObjects[i]->ungridCalc();
02547       }
02548     }
02549     if ( ! pmeComputes.size() ) return;
02550   }
02551 
02552   if ( ! end_forces ) {
02553     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02554     end_forces = new cudaEvent_t[n];
02555     for ( int i=0; i<n; ++i ) {
02556       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02557     }
02558   }
02559 
02560   const int pcsz = pmeComputes.size();
02561   if ( ! afn_host ) {
02562     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02563     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02564     cuda_errcheck("malloc params for pme");
02565   }
02566   int totn = 0;
02567   for ( int i=0; i<pcsz; ++i ) {
02568     int n = pmeComputes[i]->numGridAtoms[0];
02569     totn += n;
02570   }
02571   if ( totn > f_data_mgr_alloc ) {
02572     if ( f_data_mgr_alloc ) {
02573       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02574       cudaFree(f_data_mgr_dev);
02575       cudaFreeHost(f_data_mgr_host);
02576     }
02577     f_data_mgr_alloc = 1.2 * (totn + 100);
02578     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02579     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02580     cuda_errcheck("malloc forces for pme");
02581   }
02582   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02583   float *f_dev = f_data_mgr_dev;
02584   float *f_host = f_data_mgr_host;
02585   for ( int i=0; i<pcsz; ++i ) {
02586     int n = pmeComputes[i]->numGridAtoms[0];
02587     pmeComputes[i]->f_data_dev = f_dev;
02588     pmeComputes[i]->f_data_host = f_host;
02589     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02590     afn_host[3*i+1] = f_dev;
02591     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02592     f_dev += 3*n;
02593     f_host += 3*n;
02594   }
02595   //CmiLock(cuda_lock);
02596   double before = CmiWallTimer();
02597   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02598   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02599   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02600   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02601 
02602   for ( int i=0; i<pcsz; ++i ) {
02603     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02604     if ( i%EVENT_STRIDE == 0 ) {
02605       int dimy = pcsz - i;
02606       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02607       int maxn = 0;
02608       int subtotn = 0;
02609       for ( int j=0; j<dimy; ++j ) {
02610         int n = pmeComputes[i+j]->numGridAtoms[0];
02611         subtotn += n;
02612         if ( n > maxn ) maxn = n;
02613       }
02614       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02615       before = CmiWallTimer();
02616       cuda_pme_forces(
02617         bspline_coeffs_dev,
02618         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02619         pmeComputes[i]->a_data_dev,
02620         pmeComputes[i]->f_data_dev,
02621         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02622         streams[stream]);
02623       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02624       before = CmiWallTimer();
02625       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02626         cudaMemcpyDeviceToHost, streams[stream]);
02627       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02628       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02629       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02630     }
02631     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02632   }
02633   //CmiUnlock(cuda_lock);
02634  } else
02635 #endif // NAMD_CUDA
02636  {
02637   for ( int i=0; i<pmeComputes.size(); ++i ) {
02638     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02639     // pmeComputes[i]->ungridForces();
02640   }
02641  }
02642   // submitReductions();  // must follow all ungridForces()
02643 
02644 #ifdef NAMD_CUDA
02645  if ( offload ) {
02646   forces_time = CmiWallTimer();
02647   forces_count = ungridForcesCount;
02648   forces_done_count = 0;
02649   pmeProxy[this_pe].pollForcesReady();
02650  }
02651 #endif
02652 
02653   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02654 }
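
In the offload branch above, force kernels are issued in groups of EVENT_STRIDE computes, one cudaEvent_t from end_forces is recorded per group, and completion is later polled (forces_count and forces_done_count, driven by pollForcesReady() and cuda_check_pme_forces()). The following is a minimal, self-contained sketch of that batching-plus-event-polling pattern; the names dummy_force_kernel, NUM_ITEMS, and BATCH are placeholders and are not part of ComputePme.C:

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    // Placeholder kernel standing in for cuda_pme_forces(); not from ComputePme.C.
    __global__ void dummy_force_kernel(float *buf, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) buf[i] += 1.0f;
    }

    int main() {
      const int NUM_ITEMS = 37;   // stand-in for pmeComputes.size()
      const int BATCH     = 4;    // stand-in for EVENT_STRIDE
      const int nBatches  = (NUM_ITEMS - 1) / BATCH + 1;

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      // One event per batch, created without timing, as in ungridCalc().
      std::vector<cudaEvent_t> end_forces(nBatches);
      for (int b = 0; b < nBatches; ++b)
        cudaEventCreateWithFlags(&end_forces[b], cudaEventDisableTiming);

      float *buf;
      cudaMalloc(&buf, NUM_ITEMS * sizeof(float));

      // Launch work in batches and record one event after each batch.
      for (int i = 0; i < NUM_ITEMS; i += BATCH) {
        int dimy = NUM_ITEMS - i;
        if (dimy > BATCH) dimy = BATCH;
        dummy_force_kernel<<<1, 128, 0, stream>>>(buf + i, dimy);
        cudaEventRecord(end_forces[i / BATCH], stream);
      }

      // Later, completion is polled batch by batch (cf. cuda_check_pme_forces()).
      int done = 0;
      while (done < nBatches) {
        if (cudaEventQuery(end_forces[done]) == cudaSuccess) ++done;
        // A real poller would reschedule itself instead of spinning.
      }

      printf("all %d batches finished\n", nBatches);
      cudaFree(buf);
      for (int b = 0; b < nBatches; ++b) cudaEventDestroy(end_forces[b]);
      cudaStreamDestroy(stream);
      return 0;
    }

Recording one event per batch rather than per compute keeps the number of event queries proportional to pmeComputes.size()/EVENT_STRIDE, which appears to be the motivation for the stride in the real code.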


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 346 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 347 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 408 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 407 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 433 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 419 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 421 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 412 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 411 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 431 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 413 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 430 of file ComputePme.C.

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 415 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 416 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 410 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 409 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 403 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().

int ComputePmeMgr::forces_count

Definition at line 417 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 418 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 420 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe

Definition at line 423 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 443 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 404 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 436 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 437 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 362 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 359 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 360 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 361 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file:
ComputePme.C
Generated on Thu Nov 23 01:17:17 2017 for NAMD by  doxygen 1.4.7