ComputePmeMgr Class Reference

Inheritance diagram for ComputePmeMgr:

ComputePmeUtil

Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 355 of file ComputePme.C.


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 709 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00709                              : pmeProxy(thisgroup), 
00710                                  pmeProxyDir(thisgroup) {
00711 
00712   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00713   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00714   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00715 
00716   pmeNodeProxy.ckLocalBranch()->initialize();
00717 
00718   if ( CmiMyRank() == 0 ) {
00719     fftw_plan_lock = CmiCreateLock();
00720   }
00721   pmemgr_lock = CmiCreateLock();
00722 
00723   myKSpace = 0;
00724   kgrid = 0;
00725   work = 0;
00726   grid_count = 0;
00727   trans_count = 0;
00728   untrans_count = 0;
00729   ungrid_count = 0;
00730   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00731   useBarrier = 0;
00732   sendTransBarrier_received = 0;
00733   usePencils = 0;
00734 
00735 #ifdef NAMD_CUDA
00736  // offload has not been set so this happens on every run
00737   if ( CmiMyRank() == 0 ) {
00738     cuda_lock = CmiCreateLock();
00739   }
00740 
00741 #if CUDA_VERSION >= 5050
00742   int leastPriority, greatestPriority;
00743   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00744   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00745   //if ( CkMyNode() == 0 ) {
00746   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00747   //}
00748 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00749 #else
00750 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00751 #endif
00752 
00753   stream = 0;
00754   for ( int i=0; i<NUM_STREAMS; ++i ) {
00755 #if 1
00756     CUDA_STREAM_CREATE(&streams[i]);
00757     cuda_errcheck("cudaStreamCreate");
00758 #else
00759   streams[i] = 0;  // XXXX Testing!!!
00760 #endif
00761   }
00762 
00763   this_pe = CkMyPe();
00764  
00765   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00766   end_forces = 0;
00767   check_charges_count = 0;
00768   check_forces_count = 0;
00769   chargeGridSubmittedCount = 0;
00770 
00771   cuda_atoms_count = 0;
00772   cuda_atoms_alloc = 0;
00773 
00774   f_data_mgr_alloc = 0;
00775   f_data_mgr_host = 0;
00776   f_data_mgr_dev = 0;
00777   afn_host = 0;
00778   afn_dev = 0;
00779 
00780 #define CUDA_EVENT_ID_PME_CHARGES 80
00781 #define CUDA_EVENT_ID_PME_FORCES 81
00782 #define CUDA_EVENT_ID_PME_TICK 82
00783 #define CUDA_EVENT_ID_PME_COPY 83
00784 #define CUDA_EVENT_ID_PME_KERNEL 84
00785   if ( 0 == CkMyPe() ) {
00786     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00787     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00788     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00789     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00790     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00791   }
00792 #endif
00793   recipEvirCount = 0;
00794   recipEvirClients = 0;
00795   recipEvirPe = -999;
00796 }
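
The stream setup above chooses cudaStreamCreateWithPriority when the runtime supports stream priorities and falls back to plain cudaStreamCreate otherwise. A minimal self-contained sketch of that idiom follows; the stream count is illustrative and not NAMD's NUM_STREAMS, and CUDART_VERSION is used in place of the CUDA_VERSION check in the listing.

    #include <cuda_runtime.h>

    int main() {
      const int kStreams = 2;                      // illustrative count, not NAMD's NUM_STREAMS
      cudaStream_t streams[kStreams];
      int leastPriority = 0, greatestPriority = 0;
      // Query the legal priority range; "greatest" is the numerically lowest value.
      cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
      for (int i = 0; i < kStreams; ++i) {
    #if CUDART_VERSION >= 5050
        cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
    #else
        cudaStreamCreate(&streams[i]);
    #endif
      }
      for (int i = 0; i < kStreams; ++i) cudaStreamDestroy(streams[i]);
      return 0;
    }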

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1792 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01792                               {
01793 
01794   if ( CmiMyRank() == 0 ) {
01795     CmiDestroyLock(fftw_plan_lock);
01796   }
01797   CmiDestroyLock(pmemgr_lock);
01798 
01799   delete myKSpace;
01800   delete [] localInfo;
01801   delete [] gridNodeInfo;
01802   delete [] transNodeInfo;
01803   delete [] gridPeMap;
01804   delete [] transPeMap;
01805   delete [] recipPeDest;
01806   delete [] gridPeOrder;
01807   delete [] gridNodeOrder;
01808   delete [] transNodeOrder;
01809   delete [] qgrid;
01810   if ( kgrid != qgrid ) delete [] kgrid;
01811   delete [] work;
01812   delete [] gridmsg_reuse;
01813 
01814  if ( ! offload ) {
01815   for (int i=0; i<q_count; ++i) {
01816     delete [] q_list[i];
01817   }
01818   delete [] q_list;
01819   delete [] fz_arr;
01820  }
01821   delete [] f_arr;
01822   delete [] q_arr;
01823 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1786 of file ComputePme.C.

01786                                                  {
01787   if ( ! usePencils ) return;
01788   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01789 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3003 of file ComputePme.C.

03003                                        {
03004   ++recipEvirClients;
03005 }

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice, int  sequence )

Definition at line 3520 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03520                                                                   {
03521 
03522 #ifdef NAMD_CUDA
03523  if ( offload ) {
03524   int errcount = 0;
03525   int q_stride = myGrid.K3+myGrid.order-1;
03526   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03527     f_arr[j] = ffz_host[j];
03528     if ( ffz_host[j] & ~1 ) ++errcount;
03529   }
03530   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03531  }
03532 #endif
03533   recipEvirCount = recipEvirClients;
03534   ungridForcesCount = pmeComputes.size();
03535 
03536   for (int j=0; j<myGrid.order-1; ++j) {
03537     fz_arr[j] |= fz_arr[myGrid.K3+j];
03538   }
03539 
03540   if ( usePencils ) {
03541     sendPencils(lattice,sequence);
03542   } else {
03543     sendData(lattice,sequence);
03544   }
03545 }
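
The loop over j above folds the order-1 wrap-around planes of the flag array back onto its first planes before the grid is shipped off; charge that is spread past the periodic edge lands in that padding. Below is a standalone illustration of the same fold on one padded row, using addition for charge values (for the flag array a bitwise OR is enough). All sizes are made up.

    #include <cstdio>
    #include <vector>

    int main() {
      const int K3 = 8, order = 4;                  // assumed grid length and interpolation order
      std::vector<float> row(K3 + order - 1, 0.f);  // K3 real points plus order-1 wrap padding
      row[K3] = 1.f; row[K3 + 1] = 2.f;             // charge that spilled past the periodic edge
      for (int j = 0; j < order - 1; ++j) row[j] += row[K3 + j];  // fold padding back to the start
      std::printf("row[0]=%g row[1]=%g\n", row[0], row[1]);       // prints 1 and 2
      return 0;
    }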

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice, int  sequence )

Definition at line 3456 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03456                                                                       {
03457   saved_lattice = &lattice;
03458   saved_sequence = sequence;
03459 
03460   // cudaDeviceSynchronize();  //  XXXX TESTING
03461   //int q_stride = myGrid.K3+myGrid.order-1;
03462   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03463   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03464   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03465   //  }
03466   //}
03467   //CmiLock(cuda_lock);
03468 
03469  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03470   double before = CmiWallTimer();
03471   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03472   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03473   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03474                         cudaMemcpyDeviceToHost, streams[stream]);
03475   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03476   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03477   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03478   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03479   //CmiUnlock(cuda_lock);
03480   // cudaDeviceSynchronize();  //  XXXX TESTING
03481   // cuda_errcheck("after memcpy grid to host");
03482 
03483   SimParameters *simParams = Node::Object()->simParameters;
03484   if ( ! simParams->useCUDA2 ) {
03485     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03486     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03487   }
03488 
03489   pmeProxy[master_pe].pollChargeGridReady();
03490  }
03491 }
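
The master branch above orders the device-to-host drain of the charge grid behind all outstanding kernels with events rather than host synchronization (the event is recorded on the default stream so it follows work in every stream). A minimal sketch of that ordering idiom, with illustrative names and a single named kernel stream instead of NAMD's:

    #include <cuda_runtime.h>

    void drain_grid(float *d_grid, float *h_grid, size_t bytes,
                    cudaStream_t kernel_stream, cudaStream_t copy_stream,
                    cudaEvent_t kernels_done, cudaEvent_t copy_done) {
      cudaEventRecord(kernels_done, kernel_stream);         // mark the end of the charge kernels
      cudaStreamWaitEvent(copy_stream, kernels_done, 0);    // copy stream waits; the host does not block
      cudaMemcpyAsync(h_grid, d_grid, bytes,
                      cudaMemcpyDeviceToHost, copy_stream); // drain the grid to (pinned) host memory
      cudaEventRecord(copy_done, copy_stream);              // later polled, e.g. with cudaEventQuery()
      cudaMemsetAsync(d_grid, 0, bytes, copy_stream);       // clear the device grid for the next step
    }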

void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg )

Definition at line 3766 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03766                                                {
03767 
03768   int K1 = myGrid.K1;
03769   int K2 = myGrid.K2;
03770   int dim2 = myGrid.dim2;
03771   int dim3 = myGrid.dim3;
03772   int block1 = myGrid.block1;
03773   int block2 = myGrid.block2;
03774 
03775   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03776   int ib = msg->sourceNode / yBlocks;
03777   int jb = msg->sourceNode % yBlocks;
03778 
03779   int ibegin = ib*block1;
03780   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03781   int jbegin = jb*block2;
03782   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03783 
03784   int zlistlen = msg->zlistlen;
03785   int *zlist = msg->zlist;
03786   float *qmsg = msg->qgrid;
03787   int g;
03788   for ( g=0; g<numGrids; ++g ) {
03789     char *f = f_arr + g*fsize;
03790     float **q = q_arr + g*fsize;
03791     for ( int i=ibegin; i<iend; ++i ) {
03792      for ( int j=jbegin; j<jend; ++j ) {
03793       if( f[i*dim2+j] ) {
03794         f[i*dim2+j] = 0;
03795         for ( int k=0; k<zlistlen; ++k ) {
03796           q[i*dim2+j][zlist[k]] = *(qmsg++);
03797         }
03798         for (int h=0; h<myGrid.order-1; ++h) {
03799           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03800         }
03801       }
03802      }
03803     }
03804   }
03805 }
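
Each flagged (i,j) column above is unpacked from the packed message stream: zlistlen values are scattered to the z indices in zlist, and copies of the first order-1 planes are then placed in the wrap-around pad so that downstream interpolation can read past K3 without explicit wrapping. A minimal sketch of one such column (copyResults() below does the same thing per slab):

    // Scatter one packed column and replicate its wrap-around pad; qmsg advances
    // through the packed message exactly as in the loop above.
    void unpack_column(float *col, int K3, int order,
                       const int *zlist, int zlistlen, const float *&qmsg) {
      for (int k = 0; k < zlistlen; ++k) col[zlist[k]] = *(qmsg++);
      for (int h = 0; h < order - 1; ++h) col[K3 + h] = col[h];
    }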

void ComputePmeMgr::copyResults ( PmeGridMsg *  msg )

Definition at line 3958 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03958                                                {
03959 
03960   int zdim = myGrid.dim3;
03961   int flen = msg->len;
03962   int fstart = msg->start;
03963   int zlistlen = msg->zlistlen;
03964   int *zlist = msg->zlist;
03965   float *qmsg = msg->qgrid;
03966   int g;
03967   for ( g=0; g<numGrids; ++g ) {
03968     char *f = msg->fgrid + g*flen;
03969     float **q = q_arr + fstart + g*fsize;
03970     for ( int i=0; i<flen; ++i ) {
03971       if ( f[i] ) {
03972         f[i] = 0;
03973         for ( int k=0; k<zlistlen; ++k ) {
03974           q[i][zlist[k]] = *(qmsg++);
03975         }
03976         for (int h=0; h<myGrid.order-1; ++h) {
03977           q[i][myGrid.K3+h] = q[i][h];
03978         }
03979       }
03980     }
03981   }
03982 }

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice, int  sequence )

Definition at line 3403 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03403                                                                       {
03404 
03405     int n = cuda_atoms_count;
03406     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03407     cuda_atoms_count = 0;
03408 
03409     const double before = CmiWallTimer();
03410     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03411                           cudaMemcpyHostToDevice, streams[stream]);
03412     const double after = CmiWallTimer();
03413 
03414     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03415 
03416     cuda_pme_charges(
03417       bspline_coeffs_dev,
03418       q_arr_dev, ffz_dev, ffz_dev + fsize,
03419       a_data_dev, n,
03420       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03421       streams[stream]);
03422     const double after2 = CmiWallTimer();
03423 
03424     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03425 
03426     masterPmeMgr->charges_time = before;
03427     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03428     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03429 }
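
The host-to-device copy of 7 floats per atom and the charge-spreading kernel are issued to the same stream, so the kernel is ordered after the copy with no host-side wait. A stripped-down sketch of just that ordering, with a placeholder kernel standing in for cuda_pme_charges():

    #include <cuda_runtime.h>

    __global__ void spread_charges(const float *atom_data, int n) { /* placeholder body */ }

    void submit_charges(const float *a_host, float *a_dev, int n, cudaStream_t s) {
      cudaMemcpyAsync(a_dev, a_host, 7 * n * sizeof(float),
                      cudaMemcpyHostToDevice, s);               // 7 floats per atom, as above
      spread_charges<<<(n + 127) / 128, 128, 0, s>>>(a_dev, n); // same stream, so it runs after the copy
    }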

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg )

Definition at line 2012 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02012                                                    {
02013   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02014   int pe = transNodeInfo[myTransNode].pe_start;
02015   int npe = transNodeInfo[myTransNode].npe;
02016   CmiNodeLock lock = CmiCreateLock();
02017   int *count = new int; *count = npe;
02018   for (int i=0; i<npe; ++i, ++pe) {
02019     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02020     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02021     shmsg->msg = msg;
02022     shmsg->count = count;
02023     shmsg->lock = lock;
02024     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02025   }
02026 }
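
Every PE on the trans node receives the same underlying PmeTransMsg plus one heap-allocated counter and one freshly created lock; the shared count lets the last receiver free the shared message. A sketch of the receiver-side decrement-under-lock half of that pattern, using std::mutex in place of CmiNodeLock to keep it self-contained:

    #include <mutex>

    struct SharedWrapper {
      float *payload;     // shared, read-only for receivers
      int *count;         // receivers still outstanding
      std::mutex *lock;
    };

    void receiver_done(SharedWrapper *sh) {
      sh->lock->lock();
      int remaining = --(*sh->count);
      sh->lock->unlock();
      if (remaining == 0) {       // last receiver tears down the shared state
        delete [] sh->payload;
        delete sh->count;
        delete sh->lock;
      }
      delete sh;                  // each receiver owns only its wrapper
    }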

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg )

Definition at line 2268 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02268                                                        {
02269   int pe = gridNodeInfo[myGridNode].pe_start;
02270   int npe = gridNodeInfo[myGridNode].npe;
02271   CmiNodeLock lock = CmiCreateLock();
02272   int *count = new int; *count = npe;
02273   for (int i=0; i<npe; ++i, ++pe) {
02274     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02275     shmsg->msg = msg;
02276     shmsg->count = count;
02277     shmsg->lock = lock;
02278     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02279   }
02280 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1904 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

Referenced by recvGrid().

01904                                   {
01905   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01906 
01907 #ifdef NAMD_FFTW
01908   for ( int g=0; g<numGrids; ++g ) {
01909 #ifdef NAMD_FFTW_3
01910     fftwf_execute(forward_plan_yz[g]);
01911 #else
01912     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01913         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01914 #endif
01915 
01916   }
01917 #endif
01918 
01919   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01920 }
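
gridCalc1() runs the y-z forward FFTs planned in initialize(); the real grid's last dimension is padded to dim3 = 2*(K3/2 + 1) floats so the complex result fits in place. A minimal single-slab FFTW3 sketch of that layout (grid sizes are made up; link with -lfftw3f):

    #include <fftw3.h>

    int main() {
      const int K2 = 16, K3 = 16;                  // assumed slab dimensions
      const int dim3 = 2 * (K3/2 + 1);             // padded real row length
      float *q = fftwf_alloc_real((size_t)K2 * dim3);
      for (int i = 0; i < K2 * dim3; ++i) q[i] = 0.f;
      int n[2] = { K2, K3 };
      fftwf_plan plan = fftwf_plan_many_dft_r2c(2, n, 1,
          q, NULL, 1, K2 * dim3,                   // real input, transformed in place
          (fftwf_complex *)q, NULL, 1, K2 * (dim3/2),
          FFTW_ESTIMATE);
      fftwf_execute(plan);                         // y-z transform of one slab
      fftwf_destroy_plan(plan);
      fftwf_free(q);
      return 0;
    }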

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2080 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02080                                   {
02081   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02082 
02083 #if CMK_BLUEGENEL
02084   CmiNetworkProgressAfter (0);
02085 #endif
02086 
02087   int zdim = myGrid.dim3;
02088   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02089   int ny = localInfo[myTransPe].ny_after_transpose;
02090 
02091   for ( int g=0; g<numGrids; ++g ) {
02092     // finish forward FFT (x dimension)
02093 #ifdef NAMD_FFTW
02094 #ifdef NAMD_FFTW_3
02095     fftwf_execute(forward_plan_x[g]);
02096 #else
02097     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02098         ny * zdim / 2, 1, work, 1, 0);
02099 #endif
02100 #endif
02101   }
02102 
02103 #ifdef OPENATOM_VERSION
02104     if ( ! simParams -> openatomOn ) { 
02105 #endif // OPENATOM_VERSION
02106       gridCalc2R();
02107 #ifdef OPENATOM_VERSION
02108     } else {
02109       gridCalc2Moa();
02110     }
02111 #endif // OPENATOM_VERSION
02112 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2140 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02140                                    {
02141 
02142   int useCkLoop = 0;
02143 #if CMK_SMP && USE_CKLOOP
02144   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02145        && CkNumPes() >= 2 * numTransPes ) {
02146     useCkLoop = 1;
02147   }
02148 #endif
02149 
02150   int zdim = myGrid.dim3;
02151   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02152   int ny = localInfo[myTransPe].ny_after_transpose;
02153 
02154   for ( int g=0; g<numGrids; ++g ) {
02155     // reciprocal space portion of PME
02156     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02157     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02158                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02159     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02160 
02161     // start backward FFT (x dimension)
02162 
02163 #ifdef NAMD_FFTW
02164 #ifdef NAMD_FFTW_3
02165     fftwf_execute(backward_plan_x[g]);
02166 #else
02167     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02168         ny * zdim / 2, 1, work, 1, 0);
02169 #endif
02170 #endif
02171   }
02172   
02173   pmeProxyDir[CkMyPe()].sendUntrans();
02174 }
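
compute_energy() evaluates the reciprocal-space part of the Ewald/PME sum on the transformed grid before the backward x FFT is started. For reference, the standard smooth-PME form of that term, with beta the Ewald coefficient, V the cell volume, m ranging over reciprocal lattice vectors, and S(m) the structure factor approximated on the B-spline grid, is:

    E_{\mathrm{rec}} = \frac{1}{2\pi V} \sum_{\mathbf{m}\neq 0}
        \frac{\exp\!\left(-\pi^{2}\mathbf{m}^{2}/\beta^{2}\right)}{\mathbf{m}^{2}}
        \left|S(\mathbf{m})\right|^{2}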

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2342 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02342                                   {
02343   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02344 
02345   // finish backward FFT
02346 #ifdef NAMD_FFTW
02347   for ( int g=0; g<numGrids; ++g ) {
02348 #ifdef NAMD_FFTW_3
02349     fftwf_execute(backward_plan_yz[g]);
02350 #else
02351     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02352         (fftw_complex *) (qgrid + qgrid_size * g),
02353         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02354 #endif
02355   }
02356 
02357 #endif
02358 
02359   pmeProxyDir[CkMyPe()].sendUngrid();
02360 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 861 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00861                                            {
00862   delete msg;
00863 
00864   localInfo = new LocalPmeInfo[CkNumPes()];
00865   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00866   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   gridPeMap = new int[CkNumPes()];
00868   transPeMap = new int[CkNumPes()];
00869   recipPeDest = new int[CkNumPes()];
00870   gridPeOrder = new int[CkNumPes()];
00871   gridNodeOrder = new int[CkNumNodes()];
00872   transNodeOrder = new int[CkNumNodes()];
00873 
00874   if (CkMyRank() == 0) {
00875     pencilPMEProcessors = new char [CkNumPes()];
00876     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00877   }
00878 
00879   SimParameters *simParams = Node::Object()->simParameters;
00880   PatchMap *patchMap = PatchMap::Object();
00881 
00882   offload = simParams->PMEOffload;
00883 #ifdef NAMD_CUDA
00884   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00885     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00886   }
00887   if ( offload ) {
00888     int dev;
00889     cudaGetDevice(&dev);
00890     cuda_errcheck("in cudaGetDevice");
00891     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00892     cudaDeviceProp deviceProp;
00893     cudaGetDeviceProperties(&deviceProp, dev);
00894     cuda_errcheck("in cudaGetDeviceProperties");
00895     if ( deviceProp.major < 2 )
00896       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00897   }
00898 #endif
00899 
00900   alchLambda = -1.;  // illegal value to catch if not updated
00901   useBarrier = simParams->PMEBarrier;
00902 
00903   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00904   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00905   else {
00906     int nrps = simParams->PMEProcessors;
00907     if ( nrps <= 0 ) nrps = CkNumPes();
00908     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00909     int dimx = simParams->PMEGridSizeX;
00910     int dimy = simParams->PMEGridSizeY;
00911     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00912     if ( maxslabs > nrps ) maxslabs = nrps;
00913     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00914                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00915     if ( maxpencils > nrps ) maxpencils = nrps;
00916     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00917     else usePencils = 0;
00918   }
00919 
00920   if ( usePencils ) {
00921     int nrps = simParams->PMEProcessors;
00922     if ( nrps <= 0 ) nrps = CkNumPes();
00923     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00924     if ( simParams->PMEPencils > 1 &&
00925          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00926       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00927     } else {
00928       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00929                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00930       if ( nb2 > nrps ) nb2 = nrps;
00931       if ( nb2 < 1 ) nb2 = 1;
00932       int nb = (int) sqrt((float)nb2);
00933       if ( nb < 1 ) nb = 1;
00934       xBlocks = zBlocks = nb;
00935       yBlocks = nb2 / nb;
00936     }
00937 
00938     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00939     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00940     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00941 
00942     int dimx = simParams->PMEGridSizeX;
00943     int bx = 1 + ( dimx - 1 ) / xBlocks;
00944     xBlocks = 1 + ( dimx - 1 ) / bx;
00945 
00946     int dimy = simParams->PMEGridSizeY;
00947     int by = 1 + ( dimy - 1 ) / yBlocks;
00948     yBlocks = 1 + ( dimy - 1 ) / by;
00949 
00950     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00951     int bz = 1 + ( dimz - 1 ) / zBlocks;
00952     zBlocks = 1 + ( dimz - 1 ) / bz;
00953 
00954     if ( xBlocks * yBlocks > CkNumPes() ) {
00955       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00956     }
00957     if ( xBlocks * zBlocks > CkNumPes() ) {
00958       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00959     }
00960     if ( yBlocks * zBlocks > CkNumPes() ) {
00961       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00962     }
00963 
00964     if ( ! CkMyPe() ) {
00965       iout << iINFO << "PME using " << xBlocks << " x " <<
00966         yBlocks << " x " << zBlocks <<
00967         " pencil grid for FFT and reciprocal sum.\n" << endi;
00968     }
00969   } else { // usePencils
00970 
00971   {  // decide how many pes to use for reciprocal sum
00972 
00973     // rules based on work available
00974     int minslices = simParams->PMEMinSlices;
00975     int dimx = simParams->PMEGridSizeX;
00976     int nrpx = ( dimx + minslices - 1 ) / minslices;
00977     int dimy = simParams->PMEGridSizeY;
00978     int nrpy = ( dimy + minslices - 1 ) / minslices;
00979 
00980     // rules based on processors available
00981     int nrpp = CkNumPes();
00982     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00983     if ( nrpp < nrpx ) nrpx = nrpp;
00984     if ( nrpp < nrpy ) nrpy = nrpp;
00985 
00986     // user override
00987     int nrps = simParams->PMEProcessors;
00988     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00989     if ( nrps > 0 ) nrpx = nrps;
00990     if ( nrps > 0 ) nrpy = nrps;
00991 
00992     // make sure there aren't any totally empty processors
00993     int bx = ( dimx + nrpx - 1 ) / nrpx;
00994     nrpx = ( dimx + bx - 1 ) / bx;
00995     int by = ( dimy + nrpy - 1 ) / nrpy;
00996     nrpy = ( dimy + by - 1 ) / by;
00997     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00998       NAMD_bug("Error in selecting number of PME processors.");
00999     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001 
01002     numGridPes = nrpx;
01003     numTransPes = nrpy;
01004   }
01005   if ( ! CkMyPe() ) {
01006     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01007       " processors for FFT and reciprocal sum.\n" << endi;
01008   }
01009 
01010   int sum_npes = numTransPes + numGridPes;
01011   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01012 
01013 #if 0 // USE_TOPOMAP
01014   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01015   PatchMap * pmap = PatchMap::Object();
01016   
01017   int patch_pes = pmap->numNodesWithPatches();
01018   TopoManager tmgr;
01019   if(tmgr.hasMultipleProcsPerNode())
01020     patch_pes *= 2;
01021 
01022   bool done = false;
01023   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01024     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01025     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01026   }
01027   else 
01028     if(CkNumPes() > 2 *max_npes + patch_pes) {
01029       done = generateBGLORBPmePeList(transPeMap, max_npes);
01030       gridPeMap = transPeMap;
01031     }
01032 
01033   if (!done)
01034 #endif
01035     {
01036       //generatePmePeList(transPeMap, max_npes);
01037       //gridPeMap = transPeMap;
01038       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01039     }
01040   
01041   if ( ! CkMyPe() ) {
01042     iout << iINFO << "PME GRID LOCATIONS:";
01043     int i;
01044     for ( i=0; i<numGridPes && i<10; ++i ) {
01045       iout << " " << gridPeMap[i];
01046     }
01047     if ( i < numGridPes ) iout << " ...";
01048     iout << "\n" << endi;
01049     iout << iINFO << "PME TRANS LOCATIONS:";
01050     for ( i=0; i<numTransPes && i<10; ++i ) {
01051       iout << " " << transPeMap[i];
01052     }
01053     if ( i < numTransPes ) iout << " ...";
01054     iout << "\n" << endi;
01055   }
01056 
01057   // sort based on nodes and physical nodes
01058   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01059 
01060   myGridPe = -1;
01061   myGridNode = -1;
01062   int i = 0;
01063   int node = -1;
01064   int real_node = -1;
01065   for ( i=0; i<numGridPes; ++i ) {
01066     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01067     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01068     int real_node_i = CkNodeOf(gridPeMap[i]);
01069     if ( real_node_i == real_node ) {
01070       gridNodeInfo[node].npe += 1;
01071     } else {
01072       real_node = real_node_i;
01073       ++node;
01074       gridNodeInfo[node].real_node = real_node;
01075       gridNodeInfo[node].pe_start = i;
01076       gridNodeInfo[node].npe = 1;
01077     }
01078     if ( CkMyNode() == real_node_i ) myGridNode = node;
01079   }
01080   numGridNodes = node + 1;
01081   myTransPe = -1;
01082   myTransNode = -1;
01083   node = -1;
01084   real_node = -1;
01085   for ( i=0; i<numTransPes; ++i ) {
01086     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01087     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01088     int real_node_i = CkNodeOf(transPeMap[i]);
01089     if ( real_node_i == real_node ) {
01090       transNodeInfo[node].npe += 1;
01091     } else {
01092       real_node = real_node_i;
01093       ++node;
01094       transNodeInfo[node].real_node = real_node;
01095       transNodeInfo[node].pe_start = i;
01096       transNodeInfo[node].npe = 1;
01097     }
01098     if ( CkMyNode() == real_node_i ) myTransNode = node;
01099   }
01100   numTransNodes = node + 1;
01101 
01102   if ( ! CkMyPe() ) {
01103     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01104          << numTransNodes << " TRANS NODES\n" << endi;
01105   }
01106 
01107   { // generate random orderings for grid and trans messages
01108     int i;
01109     for ( i = 0; i < numGridPes; ++i ) {
01110       gridPeOrder[i] = i;
01111     }
01112     Random rand(CkMyPe());
01113     if ( myGridPe < 0 ) {
01114       rand.reorder(gridPeOrder,numGridPes);
01115     } else {  // self last
01116       gridPeOrder[myGridPe] = numGridPes-1;
01117       gridPeOrder[numGridPes-1] = myGridPe;
01118       rand.reorder(gridPeOrder,numGridPes-1);
01119     } 
01120     for ( i = 0; i < numGridNodes; ++i ) {
01121       gridNodeOrder[i] = i;
01122     }
01123     if ( myGridNode < 0 ) {
01124       rand.reorder(gridNodeOrder,numGridNodes);
01125     } else {  // self last
01126       gridNodeOrder[myGridNode] = numGridNodes-1;
01127       gridNodeOrder[numGridNodes-1] = myGridNode;
01128       rand.reorder(gridNodeOrder,numGridNodes-1);
01129     }
01130     for ( i = 0; i < numTransNodes; ++i ) {
01131       transNodeOrder[i] = i;
01132     }
01133     if ( myTransNode < 0 ) {
01134       rand.reorder(transNodeOrder,numTransNodes);
01135     } else {  // self last
01136       transNodeOrder[myTransNode] = numTransNodes-1;
01137       transNodeOrder[numTransNodes-1] = myTransNode;
01138       rand.reorder(transNodeOrder,numTransNodes-1);
01139     }
01140   }
01141   
01142   } // ! usePencils
01143 
01144   myGrid.K1 = simParams->PMEGridSizeX;
01145   myGrid.K2 = simParams->PMEGridSizeY;
01146   myGrid.K3 = simParams->PMEGridSizeZ;
01147   myGrid.order = simParams->PMEInterpOrder;
01148   myGrid.dim2 = myGrid.K2;
01149   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01150 
01151   if ( ! usePencils ) {
01152     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01153     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01154     myGrid.block3 = myGrid.dim3 / 2;  // complex
01155   }
01156 
01157   if ( usePencils ) {
01158     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01159     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01160     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01161 
01162 
01163       int pe = 0;
01164       int x,y,z;
01165 
01166                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01167                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01168                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01169       
01170                 // decide which pes to use by bit reversal and patch use
01171                 int i;
01172                 int ncpus = CkNumPes();
01173                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01174                 PatchMap *pmap = PatchMap::Object();
01175                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01176                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01177                         if ( ri ) { // keep 0 for special case
01178                                 // pretend pe 1 has patches to avoid placing extra PME load on node
01179                                 if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
01180                                 else nopatches.add(ri);
01181                         }
01182                 }
01183 
01184 #if USE_RANDOM_TOPO
01185             Random rand(CkMyPe());
01186             int *tmp = new int[patches.size()];
01187             int nn = patches.size();
01188             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01189             rand.reorder(tmp, nn);
01190             patches.resize(0);
01191             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01192             delete [] tmp;
01193             tmp = new int[nopatches.size()];
01194             nn = nopatches.size();
01195             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01196             rand.reorder(tmp, nn);
01197             nopatches.resize(0);
01198             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01199             delete [] tmp;
01200 #endif
01201 
01202                 // only use zero if it eliminates overloading or has patches
01203                 int useZero = 0;
01204                 int npens = xBlocks*yBlocks;
01205                 if ( npens % ncpus == 0 ) useZero = 1;
01206                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01207                 npens += xBlocks*zBlocks;
01208                 if ( npens % ncpus == 0 ) useZero = 1;
01209                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01210                 npens += yBlocks*zBlocks;
01211                 if ( npens % ncpus == 0 ) useZero = 1;
01212                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01213 
01214                 // add nopatches then patches in reversed order
01215                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01216                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01217                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01218                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01219   
01220                 int npes = pmeprocs.size();
01221                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01222                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01223 #if !USE_RANDOM_TOPO
01224                 zprocs.sort();
01225 #endif
01226                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01227                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01228 #if !USE_RANDOM_TOPO
01229                 yprocs.sort();
01230 #endif
01231       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01232       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01233 #if !USE_RANDOM_TOPO
01234       xprocs.sort();
01235 #endif
01236 
01237 #if USE_TOPO_SFC
01238   CmiLock(tmgr_lock);
01239   //{
01240   TopoManager tmgr;
01241   int xdim = tmgr.getDimNX();
01242   int ydim = tmgr.getDimNY();
01243   int zdim = tmgr.getDimNZ();
01244   int xdim1 = find_level_grid(xdim);
01245   int ydim1 = find_level_grid(ydim);
01246   int zdim1 = find_level_grid(zdim);
01247   if(CkMyPe() == 0)
01248       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01249 
01250   vector<Coord> result;
01251   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01252   sort_sfc(xprocs, tmgr, result);
01253   sort_sfc(yprocs, tmgr, result);
01254   sort_sfc(zprocs, tmgr, result);
01255   //}
01256   CmiUnlock(tmgr_lock);
01257 #endif
01258 
01259 
01260                 if(CkMyPe() == 0){  
01261               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01262           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01263 #if USE_TOPO_SFC
01264               int x,y,z,t;
01265               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01266               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01267 #else
01268               iout << " " << zprocs[i];
01269 #endif
01270           }
01271           if ( i < zprocs.size() ) iout << " ...";
01272               iout << "\n" << endi;
01273                 }
01274 
01275     if (CkMyRank() == 0) {
01276       for (pe=0, x = 0; x < xBlocks; ++x)
01277         for (y = 0; y < yBlocks; ++y, ++pe ) {
01278           pencilPMEProcessors[zprocs[pe]] = 1;
01279         }
01280     }
01281      
01282                 if(CkMyPe() == 0){  
01283               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01284           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01285 #if USE_TOPO_SFC
01286               int x,y,z,t;
01287               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01288               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01289 #else
01290               iout << " " << yprocs[i];
01291 #endif
01292           }
01293           if ( i < yprocs.size() ) iout << " ...";
01294               iout << "\n" << endi;
01295                 }
01296 
01297     if (CkMyRank() == 0) {
01298       for (pe=0, z = 0; z < zBlocks; ++z )
01299         for (x = 0; x < xBlocks; ++x, ++pe ) {
01300           pencilPMEProcessors[yprocs[pe]] = 1;
01301         }
01302     }
01303     
01304                 if(CkMyPe() == 0){  
01305                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01306                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01307 #if USE_TOPO_SFC
01308                 int x,y,z,t;
01309                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01310                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01311 #else
01312                 iout << " " << xprocs[i];
01313 #endif
01314             }
01315                 if ( i < xprocs.size() ) iout << " ...";
01316                 iout << "\n" << endi;
01317                 }
01318 
01319     if (CkMyRank() == 0) {
01320       for (pe=0, y = 0; y < yBlocks; ++y )      
01321         for (z = 0; z < zBlocks; ++z, ++pe ) {
01322           pencilPMEProcessors[xprocs[pe]] = 1;
01323         }
01324     }
01325         
01326 
01327         // creating the pencil arrays
01328         if ( CkMyPe() == 0 ){
01329 #if !USE_RANDOM_TOPO
01330         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01331         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01332         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01333         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01334 #endif
01335 #if 1
01336         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01337         CProxy_PmePencilMap ym;
01338         if ( simParams->PMEPencilsYLayout )
01339           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01340         else
01341           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01342         CProxy_PmePencilMap xm;
01343         if ( simParams->PMEPencilsXLayout )
01344           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01345         else
01346           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01347         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01348         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01349         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01350         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01351         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01352         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01353         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01354         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01355         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01356         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01357 #else
01358         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01359         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01360         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01361 
01362                 for (pe=0, x = 0; x < xBlocks; ++x)
01363                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01364                                 zPencil(x,y,0).insert(zprocs[pe]);
01365                         }
01366         zPencil.doneInserting();
01367 
01368                 for (pe=0, x = 0; x < xBlocks; ++x)
01369                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01370                                 yPencil(x,0,z).insert(yprocs[pe]);
01371                         }
01372         yPencil.doneInserting();
01373 
01374 
01375                 for (pe=0, y = 0; y < yBlocks; ++y )    
01376                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01377                                 xPencil(0,y,z).insert(xprocs[pe]);
01378                         }
01379                 xPencil.doneInserting();     
01380 #endif
01381 
01382                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01383                 PmePencilInitMsgData msgdata;
01384                 msgdata.grid = myGrid;
01385                 msgdata.xBlocks = xBlocks;
01386                 msgdata.yBlocks = yBlocks;
01387                 msgdata.zBlocks = zBlocks;
01388                 msgdata.xPencil = xPencil;
01389                 msgdata.yPencil = yPencil;
01390                 msgdata.zPencil = zPencil;
01391                 msgdata.pmeProxy = pmeProxyDir;
01392         msgdata.pmeNodeProxy = pmeNodeProxy;
01393         msgdata.xm = xm;
01394         msgdata.ym = ym;
01395         msgdata.zm = zm;
01396                 xPencil.init(new PmePencilInitMsg(msgdata));
01397                 yPencil.init(new PmePencilInitMsg(msgdata));
01398                 zPencil.init(new PmePencilInitMsg(msgdata));
01399         }
01400 
01401     return;  // continue in initialize_pencils() at next startup stage
01402   }
01403 
01404 
01405   int pe;
01406   int nx = 0;
01407   for ( pe = 0; pe < numGridPes; ++pe ) {
01408     localInfo[pe].x_start = nx;
01409     nx += myGrid.block1;
01410     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01411     localInfo[pe].nx = nx - localInfo[pe].x_start;
01412   }
01413   int ny = 0;
01414   for ( pe = 0; pe < numTransPes; ++pe ) {
01415     localInfo[pe].y_start_after_transpose = ny;
01416     ny += myGrid.block2;
01417     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01418     localInfo[pe].ny_after_transpose =
01419                         ny - localInfo[pe].y_start_after_transpose;
01420   }
01421 
01422   {  // decide how many pes this node exchanges charges with
01423 
01424   PatchMap *patchMap = PatchMap::Object();
01425   Lattice lattice = simParams->lattice;
01426   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01427   BigReal cutoff = simParams->cutoff;
01428   BigReal patchdim = simParams->patchDimension;
01429   int numPatches = patchMap->numPatches();
01430   int numNodes = CkNumPes();
01431   int *source_flags = new int[numNodes];
01432   int node;
01433   for ( node=0; node<numNodes; ++node ) {
01434     source_flags[node] = 0;
01435     recipPeDest[node] = 0;
01436   }
01437 
01438   // // make sure that we don't get ahead of ourselves on this node
01439   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01440   //   source_flags[CkMyPe()] = 1;
01441   //   recipPeDest[myRecipPe] = 1;
01442   // }
01443 
01444   for ( int pid=0; pid < numPatches; ++pid ) {
01445     int pnode = patchMap->node(pid);
01446 #ifdef NAMD_CUDA
01447     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01448 #endif
01449     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01450     BigReal minx = patchMap->min_a(pid);
01451     BigReal maxx = patchMap->max_a(pid);
01452     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01453     // min1 (max1) is smallest (largest) grid line for this patch
01454     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01455     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01456     for ( int i=min1; i<=max1; ++i ) {
01457       int ix = i;
01458       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01459       while ( ix < 0 ) ix += myGrid.K1;
01460       // set source_flags[pnode] if this patch sends to our node
01461       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01462            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01463         source_flags[pnode] = 1;
01464       }
01465       // set dest_flags[] for node that our patch sends to
01466 #ifdef NAMD_CUDA
01467       if ( offload ) {
01468         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01469           recipPeDest[ix / myGrid.block1] = 1;
01470         }
01471       } else
01472 #endif
01473       if ( pnode == CkMyPe() ) {
01474         recipPeDest[ix / myGrid.block1] = 1;
01475       }
01476     }
01477   }
01478 
01479   int numSourcesSamePhysicalNode = 0;
01480   numSources = 0;
01481   numDestRecipPes = 0;
01482   for ( node=0; node<numNodes; ++node ) {
01483     if ( source_flags[node] ) ++numSources;
01484     if ( recipPeDest[node] ) ++numDestRecipPes;
01485     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01486   }
01487 
01488 #if 0
01489   if ( numSources ) {
01490     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01491             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01492     iout << iINFO << "PME " << CkMyPe() << " sources:";
01493     for ( node=0; node<numNodes; ++node ) {
01494       if ( source_flags[node] ) iout << " " << node;
01495     }
01496     iout << "\n" << endi;
01497   }
01498 #endif
01499 
01500   delete [] source_flags;
01501 
01502   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01503   //           CkMyPe(), numSources, numDestRecipPes);
01504 
01505   }  // decide how many pes this node exchanges charges with (end)
01506 
01507   ungrid_count = numDestRecipPes;
01508 
01509   sendTransBarrier_received = 0;
01510 
01511   if ( myGridPe < 0 && myTransPe < 0 ) return;
01512   // the following only for nodes doing reciprocal sum
01513 
01514   if ( myTransPe >= 0 ) {
01515     recipEvirPe = findRecipEvirPe();
01516     pmeProxy[recipEvirPe].addRecipEvirClient();
01517   }
01518 
01519   if ( myTransPe >= 0 ) {
01520       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01521       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01522       #ifdef OPENATOM_VERSION
01523       if ( simParams->openatomOn ) { 
01524         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01525         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01526       } else {
01527         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01528       }
01529       #else  // OPENATOM_VERSION
01530       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01531       #endif // OPENATOM_VERSION
01532   }
01533 
01534   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01535   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01536   if ( local_size < local_size_2 ) local_size = local_size_2;
01537   qgrid = new float[local_size*numGrids];
01538   if ( numGridPes > 1 || numTransPes > 1 ) {
01539     kgrid = new float[local_size*numGrids];
01540   } else {
01541     kgrid = qgrid;
01542   }
01543   qgrid_size = local_size;
01544 
01545   if ( myGridPe >= 0 ) {
01546   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01547   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01548   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01549   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01550   }
01551 
01552   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01553 #ifdef NAMD_FFTW
01554   CmiLock(fftw_plan_lock);
01555 #ifdef NAMD_FFTW_3
01556   work = new fftwf_complex[n[0]];
01557   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01558   if ( myGridPe >= 0 ) {
01559     forward_plan_yz=new fftwf_plan[numGrids];
01560     backward_plan_yz=new fftwf_plan[numGrids];
01561   }
01562   if ( myTransPe >= 0 ) {
01563     forward_plan_x=new fftwf_plan[numGrids];
01564     backward_plan_x=new fftwf_plan[numGrids];
01565   }
01566   /* need one plan per grid */
01567   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01568   if ( myGridPe >= 0 ) {
01569     for( int g=0; g<numGrids; g++)
01570       {
01571         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01572                                                      localInfo[myGridPe].nx,
01573                                                      qgrid + qgrid_size * g,
01574                                                      NULL,
01575                                                      1,
01576                                                      myGrid.dim2 * myGrid.dim3,
01577                                                      (fftwf_complex *) 
01578                                                      (qgrid + qgrid_size * g),
01579                                                      NULL,
01580                                                      1,
01581                                                      myGrid.dim2 * (myGrid.dim3/2),
01582                                                      fftwFlags);
01583       }
01584   }
01585   int zdim = myGrid.dim3;
01586   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01587   if ( ! CkMyPe() ) iout << " 2..." << endi;
01588   if ( myTransPe >= 0 ) {
01589     for( int g=0; g<numGrids; g++)
01590       {
01591 
01592         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01593                                                 (fftwf_complex *)
01594                                                 (kgrid+qgrid_size*g),
01595                                                 NULL,
01596                                                 xStride,
01597                                                 1,
01598                                                 (fftwf_complex *)
01599                                                 (kgrid+qgrid_size*g),
01600                                                 NULL,
01601                                                 xStride,
01602                                                 1,
01603                                                 FFTW_FORWARD,fftwFlags);
01604         
01605       }
01606   }
01607   if ( ! CkMyPe() ) iout << " 3..." << endi;
01608   if ( myTransPe >= 0 ) {
01609     for( int g=0; g<numGrids; g++)
01610       {
01611         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01612                                                  (fftwf_complex *)
01613                                                  (kgrid+qgrid_size*g),
01614                                                  NULL,
01615                                                  xStride,
01616                                                  1,
01617                                                  (fftwf_complex *)
01618                                                  (kgrid+qgrid_size*g),
01619                                                  NULL,
01620                                                  xStride,
01621                                                  1,
01622                                                  FFTW_BACKWARD, fftwFlags);
01623 
01624       }
01625   }
01626   if ( ! CkMyPe() ) iout << " 4..." << endi;
01627   if ( myGridPe >= 0 ) {
01628     for( int g=0; g<numGrids; g++)
01629       {
01630         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01631                                                       localInfo[myGridPe].nx,
01632                                                       (fftwf_complex *)
01633                                                       (qgrid + qgrid_size * g),
01634                                                       NULL,
01635                                                       1,
01636                                                       myGrid.dim2*(myGrid.dim3/2),
01637                                                       qgrid + qgrid_size * g,
01638                                                       NULL,
01639                                                       1,
01640                                                       myGrid.dim2 * myGrid.dim3,
01641                                                       fftwFlags);
01642       }
01643   }
01644   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01645 
01646 #else
01647   work = new fftw_complex[n[0]];
01648 
01649   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01650   if ( myGridPe >= 0 ) {
01651   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01652         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01653         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01654   }
01655   if ( ! CkMyPe() ) iout << " 2..." << endi;
01656   if ( myTransPe >= 0 ) {
01657       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01658         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01659         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01660         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01661   }
01662   if ( ! CkMyPe() ) iout << " 3..." << endi;
01663   if ( myTransPe >= 0 ) {
01664   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01665         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01666         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01667         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01668   }
01669   if ( ! CkMyPe() ) iout << " 4..." << endi;
01670   if ( myGridPe >= 0 ) {
01671   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01672         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01673         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01674   }
01675   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01676 #endif
01677   CmiUnlock(fftw_plan_lock);
01678 #else
01679   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01680 #endif
01681 
01682   if ( myGridPe >= 0 && numSources == 0 )
01683                 NAMD_bug("PME grid elements exist without sources.");
01684   grid_count = numSources;
01685   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01686   trans_count = numGridPes;
01687 }
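
The x-direction plans above use FFTW's advanced interface to batch xStride interleaved 1-D transforms of length K1, in place, over the transposed k-space grid; plan creation is serialized with fftw_plan_lock because the FFTW planner is not thread-safe. The fragment below is a minimal standalone sketch of that fftwf_plan_many_dft call, not NAMD's code: it assumes only that single-precision FFTW 3 is installed (link with -lfftw3f), and the grid length, batch count, and FFTW_ESTIMATE flag are illustrative.

  // Minimal sketch of the batched, strided, in-place transform planned above.
  // Assumes single-precision FFTW 3 is available; link with -lfftw3f.
  #include <fftw3.h>
  #include <cstdio>

  int main() {
    const int K1 = 8;      // transform length, n[0] in the listing
    const int batch = 4;   // stands in for xStride: interleaved lines per plan
    fftwf_complex *grid =
        (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex) * K1 * batch);
    for (int i = 0; i < K1 * batch; ++i) { grid[i][0] = 1.0f; grid[i][1] = 0.0f; }

    int n[1] = { K1 };
    // istride == batch, idist == 1: element j of line b lives at grid[j*batch + b],
    // the same interleaving the plans above request via xStride.
    fftwf_plan plan = fftwf_plan_many_dft(1, n, batch,
                                          grid, NULL, batch, 1,
                                          grid, NULL, batch, 1,
                                          FFTW_FORWARD, FFTW_ESTIMATE);
    fftwf_execute(plan);
    std::printf("DC term of line 0: %g (expect %d for all-ones input)\n",
                (double) grid[0][0], K1);
    fftwf_destroy_plan(plan);
    fftwf_free(grid);
    return 0;
  }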

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2704 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02704                                         {
02705 
02706   noWorkCount = 0;
02707   doWorkCount = 0;
02708   ungridForcesCount = 0;
02709 
02710   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02711 
02712   SimParameters *simParams = Node::Object()->simParameters;
02713 
02714   strayChargeErrors = 0;
02715 
02716 #ifdef NAMD_CUDA
02717  PatchMap *patchMap = PatchMap::Object();
02718  int pe = master_pe = CkNodeFirst(CkMyNode());
02719  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02720     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02721     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02722     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02723     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02724     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02725         && pe != deviceCUDA->getMasterPe() ) {
02726       master_pe = pe;
02727     }
02728  }
02729  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02730    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02731  }
02732 
02733  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02734  bool cudaFirst = 1;
02735  if ( offload ) {
02736   CmiLock(cuda_lock);
02737   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02738  }
02739 
02740  if ( cudaFirst ) {
02741   nodePmeMgr->master_pe = master_pe;
02742   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02743  }
02744 #endif
02745 
02746   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02747   fsize = myGrid.K1 * myGrid.dim2;
02748   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02749 #ifdef NAMD_CUDA
02750  if ( ! offload )
02751 #endif
02752  {
02753   q_arr = new float*[fsize*numGrids];
02754   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02755   q_list = new float*[fsize*numGrids];
02756   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02757   q_count = 0;
02758  }
02759 
02760 #ifdef NAMD_CUDA
02761  if ( cudaFirst || ! offload ) {
02762 #endif
02763   f_arr = new char[fsize*numGrids];
02764   // memset to non-zero value has race condition on BlueGene/Q
02765   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02766   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02767 
02768   for ( int g=0; g<numGrids; ++g ) {
02769     char *f = f_arr + g*fsize;
02770     if ( usePencils ) {
02771       int K1 = myGrid.K1;
02772       int K2 = myGrid.K2;
02773       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02774       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02775       int dim2 = myGrid.dim2;
02776       for (int ap=0; ap<numPencilsActive; ++ap) {
02777         int ib = activePencils[ap].i;
02778         int jb = activePencils[ap].j;
02779         int ibegin = ib*block1;
02780         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02781         int jbegin = jb*block2;
02782         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02783         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02784         for ( int i=ibegin; i<iend; ++i ) {
02785           for ( int j=jbegin; j<jend; ++j ) {
02786             f[i*dim2+j] = 0;
02787           }
02788         }
02789       }
02790     } else {
02791       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02792       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02793       for (int pe=0; pe<numGridPes; pe++) {
02794         if ( ! recipPeDest[pe] ) continue;
02795         int start = pe * bsize;
02796         int len = bsize;
02797         if ( start >= qsize ) { start = 0; len = 0; }
02798         if ( start + len > qsize ) { len = qsize - start; }
02799         int zdim = myGrid.dim3;
02800         int fstart = start / zdim;
02801         int flen = len / zdim;
02802         memset(f + fstart, 0, flen*sizeof(char));
02803         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02804       }
02805     }
02806   }
02807 #ifdef NAMD_CUDA
02808  }
02809  if ( offload ) {
02810  cudaSetDevice(deviceCUDA->getDeviceID());
02811  if ( cudaFirst ) {
02812 
02813   int f_alloc_count = 0;
02814   for ( int n=fsize, i=0; i<n; ++i ) {
02815     if ( f_arr[i] == 0 ) {
02816       ++f_alloc_count;
02817     }
02818   }
02819   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02820 
02821   q_arr = new float*[fsize*numGrids];
02822   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02823 
02824   float **q_arr_dev_host = new float*[fsize];
02825   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02826 
02827   float **v_arr_dev_host = new float*[fsize];
02828   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02829 
02830   int q_stride = myGrid.K3+myGrid.order-1;
02831   q_data_size = f_alloc_count * q_stride * sizeof(float);
02832   ffz_size = (fsize + q_stride) * sizeof(int);
02833 
02834   // tack ffz onto end of q_data to allow merged transfer
02835   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02836   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02837   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02838   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02839   cudaMalloc((void**) &v_data_dev, q_data_size);
02840   cuda_errcheck("malloc grid data for pme");
02841   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02842   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02843   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02844   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02845   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02846 
02847   f_alloc_count = 0;
02848   for ( int n=fsize, i=0; i<n; ++i ) {
02849     if ( f_arr[i] == 0 ) {
02850       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02851       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02852       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02853       ++f_alloc_count;
02854     } else {
02855       q_arr[i] = 0;
02856       q_arr_dev_host[i] = 0;
02857       v_arr_dev_host[i] = 0;
02858     }
02859   }
02860 
02861   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02862   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02863   delete [] q_arr_dev_host;
02864   delete [] v_arr_dev_host;
02865   delete [] f_arr;
02866   f_arr = new char[fsize + q_stride];
02867   fz_arr = f_arr + fsize;
02868   memset(f_arr, 0, fsize + q_stride);
02869   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02870 
02871   cuda_errcheck("initialize grid data for pme");
02872 
02873   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02874   cuda_errcheck("initialize bspline coefficients for pme");
02875 
02876 #define XCOPY(X) masterPmeMgr->X = X;
02877   XCOPY(bspline_coeffs_dev)
02878   XCOPY(bspline_dcoeffs_dev)
02879   XCOPY(q_arr)
02880   XCOPY(q_arr_dev)
02881   XCOPY(v_arr_dev)
02882   XCOPY(q_data_size)
02883   XCOPY(q_data_host)
02884   XCOPY(q_data_dev)
02885   XCOPY(v_data_dev)
02886   XCOPY(ffz_size)
02887   XCOPY(ffz_host)
02888   XCOPY(ffz_dev)
02889   XCOPY(f_arr)
02890   XCOPY(fz_arr)
02891 #undef XCOPY
02892   //CkPrintf("pe %d init first\n", CkMyPe());
02893  } else { // cudaFirst
02894   //CkPrintf("pe %d init later\n", CkMyPe());
02895 #define XCOPY(X) X = masterPmeMgr->X;
02896   XCOPY(bspline_coeffs_dev)
02897   XCOPY(bspline_dcoeffs_dev)
02898   XCOPY(q_arr)
02899   XCOPY(q_arr_dev)
02900   XCOPY(v_arr_dev)
02901   XCOPY(q_data_size)
02902   XCOPY(q_data_host)
02903   XCOPY(q_data_dev)
02904   XCOPY(v_data_dev)
02905   XCOPY(ffz_size)
02906   XCOPY(ffz_host)
02907   XCOPY(ffz_dev)
02908   XCOPY(f_arr)
02909   XCOPY(fz_arr)
02910 #undef XCOPY
02911  } // cudaFirst
02912   CmiUnlock(cuda_lock);
02913  } else // offload
02914 #endif // NAMD_CUDA
02915  {
02916   fz_arr = new char[myGrid.K3+myGrid.order-1];
02917  }
02918 
02919 #if 0 && USE_PERSISTENT
02920   recvGrid_handle = NULL;
02921 #endif
02922 }
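
In the slab (non-pencil) branch above, the grid is divided into contiguous blocks of x-planes, one per grid PE: block1 comes from a ceiling division and the final block is clamped so that start + len never runs past qsize. The following self-contained sketch reproduces that arithmetic; the grid dimensions and PE count are invented for illustration and are not NAMD defaults.

  // Sketch of the slab decomposition used in the non-pencil branch above.
  // K1, dim2, dim3, and numGridPes are illustrative, not NAMD defaults.
  #include <cstdio>

  int main() {
    const int K1 = 54, dim2 = 54, dim3 = 56;   // padded charge-grid dimensions
    const int numGridPes = 7;
    const int block1 = (K1 + numGridPes - 1) / numGridPes;   // ceil(K1/numGridPes)
    const long qsize = (long) K1 * dim2 * dim3;
    const long bsize = (long) block1 * dim2 * dim3;

    for (int pe = 0; pe < numGridPes; ++pe) {
      long start = pe * bsize;
      long len = bsize;
      if (start >= qsize) { start = 0; len = 0; }        // PE falls past the grid
      if (start + len > qsize) { len = qsize - start; }  // clamp the last slab
      if (len == 0) { std::printf("grid pe %d: no slab\n", pe); continue; }
      std::printf("grid pe %d: x-planes %ld..%ld (%ld grid points)\n",
                  pe, start / ((long) dim2 * dim3),
                  (start + len) / ((long) dim2 * dim3) - 1, len);
    }
    return 0;
  }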

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1691 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01691                                                    {
01692   delete msg;
01693   if ( ! usePencils ) return;
01694 
01695   SimParameters *simParams = Node::Object()->simParameters;
01696 
01697   PatchMap *patchMap = PatchMap::Object();
01698   Lattice lattice = simParams->lattice;
01699   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01700   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01701   BigReal cutoff = simParams->cutoff;
01702   BigReal patchdim = simParams->patchDimension;
01703   int numPatches = patchMap->numPatches();
01704 
01705   pencilActive = new char[xBlocks*yBlocks];
01706   for ( int i=0; i<xBlocks; ++i ) {
01707     for ( int j=0; j<yBlocks; ++j ) {
01708       pencilActive[i*yBlocks+j] = 0;
01709     }
01710   }
01711 
01712   for ( int pid=0; pid < numPatches; ++pid ) {
01713     int pnode = patchMap->node(pid);
01714 #ifdef NAMD_CUDA
01715     if ( offload ) {
01716       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01717     } else
01718 #endif
01719     if ( pnode != CkMyPe() ) continue;
01720 
01721     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01722     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01723 
01724     BigReal minx = patchMap->min_a(pid);
01725     BigReal maxx = patchMap->max_a(pid);
01726     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01727     // min1 (max1) is smallest (largest) grid line for this patch
01728     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01729     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01730 
01731     BigReal miny = patchMap->min_b(pid);
01732     BigReal maxy = patchMap->max_b(pid);
01733     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01734     // min2 (max2) is smallest (largest) grid line for this patch
01735     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01736     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01737 
01738     for ( int i=min1; i<=max1; ++i ) {
01739       int ix = i;
01740       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01741       while ( ix < 0 ) ix += myGrid.K1;
01742       for ( int j=min2; j<=max2; ++j ) {
01743         int jy = j;
01744         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01745         while ( jy < 0 ) jy += myGrid.K2;
01746         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01747       }
01748     }
01749   }
01750 
01751   numPencilsActive = 0;
01752   for ( int i=0; i<xBlocks; ++i ) {
01753     for ( int j=0; j<yBlocks; ++j ) {
01754       if ( pencilActive[i*yBlocks+j] ) {
01755         ++numPencilsActive;
01756 #ifdef NAMD_CUDA
01757         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01758 #endif
01759         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01760       }
01761     }
01762   }
01763   activePencils = new ijpair[numPencilsActive];
01764   numPencilsActive = 0;
01765   for ( int i=0; i<xBlocks; ++i ) {
01766     for ( int j=0; j<yBlocks; ++j ) {
01767       if ( pencilActive[i*yBlocks+j] ) {
01768         activePencils[numPencilsActive++] = ijpair(i,j);
01769       }
01770     }
01771   }
01772   if ( simParams->PMESendOrder ) {
01773     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01774   } else {
01775     Random rand(CkMyPe());
01776     rand.reorder(activePencils,numPencilsActive);
01777   }
01778   //if ( numPencilsActive ) {
01779   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01780   //}
01781 
01782   ungrid_count = numPencilsActive;
01783 }
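
Each patch's grid-line range can extend past either end of the periodic cell, so the loop above wraps every candidate index back into [0, K) before dividing by the block size to find the owning pencil. A small standalone sketch of that wrap-and-bin mapping (the grid length, block count, and sample indices are made up):

  // Sketch of the wrap-and-bin step that marks pencils active above.
  #include <cstdio>

  // Wrap an index into [0, K); the while loops mirror the listing and cope with
  // indices shifted by the interpolation order or pushed out by the patch margin.
  static int wrap(int i, int K) {
    while (i >= K) i -= K;
    while (i < 0)  i += K;
    return i;
  }

  int main() {
    const int K1 = 48, xBlocks = 4;
    const int block1 = (K1 + xBlocks - 1) / xBlocks;   // analogue of myGrid.block1
    const int samples[] = { -3, 0, 11, 47, 50 };       // made-up grid-line indices
    for (int s : samples) {
      int ix = wrap(s, K1);
      std::printf("grid line %3d wraps to %2d -> x pencil block %d\n",
                  s, ix, ix / block1);
    }
    return 0;
  }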

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3507 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03507                                         {
03508 #ifdef NAMD_CUDA
03509   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03510   CUDA_POLL(cuda_check_pme_charges,this);
03511 #else
03512   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03513 #endif
03514 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2640 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02640                                     {
02641 #ifdef NAMD_CUDA
02642   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02643   CUDA_POLL(cuda_check_pme_forces,this);
02644 #else
02645   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02646 #endif
02647 }

void ComputePmeMgr::procTrans ( PmeTransMsg *  ) 

Definition at line 2046 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02046                                               {
02047   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02048   if ( trans_count == numGridPes ) {
02049     lattice = msg->lattice;
02050     grid_sequence = msg->sequence;
02051   }
02052 
02053  if ( msg->nx ) {
02054   int zdim = myGrid.dim3;
02055   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02056   int first_pe = nodeInfo.pe_start;
02057   int last_pe = first_pe+nodeInfo.npe-1;
02058   int y_skip = localInfo[myTransPe].y_start_after_transpose
02059              - localInfo[first_pe].y_start_after_transpose;
02060   int ny_msg = localInfo[last_pe].y_start_after_transpose
02061              + localInfo[last_pe].ny_after_transpose
02062              - localInfo[first_pe].y_start_after_transpose;
02063   int ny = localInfo[myTransPe].ny_after_transpose;
02064   int x_start = msg->x_start;
02065   int nx = msg->nx;
02066   for ( int g=0; g<numGrids; ++g ) {
02067     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02068         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02069         nx*ny*zdim*sizeof(float));
02070   }
02071  }
02072 
02073   --trans_count;
02074 
02075   if ( trans_count == 0 ) {
02076     pmeProxyDir[CkMyPe()].gridCalc2();
02077   }
02078 }

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  ) 

Definition at line 2300 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02300                                                   {
02301   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02302 
02303 #if CMK_BLUEGENEL
02304   CmiNetworkProgressAfter (0);
02305 #endif
02306 
02307   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02308   int first_pe = nodeInfo.pe_start;
02309   int g;
02310 
02311  if ( msg->ny ) {
02312   int zdim = myGrid.dim3;
02313   int last_pe = first_pe+nodeInfo.npe-1;
02314   int x_skip = localInfo[myGridPe].x_start
02315              - localInfo[first_pe].x_start;
02316   int nx_msg = localInfo[last_pe].x_start
02317              + localInfo[last_pe].nx
02318              - localInfo[first_pe].x_start;
02319   int nx = localInfo[myGridPe].nx;
02320   int y_start = msg->y_start;
02321   int ny = msg->ny;
02322   int slicelen = myGrid.K2 * zdim;
02323   int cpylen = ny * zdim;
02324   for ( g=0; g<numGrids; ++g ) {
02325     float *q = qgrid + qgrid_size * g + y_start * zdim;
02326     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02327     for ( int x = 0; x < nx; ++x ) {
02328       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02329       q += slicelen;
02330       qmsg += cpylen;
02331     }
02332   }
02333  }
02334 
02335   --untrans_count;
02336 
02337   if ( untrans_count == 0 ) {
02338     pmeProxyDir[CkMyPe()].gridCalc3();
02339   }
02340 }
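
The copy loop above scatters the packed message back into qgrid one x-row at a time: rows in the message are cpylen = ny*dim3 floats apart, while rows in the local grid are slicelen = K2*dim3 floats apart, so a plain memcpy per row with two different strides is used. A hedged, standalone sketch of that two-stride row copy (all dimensions are invented):

  // Sketch of copying packed rows (stride cpylen) into the wider local grid
  // (row stride slicelen), as in the procUntrans() loop above.
  #include <cstring>
  #include <cstdio>
  #include <vector>

  int main() {
    const int nx = 3, ny = 2, K2 = 5, zdim = 4;   // illustrative sizes
    const int slicelen = K2 * zdim;               // destination row stride
    const int cpylen   = ny * zdim;               // packed-message row stride
    const int y_start  = 1;                       // first y plane received here

    std::vector<float> qgrid(nx * slicelen, 0.0f);
    std::vector<float> qmsg(nx * cpylen);
    for (std::size_t i = 0; i < qmsg.size(); ++i) qmsg[i] = 1.0f + i;  // fake payload

    float *q = qgrid.data() + y_start * zdim;     // offset into each x row
    const float *m = qmsg.data();
    for (int x = 0; x < nx; ++x) {
      std::memcpy(q, m, cpylen * sizeof(float));  // one contiguous (y,z) chunk
      q += slicelen;                              // next x row of the full grid
      m += cpylen;                                // next x row of the message
    }
    std::printf("qgrid[%d] = %g (first copied element)\n",
                y_start * zdim, (double) qgrid[y_start * zdim]);
    return 0;
  }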

void ComputePmeMgr::recvAck ( PmeAckMsg *  ) 

Definition at line 2442 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02442                                           {
02443   if ( msg ) delete msg;
02444 #ifdef NAMD_CUDA
02445   if ( offload ) {
02446     CmiLock(cuda_lock);
02447     if ( ungrid_count == 0 ) {
02448       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02449     }
02450     int uc = --ungrid_count;
02451     CmiUnlock(cuda_lock);
02452 
02453     if ( uc == 0 ) {
02454       pmeProxyDir[master_pe].ungridCalc();
02455     }
02456     return;
02457   }
02458 #endif
02459   --ungrid_count;
02460 
02461   if ( ungrid_count == 0 ) {
02462     pmeProxyDir[CkMyPe()].ungridCalc();
02463   }
02464 }
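
In the offload path the ungrid countdown is shared by every manager on the node, so the decrement is taken under cuda_lock and only the caller that observes zero forwards the work to ungridCalc() on master_pe. The sketch below reproduces that pattern with std::mutex standing in for CmiNodeLock and a placeholder trigger() standing in for the proxy call; compile with -pthread.

  // Sketch of the node-shared countdown in recvAck(); std::mutex stands in for
  // CmiNodeLock and trigger() for pmeProxyDir[master_pe].ungridCalc().
  #include <cstdio>
  #include <mutex>
  #include <thread>
  #include <vector>

  static std::mutex node_lock;   // plays the role of cuda_lock
  static int ungrid_count = 8;   // one expected ack per source

  static void trigger() { std::printf("all acks in: start ungridCalc()\n"); }

  static void recv_ack() {
    node_lock.lock();
    int uc = --ungrid_count;     // capture the new value while holding the lock
    node_lock.unlock();
    if (uc == 0) trigger();      // exactly one caller observes zero
  }

  int main() {
    std::vector<std::thread> ranks;
    for (int i = 0; i < 8; ++i) ranks.emplace_back(recv_ack);
    for (std::thread &t : ranks) t.join();
    return 0;
  }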

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 799 of file ComputePme.C.

00800                                                                        {
00801   xPencil = x;  yPencil = y;  zPencil = z;
00802   
00803     if(CmiMyRank()==0)
00804     {
00805       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00806       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00807       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00808     }
00809 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3516 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03516                                         {
03517   chargeGridReady(*saved_lattice,saved_sequence);
03518 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  ) 

Definition at line 1825 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01825                                             {
01826   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01827   if ( grid_count == 0 ) {
01828     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01829   }
01830   if ( grid_count == numSources ) {
01831     lattice = msg->lattice;
01832     grid_sequence = msg->sequence;
01833   }
01834 
01835   int zdim = myGrid.dim3;
01836   int zlistlen = msg->zlistlen;
01837   int *zlist = msg->zlist;
01838   float *qmsg = msg->qgrid;
01839   for ( int g=0; g<numGrids; ++g ) {
01840     char *f = msg->fgrid + fgrid_len * g;
01841     float *q = qgrid + qgrid_size * g;
01842     for ( int i=0; i<fgrid_len; ++i ) {
01843       if ( f[i] ) {
01844         for ( int k=0; k<zlistlen; ++k ) {
01845           q[zlist[k]] += *(qmsg++);
01846         }
01847       }
01848       q += zdim;
01849     }
01850   }
01851 
01852   gridmsg_reuse[numSources-grid_count] = msg;
01853   --grid_count;
01854 
01855   if ( grid_count == 0 ) {
01856     pmeProxyDir[CkMyPe()].gridCalc1();
01857     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01858   }
01859 }
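
recvGrid() accumulates only the data that was actually sent: for each (x,y) column whose flag is set it consumes zlistlen packed floats and adds them at the z indices listed in zlist. A standalone sketch of that flag-driven scatter-add, with tiny made-up sizes:

  // Sketch of the flag-driven, z-list-compressed accumulation in recvGrid().
  #include <cstdio>
  #include <vector>

  int main() {
    const int fgrid_len = 4, zdim = 6;              // illustrative sizes
    std::vector<char>  f     = { 1, 0, 1, 0 };      // which (x,y) columns carry data
    std::vector<int>   zlist = { 1, 4 };            // nonzero z planes, shared by all columns
    const int zlistlen = (int) zlist.size();
    std::vector<float> qmsg  = { 1, 2, 3, 4 };      // packed payload: 2 columns * 2 planes
    std::vector<float> q(fgrid_len * zdim, 0.0f);   // local slab of the charge grid

    const float *m = qmsg.data();
    for (int i = 0; i < fgrid_len; ++i) {
      if (f[i]) {
        for (int k = 0; k < zlistlen; ++k)
          q[i * zdim + zlist[k]] += *(m++);         // scatter-add along z
      }
    }
    std::printf("q[0*zdim+1] = %g, q[2*zdim+4] = %g\n",
                (double) q[1], (double) q[2 * zdim + 4]);
    return 0;
  }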

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  ) 

Definition at line 3007 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03007                                                  {
03008   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03009   for ( int g=0; g<numGrids; ++g ) {
03010     evir[g] += msg->evir[g];
03011   }
03012   delete msg;
03013   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03014   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03015 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  ) 

Definition at line 2028 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02028                                                           {
02029   procTrans(msg->msg);
02030   CmiLock(msg->lock);
02031   int count = --(*msg->count);
02032   CmiUnlock(msg->lock);
02033   if ( count == 0 ) {
02034     CmiDestroyLock(msg->lock);
02035     delete msg->count;
02036     delete msg->msg;
02037   }
02038   delete msg;
02039 }

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  ) 

Definition at line 2282 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02282                                                               {
02283   procUntrans(msg->msg);
02284   CmiLock(msg->lock);
02285   int count = --(*msg->count);
02286   CmiUnlock(msg->lock);
02287   if ( count == 0 ) {
02288     CmiDestroyLock(msg->lock);
02289     delete msg->count;
02290     delete msg->msg;
02291   }
02292   delete msg;
02293 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  ) 

Definition at line 2041 of file ComputePme.C.

References procTrans().

02041                                               {
02042   procTrans(msg);
02043   delete msg;
02044 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  ) 

Definition at line 2427 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02427                                               {
02428   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02429 #ifdef NAMD_CUDA
02430   if ( ! offload )  // would need lock
02431 #endif
02432   if ( ungrid_count == 0 ) {
02433     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02434   }
02435 
02436   if ( usePencils ) copyPencils(msg);
02437   else copyResults(msg);
02438   delete msg;
02439   recvAck(0);
02440 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  ) 

Definition at line 2295 of file ComputePme.C.

References procUntrans().

02295                                                   {
02296   procUntrans(msg);
02297   delete msg;
02298 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3493 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03493                                         {
03494   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03495     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03496     int cs = mgr->pmeComputes.size();
03497     if ( cs ) {
03498       mgr->ungridForcesCount = cs;
03499       mgr->recipEvirCount = mgr->recipEvirClients;
03500       masterPmeMgr->chargeGridSubmittedCount++;
03501     }
03502   }
03503   pmeProxy[master_pe].recvChargeGridReady();
03504 }

void ComputePmeMgr::sendData ( Lattice &  ,
int  sequence 
)

Definition at line 3930 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03930                                                            {
03931 
03932   sendDataHelper_lattice = &lattice;
03933   sendDataHelper_sequence = sequence;
03934   sendDataHelper_sourcepe = CkMyPe();
03935   sendDataHelper_errors = strayChargeErrors;
03936   strayChargeErrors = 0;
03937 
03938 #ifdef NAMD_CUDA
03939   if ( offload ) {
03940     for ( int i=0; i < numGridPes; ++i ) {
03941       int pe = gridPeOrder[i];  // different order
03942       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03943 #if CMK_MULTICORE
03944       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03945       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03946 #else
03947       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03948 #endif
03949     }
03950   } else
03951 #endif
03952   {
03953     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03954   }
03955  
03956 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3917 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03917                                            {
03918   nodePmeMgr->sendDataHelper(iter);
03919 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice &  ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3808 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03808                                                                                                               {
03809 
03810   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03811 
03812   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03813 
03814   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03815   for (int j=first; j<=last; j++) {
03816     int pe = gridPeOrder[j];  // different order
03817     if ( ! recipPeDest[pe] && ! errors ) continue;
03818     int start = pe * bsize;
03819     int len = bsize;
03820     if ( start >= qsize ) { start = 0; len = 0; }
03821     if ( start + len > qsize ) { len = qsize - start; }
03822     int zdim = myGrid.dim3;
03823     int fstart = start / zdim;
03824     int flen = len / zdim;
03825     int fcount = 0;
03826     int i;
03827 
03828     int g;
03829     for ( g=0; g<numGrids; ++g ) {
03830       char *f = f_arr + fstart + g*fsize;
03831 #ifdef NAMD_CUDA
03832      if ( offload ) {
03833       int errcount = 0;
03834       for ( i=0; i<flen; ++i ) {
03835         f[i] = ffz_host[fstart+i];
03836         fcount += f[i];
03837         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03838       }
03839       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03840      } else
03841 #endif
03842       for ( i=0; i<flen; ++i ) {
03843         fcount += f[i];
03844       }
03845       if ( ! recipPeDest[pe] ) {
03846         int errfound = 0;
03847         for ( i=0; i<flen; ++i ) {
03848           if ( f[i] == 3 ) {
03849             errfound = 1;
03850             break;
03851           }
03852         }
03853         if ( errfound ) {
03854           iout << iERROR << "Stray PME grid charges detected: "
03855                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03856           int iz = -1;
03857           for ( i=0; i<flen; ++i ) {
03858             if ( f[i] == 3 ) {
03859               f[i] = 2;
03860               int jz = (i+fstart)/myGrid.K2;
03861               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03862             }
03863           }
03864           iout << "\n" << endi;
03865         }
03866       }
03867     }
03868 
03869 #ifdef NETWORK_PROGRESS
03870     CmiNetworkProgress();
03871 #endif
03872 
03873     if ( ! recipPeDest[pe] ) continue;
03874 
03875     int zlistlen = 0;
03876     for ( i=0; i<myGrid.K3; ++i ) {
03877       if ( fz_arr[i] ) ++zlistlen;
03878     }
03879 
03880     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03881                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03882 
03883     msg->sourceNode = sourcepe;
03884     msg->lattice = lattice;
03885     msg->start = fstart;
03886     msg->len = flen;
03887     msg->zlistlen = zlistlen;
03888     int *zlist = msg->zlist;
03889     zlistlen = 0;
03890     for ( i=0; i<myGrid.K3; ++i ) {
03891       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03892     }
03893     float *qmsg = msg->qgrid;
03894     for ( g=0; g<numGrids; ++g ) {
03895       char *f = f_arr + fstart + g*fsize;
03896       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03897       float **q = q_arr + fstart + g*fsize;
03898       for ( i=0; i<flen; ++i ) {
03899         if ( f[i] ) {
03900           for (int h=0; h<myGrid.order-1; ++h) {
03901             q[i][h] += q[i][myGrid.K3+h];
03902           }
03903           for ( int k=0; k<zlistlen; ++k ) {
03904             *(qmsg++) = q[i][zlist[k]];
03905           }
03906         }
03907       }
03908     }
03909 
03910     msg->sequence = compute_sequence;
03911     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03912     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03913   }
03914 
03915 }
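
Before a slab is shipped, sendDataPart() builds zlist from the nonzero entries of fz_arr, folds the order-1 periodic wrap planes (q[i][K3+h]) back onto the first planes, and then packs only the listed z values of each flagged column. The following sketch walks one padded column through that fold-and-pack step; the grid length, interpolation order, and charge values are invented.

  // Sketch of the wrap fold and z-list packing done per column in sendDataPart().
  #include <cstdio>
  #include <vector>

  int main() {
    const int K3 = 8, order = 4;          // illustrative PME sizes
    std::vector<char> fz(K3, 0);
    fz[1] = fz[2] = fz[5] = 1;            // z planes that actually hold charge

    std::vector<int> zlist;               // compressed list of nonzero planes
    for (int i = 0; i < K3; ++i) if (fz[i]) zlist.push_back(i);

    // one padded column of the charge grid, length K3 + order - 1
    std::vector<float> col(K3 + order - 1, 0.0f);
    col[2] = 1.0f;  col[5] = 2.0f;
    col[K3 + 1] = 0.5f;                   // charge that spilled past the top plane

    for (int h = 0; h < order - 1; ++h)   // fold periodic images back down
      col[h] += col[K3 + h];

    std::vector<float> qmsg;              // packed payload for this column
    for (std::size_t k = 0; k < zlist.size(); ++k)
      qmsg.push_back(col[zlist[k]]);

    std::printf("planes packed: %zu, first packed value (folded wrap) = %g\n",
                qmsg.size(), (double) qmsg[0]);
    return 0;
  }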

void ComputePmeMgr::sendPencils ( Lattice &  ,
int  sequence 
)

Definition at line 3703 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03703                                                               {
03704 
03705   sendDataHelper_lattice = &lattice;
03706   sendDataHelper_sequence = sequence;
03707   sendDataHelper_sourcepe = CkMyPe();
03708 
03709 #ifdef NAMD_CUDA
03710   if ( offload ) {
03711     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03712 #if CMK_MULTICORE
03713       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03714       int ib = activePencils[ap].i;
03715       int jb = activePencils[ap].j;
03716       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03717       pmeProxy[destproc].sendPencilsHelper(ap);
03718 #else
03719       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03720 #endif
03721     }
03722   } else
03723 #endif
03724   {
03725     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03726   }
03727 
03728   if ( strayChargeErrors ) {
03729    strayChargeErrors = 0;
03730    iout << iERROR << "Stray PME grid charges detected: "
03731         << CkMyPe() << " sending to (x,y)";
03732    int K1 = myGrid.K1;
03733    int K2 = myGrid.K2;
03734    int dim2 = myGrid.dim2;
03735    int block1 = myGrid.block1;
03736    int block2 = myGrid.block2;
03737    for (int ib=0; ib<xBlocks; ++ib) {
03738     for (int jb=0; jb<yBlocks; ++jb) {
03739      int ibegin = ib*block1;
03740      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03741      int jbegin = jb*block2;
03742      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03743      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03744 
03745      for ( int g=0; g<numGrids; ++g ) {
03746        char *f = f_arr + g*fsize;
03747        if ( ! pencilActive[ib*yBlocks+jb] ) {
03748            for ( int i=ibegin; i<iend; ++i ) {
03749             for ( int j=jbegin; j<jend; ++j ) {
03750              if ( f[i*dim2+j] == 3 ) {
03751                f[i*dim2+j] = 2;
03752                iout << " (" << i << "," << j << ")";
03753              }
03754             }
03755            }
03756        }
03757      }
03758     }
03759    }
03760    iout << "\n" << endi;
03761   }
03762  
03763 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3690 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03690                                               {
03691   nodePmeMgr->sendPencilsHelper(iter);
03692 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice &  ,
int  sequence,
int  sourcepe 
)

Definition at line 3548 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03548                                                                                                      {
03549 
03550   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03551 
03552 #if 0 && USE_PERSISTENT
03553     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03554 #endif
03555   int K1 = myGrid.K1;
03556   int K2 = myGrid.K2;
03557   int dim2 = myGrid.dim2;
03558   int dim3 = myGrid.dim3;
03559   int block1 = myGrid.block1;
03560   int block2 = myGrid.block2;
03561 
03562   // int savedMessages = 0;
03563   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03564 
03565   for (int ap=first; ap<=last; ++ap) {
03566     int ib = activePencils[ap].i;
03567     int jb = activePencils[ap].j;
03568     int ibegin = ib*block1;
03569     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03570     int jbegin = jb*block2;
03571     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03572     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03573 
03574     int fcount = 0;
03575     for ( int g=0; g<numGrids; ++g ) {
03576       char *f = f_arr + g*fsize;
03577 #ifdef NAMD_CUDA
03578      if ( offload ) {
03579       int errcount = 0;
03580       for ( int i=ibegin; i<iend; ++i ) {
03581        for ( int j=jbegin; j<jend; ++j ) {
03582         int k = i*dim2+j;
03583         f[k] = ffz_host[k];
03584         fcount += f[k];
03585         if ( ffz_host[k] & ~1 ) ++errcount;
03586        }
03587       }
03588       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03589      } else
03590 #endif
03591       for ( int i=ibegin; i<iend; ++i ) {
03592        for ( int j=jbegin; j<jend; ++j ) {
03593         fcount += f[i*dim2+j];
03594        }
03595       }
03596     }
03597 
03598 #ifdef NETWORK_PROGRESS
03599     CmiNetworkProgress();
03600 #endif
03601 
03602     if ( ! pencilActive[ib*yBlocks+jb] )
03603       NAMD_bug("PME activePencils list inconsistent");
03604 
03605     int zlistlen = 0;
03606     for ( int i=0; i<myGrid.K3; ++i ) {
03607       if ( fz_arr[i] ) ++zlistlen;
03608     }
03609 
03610     int hd = ( fcount? 1 : 0 );  // has data?
03611     // if ( ! hd ) ++savedMessages;
03612 
03613     
03614     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03615         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03616     msg->sourceNode = sourcepe;
03617     msg->hasData = hd;
03618     msg->lattice = lattice;
03619    if ( hd ) {
03620 #if 0
03621     msg->start = fstart;
03622     msg->len = flen;
03623 #else
03624     msg->start = -1;   // obsolete?
03625     msg->len = -1;   // obsolete?
03626 #endif
03627     msg->zlistlen = zlistlen;
03628     int *zlist = msg->zlist;
03629     zlistlen = 0;
03630     for ( int i=0; i<myGrid.K3; ++i ) {
03631       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03632     }
03633     char *fmsg = msg->fgrid;
03634     float *qmsg = msg->qgrid;
03635     for ( int g=0; g<numGrids; ++g ) {
03636       char *f = f_arr + g*fsize;
03637       float **q = q_arr + g*fsize;
03638       for ( int i=ibegin; i<iend; ++i ) {
03639        for ( int j=jbegin; j<jend; ++j ) {
03640         *(fmsg++) = f[i*dim2+j];
03641         if( f[i*dim2+j] ) {
03642           for (int h=0; h<myGrid.order-1; ++h) {
03643             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03644           }
03645           for ( int k=0; k<zlistlen; ++k ) {
03646             *(qmsg++) = q[i*dim2+j][zlist[k]];
03647           }
03648         }
03649        }
03650       }
03651     }
03652    }
03653 
03654     msg->sequence = compute_sequence;
03655     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03656     CmiEnableUrgentSend(1);
03657 #if USE_NODE_PAR_RECEIVE
03658     msg->destElem=CkArrayIndex3D(ib,jb,0);
03659     CProxy_PmePencilMap lzm = npMgr->zm;
03660     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03661     int destnode = CmiNodeOf(destproc);
03662     
03663 #if  0 
03664     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03665 #endif
03666     pmeNodeProxy[destnode].recvZGrid(msg);
03667 #if 0 
03668     CmiUsePersistentHandle(NULL, 0);
03669 #endif
03670 #else
03671 #if 0 
03672     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03673 #endif
03674     zPencil(ib,jb,0).recvGrid(msg);
03675 #if 0 
03676     CmiUsePersistentHandle(NULL, 0);
03677 #endif
03678 #endif
03679     CmiEnableUrgentSend(0);
03680   }
03681 
03682 
03683   // if ( savedMessages ) {
03684   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03685   // }
03686 
03687 }
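
Each pencil message is sized from fcount, the number of flagged columns inside the pencil's (i,j) block, and hd = (fcount ? 1 : 0), so an inactive pencil still receives a header-only message with no grid payload. A hedged sketch of that counting and sizing arithmetic (the block geometry and flags are made up):

  // Sketch of the per-pencil flag count and payload sizing in sendPencilsPart().
  #include <cstdio>
  #include <vector>

  int main() {
    const int K1 = 12, K2 = 12, dim2 = 12;   // illustrative grid extent
    const int xBlocks = 3, yBlocks = 3;
    const int block1 = (K1 + xBlocks - 1) / xBlocks;
    const int block2 = (K2 + yBlocks - 1) / yBlocks;
    const int zlistlen = 5;                  // nonzero z planes (see the sketch above)

    std::vector<char> f(K1 * dim2, 0);
    f[0 * dim2 + 1] = 1;                     // a couple of flagged (x,y) columns
    f[2 * dim2 + 3] = 1;

    const int ib = 0, jb = 0;                // pencil (0,0)
    const int ibegin = ib * block1;
    const int iend = (ibegin + block1 > K1) ? K1 : ibegin + block1;
    const int jbegin = jb * block2;
    const int jend = (jbegin + block2 > K2) ? K2 : jbegin + block2;

    int fcount = 0;
    for (int i = ibegin; i < iend; ++i)
      for (int j = jbegin; j < jend; ++j)
        fcount += f[i * dim2 + j];

    const int flen = (iend - ibegin) * (jend - jbegin);
    const int hd = fcount ? 1 : 0;           // "has data" switch used to size the message
    std::printf("pencil (%d,%d): flen = %d, fcount = %d -> payload %d floats\n",
                ib, jb, flen, fcount, hd * fcount * zlistlen);
    return 0;
  }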

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1937 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01937                                   {
01938 
01939   untrans_count = numTransPes;
01940 
01941 #if     CMK_SMP && USE_CKLOOP
01942   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01943   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01944     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01945   } else
01946 #endif
01947   {
01948     sendTransSubset(0, numTransNodes-1);
01949   }
01950 
01951 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1922 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01922                                          {
01923   sendTransBarrier_received += 1;
01924   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01925   if ( sendTransBarrier_received < numGridPes ) return;
01926   sendTransBarrier_received = 0;
01927   for ( int i=0; i<numGridPes; ++i ) {
01928     pmeProxyDir[gridPeMap[i]].sendTrans();
01929   }
01930 }

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1953 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01953                                                        {
01954   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01955 
01956   // send data for transpose
01957   int zdim = myGrid.dim3;
01958   int nx = localInfo[myGridPe].nx;
01959   int x_start = localInfo[myGridPe].x_start;
01960   int slicelen = myGrid.K2 * zdim;
01961 
01962   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01963 
01964 #if CMK_BLUEGENEL
01965   CmiNetworkProgressAfter (0);
01966 #endif
01967 
01968   for (int j=first; j<=last; j++) {
01969     int node = transNodeOrder[j];  // different order on each node
01970     int pe = transNodeInfo[node].pe_start;
01971     int npe = transNodeInfo[node].npe;
01972     int totlen = 0;
01973     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01974       LocalPmeInfo &li = localInfo[pe];
01975       int cpylen = li.ny_after_transpose * zdim;
01976       totlen += cpylen;
01977     }
01978     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01979                                 PRIORITY_SIZE) PmeTransMsg;
01980     newmsg->sourceNode = myGridPe;
01981     newmsg->lattice = lattice;
01982     newmsg->x_start = x_start;
01983     newmsg->nx = nx;
01984     for ( int g=0; g<numGrids; ++g ) {
01985       float *qmsg = newmsg->qgrid + nx * totlen * g;
01986       pe = transNodeInfo[node].pe_start;
01987       for (int i=0; i<npe; ++i, ++pe) {
01988         LocalPmeInfo &li = localInfo[pe];
01989         int cpylen = li.ny_after_transpose * zdim;
01990         if ( node == myTransNode ) {
01991           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01992           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01993         }
01994         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01995         for ( int x = 0; x < nx; ++x ) {
01996           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01997           q += slicelen;
01998           qmsg += cpylen;
01999         }
02000       }
02001     }
02002     newmsg->sequence = grid_sequence;
02003     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02004     if ( node == myTransNode ) newmsg->nx = 0;
02005     if ( npe > 1 ) {
02006       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02007       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02008     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02009   }
02010 }

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2367 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02367                                    {
02368 
02369 #if     CMK_SMP && USE_CKLOOP
02370   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02371   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02372     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02373   } else
02374 #endif
02375   {
02376     sendUngridSubset(0, numSources-1);
02377   }
02378 
02379   grid_count = numSources;
02380   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02381 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2383 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02383                                                         {
02384 
02385 #ifdef NAMD_CUDA
02386   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02387 #else
02388   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02389 #endif
02390 
02391   for ( int j=first; j<=last; ++j ) {
02392     // int msglen = qgrid_len;
02393     PmeGridMsg *newmsg = gridmsg_reuse[j];
02394     int pe = newmsg->sourceNode;
02395     int zdim = myGrid.dim3;
02396     int flen = newmsg->len;
02397     int fstart = newmsg->start;
02398     int zlistlen = newmsg->zlistlen;
02399     int *zlist = newmsg->zlist;
02400     float *qmsg = newmsg->qgrid;
02401     for ( int g=0; g<numGrids; ++g ) {
02402       char *f = newmsg->fgrid + fgrid_len * g;
02403       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02404       for ( int i=0; i<flen; ++i ) {
02405         if ( f[i] ) {
02406           for ( int k=0; k<zlistlen; ++k ) {
02407             *(qmsg++) = q[zlist[k]];
02408           }
02409         }
02410         q += zdim;
02411       }
02412     }
02413     newmsg->sourceNode = myGridPe;
02414 
02415     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02416     CmiEnableUrgentSend(1);
02417 #ifdef NAMD_CUDA
02418     if ( offload ) {
02419       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02420     } else
02421 #endif
02422     pmeProxyDir[pe].recvUngrid(newmsg);
02423     CmiEnableUrgentSend(0);
02424   }
02425 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2181 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02181                                     {
02182 
02183   trans_count = numGridPes;
02184 
02185   { // send energy and virial
02186     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02187     for ( int g=0; g<numGrids; ++g ) {
02188       newmsg->evir[g] = recip_evir2[g];
02189     }
02190     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02191     CmiEnableUrgentSend(1);
02192     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02193     CmiEnableUrgentSend(0);
02194   }
02195 
02196 #if     CMK_SMP && USE_CKLOOP
02197   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02198   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02199     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02200   } else
02201 #endif
02202   {
02203     sendUntransSubset(0, numGridNodes-1);
02204   }
02205 
02206 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2208 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02208                                                          {
02209 
02210   int zdim = myGrid.dim3;
02211   int y_start = localInfo[myTransPe].y_start_after_transpose;
02212   int ny = localInfo[myTransPe].ny_after_transpose;
02213   int slicelen = myGrid.K2 * zdim;
02214 
02215   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02216 
02217 #if CMK_BLUEGENEL
02218   CmiNetworkProgressAfter (0);
02219 #endif
02220 
02221   // send data for reverse transpose
02222   for (int j=first; j<=last; j++) {
02223     int node = gridNodeOrder[j];  // different order on each node
02224     int pe = gridNodeInfo[node].pe_start;
02225     int npe = gridNodeInfo[node].npe;
02226     int totlen = 0;
02227     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02228       LocalPmeInfo &li = localInfo[pe];
02229       int cpylen = li.nx * zdim;
02230       totlen += cpylen;
02231     }
02232     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02233     newmsg->sourceNode = myTransPe;
02234     newmsg->y_start = y_start;
02235     newmsg->ny = ny;
02236     for ( int g=0; g<numGrids; ++g ) {
02237       float *qmsg = newmsg->qgrid + ny * totlen * g;
02238       pe = gridNodeInfo[node].pe_start;
02239       for (int i=0; i<npe; ++i, ++pe) {
02240         LocalPmeInfo &li = localInfo[pe];
02241         if ( node == myGridNode ) {
02242           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02243           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02244           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02245           int cpylen = ny * zdim;
02246           for ( int x = 0; x < li.nx; ++x ) {
02247             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02248             q += cpylen;
02249             qmsg += slicelen;
02250           }
02251         } else {
02252           CmiMemcpy((void*)qmsg,
02253                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02254                 li.nx*ny*zdim*sizeof(float));
02255           qmsg += li.nx*ny*zdim;
02256         }
02257       }
02258     }
02259     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02260     if ( node == myGridNode ) newmsg->ny = 0;
02261     if ( npe > 1 ) {
02262       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02263       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02264     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02265   }
02266 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4191 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchOn, NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04191                                      {
04192 
04193     SimParameters *simParams = Node::Object()->simParameters;
04194 
04195     for ( int g=0; g<numGrids; ++g ) {
04196       float scale = 1.;
04197       if (alchOn) {
04198         BigReal elecLambdaUp, elecLambdaDown;
04199         if( simParams->alchFepWhamOn ) {
04200           if( simParams->alchFepElecOn ) {
04201             elecLambdaUp = simParams->alchElecLambda;
04202             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04203           }
04204           else {
04205             elecLambdaUp = 0.0;
04206             elecLambdaDown = 1.0;
04207           }
04208         }
04209         else {
04210           // alchLambda set on each step in ComputePme::ungridForces()
04211           if ( alchLambda < 0 || alchLambda > 1 ) {
04212             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04213           }
04214           elecLambdaUp = simParams->getElecLambda(alchLambda);
04215           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04216         }
04217         if ( g == 0 ) scale = elecLambdaUp;
04218         else if ( g == 1 ) scale = elecLambdaDown;
04219         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04220         if (alchDecouple) {
04221           if ( g == 2 ) scale = 1-elecLambdaUp;
04222           else if ( g == 3 ) scale = 1-elecLambdaDown;
04223           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04224         }
04225       } else if ( lesOn ) {
04226         scale = 1.0 / lesFactor;
04227       } else if ( pairOn ) {
04228         scale = ( g == 0 ? 1. : -1. );
04229       }
04230       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04231       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04232       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04233       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04234       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04235       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04236       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04237       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04238       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04239       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04240 
04241       float scale2 = 0.;
04242 
04243       // why is this declared/defined again here?
04244       SimParameters *simParams = Node::Object()->simParameters;
04245 
04246       if (alchFepOn) {
04247         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04248         if(simParams->alchFepWhamOn) {
04249           if(simParams->alchFepElecOn) {
04250             elecLambda2Up = simParams->alchElecLambda;
04251             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04252           }
04253           else {
04254             elecLambda2Up = 0.0;
04255             elecLambda2Down =  1.0;
04256           }
04257         }
04258         else {
04259           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04260           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04261         }
04262         
04263         if ( g == 0 ) scale2 = elecLambda2Up;
04264         else if ( g == 1 ) scale2 = elecLambda2Down;
04265         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04266         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04267         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04268         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04269       }
04270       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04271         if( g==0 )      scale2 = scale + 1.0;
04272         else if( g==1 ) scale2 = scale - 1.0;
04273         else if( g==2 ) scale2 = scale - 1.0;
04274         else if( g==3 ) scale2 = scale + 1.0;
04275       }
04276       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04277       
04278       if (alchThermIntOn) {
04279         
04280         // no decoupling:
04281         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04282         // (interactions between all atoms [partition 0 OR partition 1], 
04283         // minus all [within partition 0])
04284         // U = elecLambdaUp * (U[0] - U[2])
04285         // dU/dl = U[0] - U[2];
04286         
04287         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04288         // (interactions between all atoms [partition 0 OR partition 2], 
04289         // minus all [within partition 0])
04290         // U = elecLambdaDown * (U[1] - U[2])
04291         // dU/dl = U[1] - U[2];
04292 
04293         // alchDecouple:
04294         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04295         // (interactions between all atoms [partition 0 OR partition 1]
04296         // minus all [within partition 1] minus all [within partition 0]
04297         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04298         // dU/dl = U[0] - U[2] - U[4];
04299 
04300         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04301         // (interactions between all atoms [partition 0 OR partition 2]
04302         // minus all [within partition 2] minus all [within partition 0]
04303         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04304         // dU/dl = U[1] - U[3] - U[4];
04305         
04306         
04307         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04308         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04309         if (!alchDecouple) {
04310           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04311           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04312         }
04313         else {  // alchDecouple
04314           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04315           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04316           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04317           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04318         }
04319       }
04320     }
04321 
04322     alchLambda = -1.;  // illegal value to catch if not updated
04323 
04324     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04325     reduction->submit();
04326 
04327   for ( int i=0; i<heldComputes.size(); ++i ) {
04328     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04329   }
04330   heldComputes.resize(0);
04331 }
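The per-grid FEP branch above selects scale2 from the up/down lambda factors according to which alchemical grid g is being reduced (grids 0..2 without alchDecouple, 0..4 with it). The following is a minimal standalone sketch of that selection logic; fepScaleForGrid and the main() driver are hypothetical illustrations written for this page, not part of ComputePmeMgr or ComputePme.C.

    #include <cstdio>

    // Hypothetical helper (illustration only): returns the lambda scale factor
    // applied to grid g's slow electrostatic energy, mirroring the branch
    // structure in the listing above. Note (Up + Down - 1)*(-1) is written
    // here as 1 - Up - Down.
    static double fepScaleForGrid(int g, double elecLambda2Up,
                                  double elecLambda2Down, bool alchDecouple) {
      double scale2 = 0.;
      if ( g == 0 )      scale2 = elecLambda2Up;
      else if ( g == 1 ) scale2 = elecLambda2Down;
      else if ( g == 2 ) scale2 = 1. - elecLambda2Up - elecLambda2Down;
      if ( alchDecouple ) {   // decoupled terms live in grids 2..4
        if ( g == 2 )      scale2 = 1. - elecLambda2Up;
        else if ( g == 3 ) scale2 = 1. - elecLambda2Down;
        else if ( g == 4 ) scale2 = 1. - elecLambda2Up - elecLambda2Down;
      }
      return scale2;
    }

    int main() {
      // Example values (assumed linear scaling, no decoupling).
      for ( int g = 0; g < 3; ++g )
        printf("grid %d scale2 %g\n", g, fepScaleForGrid(g, 0.75, 0.25, false));
      return 0;
    }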

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2511 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02511                                    {
02512   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02513 
02514   ungridForcesCount = pmeComputes.size();
02515 
02516 #ifdef NAMD_CUDA
02517  if ( offload ) {
02518   //CmiLock(cuda_lock);
02519   cudaSetDevice(deviceCUDA->getDeviceID());
02520 
02521   if ( this == masterPmeMgr ) {
02522     double before = CmiWallTimer();
02523     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02524     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02525     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02526 
02527     const int myrank = CkMyRank();
02528     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02529       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02530         nodePmeMgr->mgrObjects[i]->ungridCalc();
02531       }
02532     }
02533     if ( ! pmeComputes.size() ) return;
02534   }
02535 
02536   if ( ! end_forces ) {
02537     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02538     end_forces = new cudaEvent_t[n];
02539     for ( int i=0; i<n; ++i ) {
02540       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02541     }
02542   }
02543 
02544   const int pcsz = pmeComputes.size();
02545   if ( ! afn_host ) {
02546     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02547     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02548     cuda_errcheck("malloc params for pme");
02549   }
02550   int totn = 0;
02551   for ( int i=0; i<pcsz; ++i ) {
02552     int n = pmeComputes[i]->numGridAtoms[0];
02553     totn += n;
02554   }
02555   if ( totn > f_data_mgr_alloc ) {
02556     if ( f_data_mgr_alloc ) {
02557       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02558       cudaFree(f_data_mgr_dev);
02559       cudaFreeHost(f_data_mgr_host);
02560     }
02561     f_data_mgr_alloc = 1.2 * (totn + 100);
02562     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02563     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02564     cuda_errcheck("malloc forces for pme");
02565   }
02566   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02567   float *f_dev = f_data_mgr_dev;
02568   float *f_host = f_data_mgr_host;
02569   for ( int i=0; i<pcsz; ++i ) {
02570     int n = pmeComputes[i]->numGridAtoms[0];
02571     pmeComputes[i]->f_data_dev = f_dev;
02572     pmeComputes[i]->f_data_host = f_host;
02573     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02574     afn_host[3*i+1] = f_dev;
02575     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02576     f_dev += 3*n;
02577     f_host += 3*n;
02578   }
02579   //CmiLock(cuda_lock);
02580   double before = CmiWallTimer();
02581   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02582   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02583   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02584   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02585 
02586   for ( int i=0; i<pcsz; ++i ) {
02587     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02588     if ( i%EVENT_STRIDE == 0 ) {
02589       int dimy = pcsz - i;
02590       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02591       int maxn = 0;
02592       int subtotn = 0;
02593       for ( int j=0; j<dimy; ++j ) {
02594         int n = pmeComputes[i+j]->numGridAtoms[0];
02595         subtotn += n;
02596         if ( n > maxn ) maxn = n;
02597       }
02598       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02599       before = CmiWallTimer();
02600       cuda_pme_forces(
02601         bspline_coeffs_dev,
02602         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02603         pmeComputes[i]->a_data_dev,
02604         pmeComputes[i]->f_data_dev,
02605         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02606         streams[stream]);
02607       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02608       before = CmiWallTimer();
02609       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02610         cudaMemcpyDeviceToHost, streams[stream]);
02611       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02612       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02613       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02614     }
02615     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02616   }
02617   //CmiUnlock(cuda_lock);
02618  } else
02619 #endif // NAMD_CUDA
02620  {
02621   for ( int i=0; i<pmeComputes.size(); ++i ) {
02622     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02623     // pmeComputes[i]->ungridForces();
02624   }
02625  }
02626   // submitReductions();  // must follow all ungridForces()
02627 
02628 #ifdef NAMD_CUDA
02629  if ( offload ) {
02630   forces_time = CmiWallTimer();
02631   forces_count = ungridForcesCount;
02632   forces_done_count = 0;
02633   pmeProxy[this_pe].pollForcesReady();
02634  }
02635 #endif
02636 
02637   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02638 }
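In the offload path above, ungridCalc() groups the per-compute force kernels into batches of EVENT_STRIDE on a single stream, records one cudaEvent_t per batch in end_forces, and later polls those events (via pollForcesReady() and cuda_check_pme_forces()) so finished batches can be handed back to the host without blocking the PE. The sketch below isolates that batching pattern under stated assumptions: the EVENT_STRIDE value, launchBatch(), runBatched(), and pollBatches() are illustrative stand-ins, not NAMD APIs.

    #include <cuda_runtime.h>
    #include <algorithm>
    #include <vector>

    // Illustrative constant; NAMD defines its own EVENT_STRIDE in ComputePme.C.
    constexpr int EVENT_STRIDE = 12;

    // Placeholder for the per-batch work (force kernel + async copy back in NAMD).
    static void launchBatch(int /*first*/, int /*count*/, cudaStream_t /*stream*/) {}

    // Launch all items in batches and record one event per batch.
    static void runBatched(int nItems, cudaStream_t stream,
                           std::vector<cudaEvent_t> &endEvents) {
      const int nBatches = (nItems - 1) / EVENT_STRIDE + 1;
      endEvents.resize(nBatches);
      for (int b = 0; b < nBatches; ++b) {
        const int first = b * EVENT_STRIDE;
        const int count = std::min(EVENT_STRIDE, nItems - first);
        cudaEventCreateWithFlags(&endEvents[b], cudaEventDisableTiming);
        launchBatch(first, count, stream);      // kernel launch + D2H copy
        cudaEventRecord(endEvents[b], stream);  // marks completion of this batch
      }
    }

    // Poll events in submission order; returns the index of the first
    // unfinished batch, so completed batches' forces can be processed.
    static int pollBatches(const std::vector<cudaEvent_t> &endEvents,
                           int firstPending) {
      while (firstPending < (int)endEvents.size() &&
             cudaEventQuery(endEvents[firstPending]) == cudaSuccess) {
        ++firstPending;
      }
      return firstPending;
    }

Polling rather than synchronizing keeps the PE free to service other Charm++ messages while the force batches drain.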


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 357 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 358 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 419 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 418 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 444 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 430 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 432 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 423 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 442 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 441 of file ComputePme.C.
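Together with cuda_lock and cuda_busy, this deque appears to let several ComputePmeMgr objects on one node share a single device: a manager that finds the device busy queues its arguments instead of blocking, and a queued submission is issued once the in-flight one completes. A minimal sketch of that pattern follows, with std::mutex standing in for CmiNodeLock and submitNow() as a hypothetical stand-in for the body of cuda_submit_charges(); none of these names are NAMD APIs.

    #include <deque>
    #include <mutex>

    // Illustrative stand-in; the real cuda_submit_charges_args carries a
    // Lattice, a sequence number, and the submitting manager.
    struct SubmitArgs { int sequence; };

    static std::mutex             nodeLock;            // plays the role of cuda_lock
    static bool                   deviceBusy = false;  // cf. cuda_busy
    static std::deque<SubmitArgs> pendingSubmits;      // cf. cuda_submit_charges_deque

    // Placeholder for the actual charge upload (async memcpy + kernel in NAMD).
    static void submitNow(const SubmitArgs & /*args*/) {}

    // Submit immediately if the device is free, otherwise queue the request.
    static void submitOrQueue(const SubmitArgs &args) {
      std::lock_guard<std::mutex> guard(nodeLock);
      if (deviceBusy) {
        pendingSubmits.push_back(args);   // another manager owns the device; defer
      } else {
        deviceBusy = true;
        submitNow(args);
      }
    }

    // Called when the in-flight submission finishes: drain one queued
    // request or mark the device free.
    static void onSubmitComplete() {
      std::lock_guard<std::mutex> guard(nodeLock);
      if (!pendingSubmits.empty()) {
        submitNow(pendingSubmits.front());
        pendingSubmits.pop_front();
      } else {
        deviceBusy = false;
      }
    }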

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 426 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 427 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 421 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 420 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 414 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().

int ComputePmeMgr::forces_count

Definition at line 428 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 429 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 431 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe

Definition at line 434 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 415 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 447 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 448 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 373 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 370 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 371 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 372 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 435 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file:
ComputePme.C
Generated on Thu Jun 21 01:17:19 2018 for NAMD by doxygen 1.4.7