ComputePmeMgr Class Reference


Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 344 of file ComputePme.C.
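
The member list above is easier to navigate with the overall data flow in mind. The comment block below is a reading aid assembled from the definitions and "Referenced by" notes reproduced on this page (it is not part of the generated documentation). It traces the slab-decomposition round trip; when pencil decomposition is enabled, sendPencils()/copyPencils() take the place of sendData()/copyResults(). Force interpolation and the energy/virial reduction then complete in ungridCalc() and submitReductions(), whose definitions are not reproduced here.

// PME round trip through ComputePmeMgr, as reflected in the code on this page:
//
//   chargeGridReady()             -> sendData() or sendPencils()    scatter charge grid
//   recvGrid()                    -> gridCalc1()                    forward FFT in y,z
//   gridCalc1()                   -> sendTrans()                    transpose to trans PEs
//   recvTrans() / procTrans()     -> gridCalc2() -> gridCalc2R()    forward FFT in x,
//                                                                   reciprocal-space energy,
//                                                                   backward FFT in x
//   gridCalc2R()                  -> sendUntrans()                  transpose back
//   recvUntrans() / procUntrans() -> gridCalc3()                    backward FFT in y,z
//   gridCalc3()                   -> sendUngrid()                   return grid pieces
//   recvUngrid()                  -> copyResults() or copyPencils()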


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 701 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00701                              : pmeProxy(thisgroup), 
00702                                  pmeProxyDir(thisgroup) {
00703 
00704   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00705   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00706   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00707 
00708   pmeNodeProxy.ckLocalBranch()->initialize();
00709 
00710   if ( CmiMyRank() == 0 ) {
00711     fftw_plan_lock = CmiCreateLock();
00712   }
00713   pmemgr_lock = CmiCreateLock();
00714 
00715   myKSpace = 0;
00716   kgrid = 0;
00717   work = 0;
00718   grid_count = 0;
00719   trans_count = 0;
00720   untrans_count = 0;
00721   ungrid_count = 0;
00722   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00723   useBarrier = 0;
00724   sendTransBarrier_received = 0;
00725   usePencils = 0;
00726 
00727 #ifdef NAMD_CUDA
00728  // offload has not been set so this happens on every run
00729   if ( CmiMyRank() == 0 ) {
00730     cuda_lock = CmiCreateLock();
00731   }
00732 
00733 #if CUDA_VERSION >= 5050
00734   int leastPriority, greatestPriority;
00735   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00736   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00737   //if ( CkMyNode() == 0 ) {
00738   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00739   //}
00740 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00741 #else
00742 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00743 #endif
00744 
00745   stream = 0;
00746   for ( int i=0; i<NUM_STREAMS; ++i ) {
00747 #if 1
00748     CUDA_STREAM_CREATE(&streams[i]);
00749     cuda_errcheck("cudaStreamCreate");
00750 #else
00751   streams[i] = 0;  // XXXX Testing!!!
00752 #endif
00753   }
00754 
00755   this_pe = CkMyPe();
00756  
00757   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00758   end_forces = 0;
00759   check_charges_count = 0;
00760   check_forces_count = 0;
00761   chargeGridSubmittedCount = 0;
00762 
00763   cuda_atoms_count = 0;
00764   cuda_atoms_alloc = 0;
00765 
00766   f_data_mgr_alloc = 0;
00767   f_data_mgr_host = 0;
00768   f_data_mgr_dev = 0;
00769   afn_host = 0;
00770   afn_dev = 0;
00771 
00772 #define CUDA_EVENT_ID_PME_CHARGES 80
00773 #define CUDA_EVENT_ID_PME_FORCES 81
00774 #define CUDA_EVENT_ID_PME_TICK 82
00775 #define CUDA_EVENT_ID_PME_COPY 83
00776 #define CUDA_EVENT_ID_PME_KERNEL 84
00777   if ( 0 == CkMyPe() ) {
00778     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00779     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00780     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00781     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00782     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00783   }
00784 #endif
00785   recipEvirCount = 0;
00786   recipEvirClients = 0;
00787   recipEvirPe = -999;
00788 }

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1806 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01806                               {
01807 
01808   if ( CmiMyRank() == 0 ) {
01809     CmiDestroyLock(fftw_plan_lock);
01810   }
01811   CmiDestroyLock(pmemgr_lock);
01812 
01813   delete myKSpace;
01814   delete [] localInfo;
01815   delete [] gridNodeInfo;
01816   delete [] transNodeInfo;
01817   delete [] gridPeMap;
01818   delete [] transPeMap;
01819   delete [] recipPeDest;
01820   delete [] gridPeOrder;
01821   delete [] gridNodeOrder;
01822   delete [] transNodeOrder;
01823   delete [] qgrid;
01824   if ( kgrid != qgrid ) delete [] kgrid;
01825   delete [] work;
01826   delete [] gridmsg_reuse;
01827 
01828  if ( ! offload ) {
01829   for (int i=0; i<q_count; ++i) {
01830     delete [] q_list[i];
01831   }
01832   delete [] q_list;
01833   delete [] fz_arr;
01834  }
01835   delete [] f_arr;
01836   delete [] q_arr;
01837 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1800 of file ComputePme.C.

01800                                                  {
01801   if ( ! usePencils ) return;
01802   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01803 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3039 of file ComputePme.C.

03039                                        {
03040   ++recipEvirClients;
03041 }

void ComputePmeMgr::chargeGridReady ( Lattice & lattice, int sequence )

Definition at line 3552 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03552                                                                   {
03553 
03554 #ifdef NAMD_CUDA
03555  if ( offload ) {
03556   int errcount = 0;
03557   int q_stride = myGrid.K3+myGrid.order-1;
03558   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03559     f_arr[j] = ffz_host[j];
03560     if ( ffz_host[j] & ~1 ) ++errcount;
03561   }
03562   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03563  }
03564 #endif
03565   recipEvirCount = recipEvirClients;
03566   ungridForcesCount = pmeComputes.size();
03567 
03568   for (int j=0; j<myGrid.order-1; ++j) {
03569     fz_arr[j] |= fz_arr[myGrid.K3+j];
03570   }
03571 
03572   if ( usePencils ) {
03573     sendPencils(lattice,sequence);
03574   } else {
03575     sendData(lattice,sequence);
03576   }
03577 }

void ComputePmeMgr::chargeGridSubmitted ( Lattice & lattice, int sequence )

Definition at line 3488 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03488                                                                       {
03489   saved_lattice = &lattice;
03490   saved_sequence = sequence;
03491 
03492   // cudaDeviceSynchronize();  //  XXXX TESTING
03493   //int q_stride = myGrid.K3+myGrid.order-1;
03494   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03495   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03496   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03497   //  }
03498   //}
03499   //CmiLock(cuda_lock);
03500 
03501  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03502   double before = CmiWallTimer();
03503   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03504   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03505   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03506                         cudaMemcpyDeviceToHost, streams[stream]);
03507   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03508   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03509   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03510   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03511   //CmiUnlock(cuda_lock);
03512   // cudaDeviceSynchronize();  //  XXXX TESTING
03513   // cuda_errcheck("after memcpy grid to host");
03514 
03515   SimParameters *simParams = Node::Object()->simParameters;
03516   if ( ! simParams->useCUDA2 ) {
03517     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03518     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03519   }
03520 
03521   pmeProxy[master_pe].pollChargeGridReady();
03522  }
03523 }
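
The copy-back above is ordered entirely with CUDA events and a dedicated stream rather than host synchronization. The standalone sketch below reproduces that choreography under simplified assumptions: d_grid, h_grid, end_kernels, end_copy and copy_stream are placeholder names (not NAMD symbols), and the legacy default stream is used as the catch-all fence in the same way nodePmeMgr->end_all_pme_kernels is recorded on stream 0 above.

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const size_t bytes = 1 << 20;                 // stand-in for q_data_size + ffz_size
  float *d_grid = 0, *h_grid = 0;
  cudaMalloc((void**)&d_grid, bytes);
  cudaMallocHost((void**)&h_grid, bytes);       // pinned, so the async copy truly overlaps

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);
  cudaEvent_t end_kernels, end_copy;
  cudaEventCreateWithFlags(&end_kernels, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&end_copy, cudaEventDisableTiming);

  // ... charge-spreading kernels would have been launched before this point ...

  cudaEventRecord(end_kernels, 0);                    // legacy stream 0: fences prior work
  cudaStreamWaitEvent(copy_stream, end_kernels, 0);   // copy waits without blocking the host
  cudaMemcpyAsync(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost, copy_stream);
  cudaEventRecord(end_copy, copy_stream);             // host checks this ("charges ready")
  cudaMemsetAsync(d_grid, 0, bytes, copy_stream);     // clear the grid for the next step

  cudaEventSynchronize(end_copy);                     // stand-in for pollChargeGridReady()
  printf("charge grid copied back\n");

  cudaEventDestroy(end_copy);
  cudaEventDestroy(end_kernels);
  cudaStreamDestroy(copy_stream);
  cudaFreeHost(h_grid);
  cudaFree(d_grid);
  return 0;
}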

void ComputePmeMgr::copyPencils ( PmeGridMsg * msg )

Definition at line 3798 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03798                                                {
03799 
03800   int K1 = myGrid.K1;
03801   int K2 = myGrid.K2;
03802   int dim2 = myGrid.dim2;
03803   int dim3 = myGrid.dim3;
03804   int block1 = myGrid.block1;
03805   int block2 = myGrid.block2;
03806 
03807   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03808   int ib = msg->sourceNode / yBlocks;
03809   int jb = msg->sourceNode % yBlocks;
03810 
03811   int ibegin = ib*block1;
03812   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03813   int jbegin = jb*block2;
03814   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03815 
03816   int zlistlen = msg->zlistlen;
03817   int *zlist = msg->zlist;
03818   float *qmsg = msg->qgrid;
03819   int g;
03820   for ( g=0; g<numGrids; ++g ) {
03821     char *f = f_arr + g*fsize;
03822     float **q = q_arr + g*fsize;
03823     for ( int i=ibegin; i<iend; ++i ) {
03824      for ( int j=jbegin; j<jend; ++j ) {
03825       if( f[i*dim2+j] ) {
03826         f[i*dim2+j] = 0;
03827         for ( int k=0; k<zlistlen; ++k ) {
03828           q[i*dim2+j][zlist[k]] = *(qmsg++);
03829         }
03830         for (int h=0; h<myGrid.order-1; ++h) {
03831           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03832         }
03833       }
03834      }
03835     }
03836   }
03837 }

void ComputePmeMgr::copyResults ( PmeGridMsg * msg )

Definition at line 3990 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03990                                                {
03991 
03992   int zdim = myGrid.dim3;
03993   int flen = msg->len;
03994   int fstart = msg->start;
03995   int zlistlen = msg->zlistlen;
03996   int *zlist = msg->zlist;
03997   float *qmsg = msg->qgrid;
03998   int g;
03999   for ( g=0; g<numGrids; ++g ) {
04000     char *f = msg->fgrid + g*flen;
04001     float **q = q_arr + fstart + g*fsize;
04002     for ( int i=0; i<flen; ++i ) {
04003       if ( f[i] ) {
04004         f[i] = 0;
04005         for ( int k=0; k<zlistlen; ++k ) {
04006           q[i][zlist[k]] = *(qmsg++);
04007         }
04008         for (int h=0; h<myGrid.order-1; ++h) {
04009           q[i][myGrid.K3+h] = q[i][h];
04010         }
04011       }
04012     }
04013   }
04014 }
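
Both copyResults() above and copyPencils() finish each z-column by duplicating its first order-1 values past K3, presumably so that an order-wide B-spline window that wraps around the periodic z boundary can be read without a modulo in the inner loop. The snippet below only illustrates the padding layout; K3, order and the column contents are made-up example values, not taken from the code.

#include <cstdio>

int main() {
  const int K3 = 8, order = 4;               // example grid depth and interpolation order
  float q[K3 + order - 1] = {0};             // one z-column, padded as in the code above

  for (int k = 0; k < K3; ++k) q[k] = (float) k;         // pretend these arrived in the message
  for (int h = 0; h < order - 1; ++h) q[K3 + h] = q[h];  // replicate the periodic wrap

  // A spline window of width `order` starting near the top of the column can now
  // read q[k] .. q[k+order-1] directly, without wrapping each index by hand.
  for (int k = K3 - 1; k < K3 + order - 1; ++k) printf("q[%d] = %g\n", k, q[k]);
  return 0;
}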

void ComputePmeMgr::cuda_submit_charges ( Lattice & lattice, int sequence )

Definition at line 3435 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03435                                                                       {
03436 
03437     int n = cuda_atoms_count;
03438     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03439     cuda_atoms_count = 0;
03440 
03441     const double before = CmiWallTimer();
03442     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03443                           cudaMemcpyHostToDevice, streams[stream]);
03444     const double after = CmiWallTimer();
03445 
03446     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03447 
03448     cuda_pme_charges(
03449       bspline_coeffs_dev,
03450       q_arr_dev, ffz_dev, ffz_dev + fsize,
03451       a_data_dev, n,
03452       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03453       streams[stream]);
03454     const double after2 = CmiWallTimer();
03455 
03456     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03457 
03458     masterPmeMgr->charges_time = before;
03459     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03460     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03461 }

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg * msg )

Definition at line 2026 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02026                                                    {
02027   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02028   int pe = transNodeInfo[myTransNode].pe_start;
02029   int npe = transNodeInfo[myTransNode].npe;
02030   CmiNodeLock lock = CmiCreateLock();
02031   int *count = new int; *count = npe;
02032   for (int i=0; i<npe; ++i, ++pe) {
02033     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02034     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02035     shmsg->msg = msg;
02036     shmsg->count = count;
02037     shmsg->lock = lock;
02038     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02039   }
02040 }
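
fwdSharedTrans() (and fwdSharedUntrans() below) deliver one payload per node: every local PE gets a small wrapper holding the same message pointer plus a shared counter and a node-level lock, the counter starting at npe. The receiver side (recvSharedTrans()) is not reproduced on this page, so the sketch below is an assumed, generic version of that pattern; Payload, processLocalSlice() and receiveShared() are placeholders, and only the Converse lock calls (CmiLock/CmiUnlock/CmiDestroyLock) are real API.

#include <converse.h>   // CmiNodeLock, CmiLock, CmiUnlock, CmiDestroyLock

struct Payload { /* e.g. the transposed grid data shared by the node */ };

struct SharedMsg {
  Payload    *msg;    // same payload pointer handed to every PE on the node
  int        *count;  // PEs that have not yet processed it (initialized to npe)
  CmiNodeLock lock;   // protects the counter
};

static void processLocalSlice(Payload *) { /* per-PE work on the shared data */ }

void receiveShared(SharedMsg *shmsg) {
  processLocalSlice(shmsg->msg);
  CmiLock(shmsg->lock);
  int remaining = --(*shmsg->count);
  CmiUnlock(shmsg->lock);
  if ( remaining == 0 ) {          // last PE on this node releases the shared state
    CmiDestroyLock(shmsg->lock);
    delete shmsg->count;
    delete shmsg->msg;
  }
  delete shmsg;                    // the per-PE wrapper is always freed
}

This mirrors the sender side shown above, which allocates exactly one counter and one lock per forwarded message.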

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg * msg )

Definition at line 2282 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02282                                                        {
02283   int pe = gridNodeInfo[myGridNode].pe_start;
02284   int npe = gridNodeInfo[myGridNode].npe;
02285   CmiNodeLock lock = CmiCreateLock();
02286   int *count = new int; *count = npe;
02287   for (int i=0; i<npe; ++i, ++pe) {
02288     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02289     shmsg->msg = msg;
02290     shmsg->count = count;
02291     shmsg->lock = lock;
02292     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02293   }
02294 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1918 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by recvGrid().

01918                                   {
01919   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01920 
01921 #ifdef NAMD_FFTW
01922   for ( int g=0; g<numGrids; ++g ) {
01923 #ifdef NAMD_FFTW_3
01924     fftwf_execute(forward_plan_yz[g]);
01925 #else
01926     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01927         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01928 #endif
01929 
01930   }
01931 #endif
01932 
01933   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01934 }

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2094 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02094                                   {
02095   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02096 
02097 #if CMK_BLUEGENEL
02098   CmiNetworkProgressAfter (0);
02099 #endif
02100 
02101   int zdim = myGrid.dim3;
02102   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02103   int ny = localInfo[myTransPe].ny_after_transpose;
02104 
02105   for ( int g=0; g<numGrids; ++g ) {
02106     // finish forward FFT (x dimension)
02107 #ifdef NAMD_FFTW
02108 #ifdef NAMD_FFTW_3
02109     fftwf_execute(forward_plan_x[g]);
02110 #else
02111     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02112         ny * zdim / 2, 1, work, 1, 0);
02113 #endif
02114 #endif
02115   }
02116 
02117 #ifdef OPENATOM_VERSION
02118     if ( ! simParams -> openatomOn ) { 
02119 #endif // OPENATOM_VERSION
02120       gridCalc2R();
02121 #ifdef OPENATOM_VERSION
02122     } else {
02123       gridCalc2Moa();
02124     }
02125 #endif // OPENATOM_VERSION
02126 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2154 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02154                                    {
02155 
02156   int useCkLoop = 0;
02157 #if CMK_SMP && USE_CKLOOP
02158   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02159        && CkNumPes() >= 2 * numTransPes ) {
02160     useCkLoop = 1;
02161   }
02162 #endif
02163 
02164   int zdim = myGrid.dim3;
02165   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02166   int ny = localInfo[myTransPe].ny_after_transpose;
02167 
02168   for ( int g=0; g<numGrids; ++g ) {
02169     // reciprocal space portion of PME
02170     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02171     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02172                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02173     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02174 
02175     // start backward FFT (x dimension)
02176 
02177 #ifdef NAMD_FFTW
02178 #ifdef NAMD_FFTW_3
02179     fftwf_execute(backward_plan_x[g]);
02180 #else
02181     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02182         ny * zdim / 2, 1, work, 1, 0);
02183 #endif
02184 #endif
02185   }
02186   
02187   pmeProxyDir[CkMyPe()].sendUntrans();
02188 }

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2356 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02356                                   {
02357   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02358 
02359   // finish backward FFT
02360 #ifdef NAMD_FFTW
02361 
02362   for ( int g=0; g<numGrids; ++g ) {
02363 #ifdef NAMD_FFTW_3
02364     fftwf_execute(backward_plan_yz[g]);
02365 #else
02366     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02367         (fftw_complex *) (qgrid + qgrid_size * g),
02368         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02369 #endif
02370   }
02371 
02372 #endif
02373 
02374   pmeProxyDir[CkMyPe()].sendUngrid();
02375 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 853 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00853                                            {
00854   delete msg;
00855 
00856   localInfo = new LocalPmeInfo[CkNumPes()];
00857   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00858   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00859   gridPeMap = new int[CkNumPes()];
00860   transPeMap = new int[CkNumPes()];
00861   recipPeDest = new int[CkNumPes()];
00862   gridPeOrder = new int[CkNumPes()];
00863   gridNodeOrder = new int[CkNumNodes()];
00864   transNodeOrder = new int[CkNumNodes()];
00865 
00866   if (CkMyRank() == 0) {
00867     pencilPMEProcessors = new char [CkNumPes()];
00868     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00869   }
00870 
00871   SimParameters *simParams = Node::Object()->simParameters;
00872   PatchMap *patchMap = PatchMap::Object();
00873 
00874   offload = simParams->PMEOffload;
00875 #ifdef NAMD_CUDA
00876   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00877     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00878   }
00879   if ( offload ) {
00880     int dev;
00881     cudaGetDevice(&dev);
00882     cuda_errcheck("in cudaGetDevice");
00883     cudaDeviceProp deviceProp;
00884     cudaGetDeviceProperties(&deviceProp, dev);
00885     cuda_errcheck("in cudaGetDeviceProperties");
00886     if ( deviceProp.major < 2 )
00887       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00888   }
00889 #endif
00890 
00891   alchLambda = -1.;  // illegal value to catch if not updated
00892 
00893   alchOn = simParams->alchOn;
00894   alchFepOn = simParams->alchFepOn;
00895   alchThermIntOn = simParams->alchThermIntOn;
00896   alchDecouple = alchOn && simParams->alchDecouple;
00897   alchElecLambdaStart = alchOn ? simParams->alchElecLambdaStart : 0;
00898   if (alchOn) {
00899     numGrids = 2;
00900     if (alchDecouple) numGrids += 2;
00901     if (alchElecLambdaStart || alchThermIntOn) numGrids ++;
00902   }
00903   else numGrids = 1;
00904   lesOn = simParams->lesOn;
00905   useBarrier = simParams->PMEBarrier;
00906   if ( lesOn ) {
00907     lesFactor = simParams->lesFactor;
00908     numGrids = lesFactor;
00909   }
00910   selfOn = 0;
00911   pairOn = simParams->pairInteractionOn;
00912   if ( pairOn ) {
00913     selfOn = simParams->pairInteractionSelf;
00914     if ( selfOn ) pairOn = 0;  // make pairOn and selfOn exclusive
00915     numGrids = selfOn ? 1 : 3;
00916   }
00917 
00918   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00919   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00920   else {
00921     int nrps = simParams->PMEProcessors;
00922     if ( nrps <= 0 ) nrps = CkNumPes();
00923     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00924     int dimx = simParams->PMEGridSizeX;
00925     int dimy = simParams->PMEGridSizeY;
00926     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00927     if ( maxslabs > nrps ) maxslabs = nrps;
00928     int maxpencils = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00929                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00930     if ( maxpencils > nrps ) maxpencils = nrps;
00931     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00932     else usePencils = 0;
00933   }
00934 
00935   if ( usePencils ) {
00936     int nrps = simParams->PMEProcessors;
00937     if ( nrps <= 0 ) nrps = CkNumPes();
00938     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00939     if ( simParams->PMEPencils > 1 &&
00940          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00941       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00942     } else {
00943       int nb2 = ( simParams->PMEGridSizeX * simParams->PMEGridSizeY
00944                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00945       if ( nb2 > nrps ) nb2 = nrps;
00946       if ( nb2 < 1 ) nb2 = 1;
00947       int nb = (int) sqrt((float)nb2);
00948       if ( nb < 1 ) nb = 1;
00949       xBlocks = zBlocks = nb;
00950       yBlocks = nb2 / nb;
00951     }
00952 
00953     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00954     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00955     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00956 
00957     int dimx = simParams->PMEGridSizeX;
00958     int bx = 1 + ( dimx - 1 ) / xBlocks;
00959     xBlocks = 1 + ( dimx - 1 ) / bx;
00960 
00961     int dimy = simParams->PMEGridSizeY;
00962     int by = 1 + ( dimy - 1 ) / yBlocks;
00963     yBlocks = 1 + ( dimy - 1 ) / by;
00964 
00965     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00966     int bz = 1 + ( dimz - 1 ) / zBlocks;
00967     zBlocks = 1 + ( dimz - 1 ) / bz;
00968 
00969     if ( xBlocks * yBlocks > CkNumPes() ) {
00970       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00971     }
00972     if ( xBlocks * zBlocks > CkNumPes() ) {
00973       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00974     }
00975     if ( yBlocks * zBlocks > CkNumPes() ) {
00976       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00977     }
00978 
00979     if ( ! CkMyPe() ) {
00980       iout << iINFO << "PME using " << xBlocks << " x " <<
00981         yBlocks << " x " << zBlocks <<
00982         " pencil grid for FFT and reciprocal sum.\n" << endi;
00983     }
00984   } else { // usePencils
00985 
00986   {  // decide how many pes to use for reciprocal sum
00987 
00988     // rules based on work available
00989     int minslices = simParams->PMEMinSlices;
00990     int dimx = simParams->PMEGridSizeX;
00991     int nrpx = ( dimx + minslices - 1 ) / minslices;
00992     int dimy = simParams->PMEGridSizeY;
00993     int nrpy = ( dimy + minslices - 1 ) / minslices;
00994 
00995     // rules based on processors available
00996     int nrpp = CkNumPes();
00997     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00998     if ( nrpp < nrpx ) nrpx = nrpp;
00999     if ( nrpp < nrpy ) nrpy = nrpp;
01000 
01001     // user override
01002     int nrps = simParams->PMEProcessors;
01003     if ( nrps > CkNumPes() ) nrps = CkNumPes();
01004     if ( nrps > 0 ) nrpx = nrps;
01005     if ( nrps > 0 ) nrpy = nrps;
01006 
01007     // make sure there aren't any totally empty processors
01008     int bx = ( dimx + nrpx - 1 ) / nrpx;
01009     nrpx = ( dimx + bx - 1 ) / bx;
01010     int by = ( dimy + nrpy - 1 ) / nrpy;
01011     nrpy = ( dimy + by - 1 ) / by;
01012     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01013       NAMD_bug("Error in selecting number of PME processors.");
01014     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01015       NAMD_bug("Error in selecting number of PME processors.");
01016 
01017     numGridPes = nrpx;
01018     numTransPes = nrpy;
01019   }
01020   if ( ! CkMyPe() ) {
01021     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01022       " processors for FFT and reciprocal sum.\n" << endi;
01023   }
01024 
01025   int sum_npes = numTransPes + numGridPes;
01026   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01027 
01028 #if 0 // USE_TOPOMAP
01029   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01030   PatchMap * pmap = PatchMap::Object();
01031   
01032   int patch_pes = pmap->numNodesWithPatches();
01033   TopoManager tmgr;
01034   if(tmgr.hasMultipleProcsPerNode())
01035     patch_pes *= 2;
01036 
01037   bool done = false;
01038   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01039     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01040     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01041   }
01042   else 
01043     if(CkNumPes() > 2 *max_npes + patch_pes) {
01044       done = generateBGLORBPmePeList(transPeMap, max_npes);
01045       gridPeMap = transPeMap;
01046     }
01047 
01048   if (!done)
01049 #endif
01050     {
01051       //generatePmePeList(transPeMap, max_npes);
01052       //gridPeMap = transPeMap;
01053       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01054     }
01055   
01056   if ( ! CkMyPe() ) {
01057     iout << iINFO << "PME GRID LOCATIONS:";
01058     int i;
01059     for ( i=0; i<numGridPes && i<10; ++i ) {
01060       iout << " " << gridPeMap[i];
01061     }
01062     if ( i < numGridPes ) iout << " ...";
01063     iout << "\n" << endi;
01064     iout << iINFO << "PME TRANS LOCATIONS:";
01065     for ( i=0; i<numTransPes && i<10; ++i ) {
01066       iout << " " << transPeMap[i];
01067     }
01068     if ( i < numTransPes ) iout << " ...";
01069     iout << "\n" << endi;
01070   }
01071 
01072   // sort based on nodes and physical nodes
01073   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01074 
01075   myGridPe = -1;
01076   myGridNode = -1;
01077   int i = 0;
01078   int node = -1;
01079   int real_node = -1;
01080   for ( i=0; i<numGridPes; ++i ) {
01081     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01082     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01083     int real_node_i = CkNodeOf(gridPeMap[i]);
01084     if ( real_node_i == real_node ) {
01085       gridNodeInfo[node].npe += 1;
01086     } else {
01087       real_node = real_node_i;
01088       ++node;
01089       gridNodeInfo[node].real_node = real_node;
01090       gridNodeInfo[node].pe_start = i;
01091       gridNodeInfo[node].npe = 1;
01092     }
01093     if ( CkMyNode() == real_node_i ) myGridNode = node;
01094   }
01095   numGridNodes = node + 1;
01096   myTransPe = -1;
01097   myTransNode = -1;
01098   node = -1;
01099   real_node = -1;
01100   for ( i=0; i<numTransPes; ++i ) {
01101     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01102     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01103     int real_node_i = CkNodeOf(transPeMap[i]);
01104     if ( real_node_i == real_node ) {
01105       transNodeInfo[node].npe += 1;
01106     } else {
01107       real_node = real_node_i;
01108       ++node;
01109       transNodeInfo[node].real_node = real_node;
01110       transNodeInfo[node].pe_start = i;
01111       transNodeInfo[node].npe = 1;
01112     }
01113     if ( CkMyNode() == real_node_i ) myTransNode = node;
01114   }
01115   numTransNodes = node + 1;
01116 
01117   if ( ! CkMyPe() ) {
01118     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01119          << numTransNodes << " TRANS NODES\n" << endi;
01120   }
01121 
01122   { // generate random orderings for grid and trans messages
01123     int i;
01124     for ( i = 0; i < numGridPes; ++i ) {
01125       gridPeOrder[i] = i;
01126     }
01127     Random rand(CkMyPe());
01128     if ( myGridPe < 0 ) {
01129       rand.reorder(gridPeOrder,numGridPes);
01130     } else {  // self last
01131       gridPeOrder[myGridPe] = numGridPes-1;
01132       gridPeOrder[numGridPes-1] = myGridPe;
01133       rand.reorder(gridPeOrder,numGridPes-1);
01134     } 
01135     for ( i = 0; i < numGridNodes; ++i ) {
01136       gridNodeOrder[i] = i;
01137     }
01138     if ( myGridNode < 0 ) {
01139       rand.reorder(gridNodeOrder,numGridNodes);
01140     } else {  // self last
01141       gridNodeOrder[myGridNode] = numGridNodes-1;
01142       gridNodeOrder[numGridNodes-1] = myGridNode;
01143       rand.reorder(gridNodeOrder,numGridNodes-1);
01144     }
01145     for ( i = 0; i < numTransNodes; ++i ) {
01146       transNodeOrder[i] = i;
01147     }
01148     if ( myTransNode < 0 ) {
01149       rand.reorder(transNodeOrder,numTransNodes);
01150     } else {  // self last
01151       transNodeOrder[myTransNode] = numTransNodes-1;
01152       transNodeOrder[numTransNodes-1] = myTransNode;
01153       rand.reorder(transNodeOrder,numTransNodes-1);
01154     }
01155   }
01156   
01157   } // ! usePencils
01158 
01159   myGrid.K1 = simParams->PMEGridSizeX;
01160   myGrid.K2 = simParams->PMEGridSizeY;
01161   myGrid.K3 = simParams->PMEGridSizeZ;
01162   myGrid.order = simParams->PMEInterpOrder;
01163   myGrid.dim2 = myGrid.K2;
01164   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01165 
01166   if ( ! usePencils ) {
01167     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01168     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01169     myGrid.block3 = myGrid.dim3 / 2;  // complex
01170   }
01171 
01172   if ( usePencils ) {
01173     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01174     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01175     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01176 
01177 
01178       int pe = 0;
01179       int x,y,z;
01180 
01181                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01182                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01183                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01184       
01185                 // decide which pes to use by bit reversal and patch use
01186                 int i;
01187                 int ncpus = CkNumPes();
01188                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01189                 PatchMap *pmap = PatchMap::Object();
01190                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01191                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01192                         if ( ri ) { // keep 0 for special case
01193                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01194                                 else nopatches.add(ri);
01195                         }
01196                 }
01197 
01198 #if USE_RANDOM_TOPO
01199             Random rand(CkMyPe());
01200             int *tmp = new int[patches.size()];
01201             int nn = patches.size();
01202             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01203             rand.reorder(tmp, nn);
01204             patches.resize(0);
01205             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01206             delete [] tmp;
01207             tmp = new int[nopatches.size()];
01208             nn = nopatches.size();
01209             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01210             rand.reorder(tmp, nn);
01211             nopatches.resize(0);
01212             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01213             delete [] tmp;
01214 #endif
01215 
01216                 // only use zero if it eliminates overloading or has patches
01217                 int useZero = 0;
01218                 int npens = xBlocks*yBlocks;
01219                 if ( npens % ncpus == 0 ) useZero = 1;
01220                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01221                 npens += xBlocks*zBlocks;
01222                 if ( npens % ncpus == 0 ) useZero = 1;
01223                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01224                 npens += yBlocks*zBlocks;
01225                 if ( npens % ncpus == 0 ) useZero = 1;
01226                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01227 
01228                 // add nopatches then patches in reversed order
01229                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01230                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01231                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01232                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01233   
01234                 int npes = pmeprocs.size();
01235                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01236                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01237 #if !USE_RANDOM_TOPO
01238                 zprocs.sort();
01239 #endif
01240                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01241                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01242 #if !USE_RANDOM_TOPO
01243                 yprocs.sort();
01244 #endif
01245       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01246       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01247 #if !USE_RANDOM_TOPO
01248       xprocs.sort();
01249 #endif
01250 
01251 #if USE_TOPO_SFC
01252   CmiLock(tmgr_lock);
01253   //{
01254   TopoManager tmgr;
01255   int xdim = tmgr.getDimNX();
01256   int ydim = tmgr.getDimNY();
01257   int zdim = tmgr.getDimNZ();
01258   int xdim1 = find_level_grid(xdim);
01259   int ydim1 = find_level_grid(ydim);
01260   int zdim1 = find_level_grid(zdim);
01261   if(CkMyPe() == 0)
01262       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01263 
01264   vector<Coord> result;
01265   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01266   sort_sfc(xprocs, tmgr, result);
01267   sort_sfc(yprocs, tmgr, result);
01268   sort_sfc(zprocs, tmgr, result);
01269   //}
01270   CmiUnlock(tmgr_lock);
01271 #endif
01272 
01273 
01274                 if(CkMyPe() == 0){  
01275               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01276           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01277 #if USE_TOPO_SFC
01278               int x,y,z,t;
01279               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01280               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01281 #else
01282               iout << " " << zprocs[i];
01283 #endif
01284           }
01285           if ( i < zprocs.size() ) iout << " ...";
01286               iout << "\n" << endi;
01287                 }
01288 
01289     if (CkMyRank() == 0) {
01290       for (pe=0, x = 0; x < xBlocks; ++x)
01291         for (y = 0; y < yBlocks; ++y, ++pe ) {
01292           pencilPMEProcessors[zprocs[pe]] = 1;
01293         }
01294     }
01295      
01296                 if(CkMyPe() == 0){  
01297               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01298           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01299 #if USE_TOPO_SFC
01300               int x,y,z,t;
01301               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01302               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01303 #else
01304               iout << " " << yprocs[i];
01305 #endif
01306           }
01307           if ( i < yprocs.size() ) iout << " ...";
01308               iout << "\n" << endi;
01309                 }
01310 
01311     if (CkMyRank() == 0) {
01312       for (pe=0, z = 0; z < zBlocks; ++z )
01313         for (x = 0; x < xBlocks; ++x, ++pe ) {
01314           pencilPMEProcessors[yprocs[pe]] = 1;
01315         }
01316     }
01317     
01318                 if(CkMyPe() == 0){  
01319                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01320                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01321 #if USE_TOPO_SFC
01322                 int x,y,z,t;
01323                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01324                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01325 #else
01326                 iout << " " << xprocs[i];
01327 #endif
01328             }
01329                 if ( i < xprocs.size() ) iout << " ...";
01330                 iout << "\n" << endi;
01331                 }
01332 
01333     if (CkMyRank() == 0) {
01334       for (pe=0, y = 0; y < yBlocks; ++y )      
01335         for (z = 0; z < zBlocks; ++z, ++pe ) {
01336           pencilPMEProcessors[xprocs[pe]] = 1;
01337         }
01338     }
01339         
01340 
01341         // creating the pencil arrays
01342         if ( CkMyPe() == 0 ){
01343 #if !USE_RANDOM_TOPO
01344         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01345         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01346         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01347         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01348 #endif
01349 #if 1
01350         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01351         CProxy_PmePencilMap ym;
01352         if ( simParams->PMEPencilsYLayout )
01353           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01354         else
01355           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01356         CProxy_PmePencilMap xm;
01357         if ( simParams->PMEPencilsXLayout )
01358           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01359         else
01360           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01361         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01362         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01363         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01364         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01365         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01366         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01367         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01368         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01369         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01370         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01371 #else
01372         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01373         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01374         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01375 
01376                 for (pe=0, x = 0; x < xBlocks; ++x)
01377                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01378                                 zPencil(x,y,0).insert(zprocs[pe]);
01379                         }
01380         zPencil.doneInserting();
01381 
01382                 for (pe=0, x = 0; x < xBlocks; ++x)
01383                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01384                                 yPencil(x,0,z).insert(yprocs[pe]);
01385                         }
01386         yPencil.doneInserting();
01387 
01388 
01389                 for (pe=0, y = 0; y < yBlocks; ++y )    
01390                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01391                                 xPencil(0,y,z).insert(xprocs[pe]);
01392                         }
01393                 xPencil.doneInserting();     
01394 #endif
01395 
01396                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01397                 PmePencilInitMsgData msgdata;
01398                 msgdata.grid = myGrid;
01399                 msgdata.xBlocks = xBlocks;
01400                 msgdata.yBlocks = yBlocks;
01401                 msgdata.zBlocks = zBlocks;
01402                 msgdata.xPencil = xPencil;
01403                 msgdata.yPencil = yPencil;
01404                 msgdata.zPencil = zPencil;
01405                 msgdata.pmeProxy = pmeProxyDir;
01406         msgdata.pmeNodeProxy = pmeNodeProxy;
01407         msgdata.xm = xm;
01408         msgdata.ym = ym;
01409         msgdata.zm = zm;
01410                 xPencil.init(new PmePencilInitMsg(msgdata));
01411                 yPencil.init(new PmePencilInitMsg(msgdata));
01412                 zPencil.init(new PmePencilInitMsg(msgdata));
01413         }
01414 
01415     return;  // continue in initialize_pencils() at next startup stage
01416   }
01417 
01418 
01419   int pe;
01420   int nx = 0;
01421   for ( pe = 0; pe < numGridPes; ++pe ) {
01422     localInfo[pe].x_start = nx;
01423     nx += myGrid.block1;
01424     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01425     localInfo[pe].nx = nx - localInfo[pe].x_start;
01426   }
01427   int ny = 0;
01428   for ( pe = 0; pe < numTransPes; ++pe ) {
01429     localInfo[pe].y_start_after_transpose = ny;
01430     ny += myGrid.block2;
01431     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01432     localInfo[pe].ny_after_transpose =
01433                         ny - localInfo[pe].y_start_after_transpose;
01434   }
01435 
01436   {  // decide how many pes this node exchanges charges with
01437 
01438   PatchMap *patchMap = PatchMap::Object();
01439   Lattice lattice = simParams->lattice;
01440   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01441   BigReal cutoff = simParams->cutoff;
01442   BigReal patchdim = simParams->patchDimension;
01443   int numPatches = patchMap->numPatches();
01444   int numNodes = CkNumPes();
01445   int *source_flags = new int[numNodes];
01446   int node;
01447   for ( node=0; node<numNodes; ++node ) {
01448     source_flags[node] = 0;
01449     recipPeDest[node] = 0;
01450   }
01451 
01452   // // make sure that we don't get ahead of ourselves on this node
01453   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01454   //   source_flags[CkMyPe()] = 1;
01455   //   recipPeDest[myRecipPe] = 1;
01456   // }
01457 
01458   for ( int pid=0; pid < numPatches; ++pid ) {
01459     int pnode = patchMap->node(pid);
01460 #ifdef NAMD_CUDA
01461     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01462 #endif
01463     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01464     BigReal minx = patchMap->min_a(pid);
01465     BigReal maxx = patchMap->max_a(pid);
01466     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01467     // min1 (max1) is smallest (largest) grid line for this patch
01468     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01469     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01470     for ( int i=min1; i<=max1; ++i ) {
01471       int ix = i;
01472       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01473       while ( ix < 0 ) ix += myGrid.K1;
01474       // set source_flags[pnode] if this patch sends to our node
01475       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01476            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01477         source_flags[pnode] = 1;
01478       }
01479       // set dest_flags[] for node that our patch sends to
01480 #ifdef NAMD_CUDA
01481       if ( offload ) {
01482         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01483           recipPeDest[ix / myGrid.block1] = 1;
01484         }
01485       } else
01486 #endif
01487       if ( pnode == CkMyPe() ) {
01488         recipPeDest[ix / myGrid.block1] = 1;
01489       }
01490     }
01491   }
01492 
01493   int numSourcesSamePhysicalNode = 0;
01494   numSources = 0;
01495   numDestRecipPes = 0;
01496   for ( node=0; node<numNodes; ++node ) {
01497     if ( source_flags[node] ) ++numSources;
01498     if ( recipPeDest[node] ) ++numDestRecipPes;
01499     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01500   }
01501 
01502 #if 0
01503   if ( numSources ) {
01504     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01505             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01506     iout << iINFO << "PME " << CkMyPe() << " sources:";
01507     for ( node=0; node<numNodes; ++node ) {
01508       if ( source_flags[node] ) iout << " " << node;
01509     }
01510     iout << "\n" << endi;
01511   }
01512 #endif
01513 
01514   delete [] source_flags;
01515 
01516   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01517   //           CkMyPe(), numSources, numDestRecipPes);
01518 
01519   }  // decide how many pes this node exchanges charges with (end)
01520 
01521   ungrid_count = numDestRecipPes;
01522 
01523   sendTransBarrier_received = 0;
01524 
01525   if ( myGridPe < 0 && myTransPe < 0 ) return;
01526   // the following only for nodes doing reciprocal sum
01527 
01528   if ( myTransPe >= 0 ) {
01529     recipEvirPe = findRecipEvirPe();
01530     pmeProxy[recipEvirPe].addRecipEvirClient();
01531   }
01532 
01533   if ( myTransPe >= 0 ) {
01534       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01535       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01536       #ifdef OPENATOM_VERSION
01537       if ( simParams->openatomOn ) { 
01538         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01539         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01540       } else {
01541         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01542       }
01543       #else  // OPENATOM_VERSION
01544       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01545       #endif // OPENATOM_VERSION
01546   }
01547 
01548   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01549   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01550   if ( local_size < local_size_2 ) local_size = local_size_2;
01551   qgrid = new float[local_size*numGrids];
01552   if ( numGridPes > 1 || numTransPes > 1 ) {
01553     kgrid = new float[local_size*numGrids];
01554   } else {
01555     kgrid = qgrid;
01556   }
01557   qgrid_size = local_size;
01558 
01559   if ( myGridPe >= 0 ) {
01560   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01561   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01562   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01563   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01564   }
01565 
01566   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01567 #ifdef NAMD_FFTW
01568   CmiLock(fftw_plan_lock);
01569 #ifdef NAMD_FFTW_3
01570   work = new fftwf_complex[n[0]];
01571   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01572   if ( myGridPe >= 0 ) {
01573     forward_plan_yz=new fftwf_plan[numGrids];
01574     backward_plan_yz=new fftwf_plan[numGrids];
01575   }
01576   if ( myTransPe >= 0 ) {
01577     forward_plan_x=new fftwf_plan[numGrids];
01578     backward_plan_x=new fftwf_plan[numGrids];
01579   }
01580   /* need one plan per grid */
01581   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01582   if ( myGridPe >= 0 ) {
01583     for( int g=0; g<numGrids; g++)
01584       {
01585         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01586                                                      localInfo[myGridPe].nx,
01587                                                      qgrid + qgrid_size * g,
01588                                                      NULL,
01589                                                      1,
01590                                                      myGrid.dim2 * myGrid.dim3,
01591                                                      (fftwf_complex *) 
01592                                                      (qgrid + qgrid_size * g),
01593                                                      NULL,
01594                                                      1,
01595                                                      myGrid.dim2 * (myGrid.dim3/2),
01596                                                      fftwFlags);
01597       }
01598   }
01599   int zdim = myGrid.dim3;
01600   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01601   if ( ! CkMyPe() ) iout << " 2..." << endi;
01602   if ( myTransPe >= 0 ) {
01603     for( int g=0; g<numGrids; g++)
01604       {
01605 
01606         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01607                                                 (fftwf_complex *)
01608                                                 (kgrid+qgrid_size*g),
01609                                                 NULL,
01610                                                 xStride,
01611                                                 1,
01612                                                 (fftwf_complex *)
01613                                                 (kgrid+qgrid_size*g),
01614                                                 NULL,
01615                                                 xStride,
01616                                                 1,
01617                                                 FFTW_FORWARD,fftwFlags);
01618         
01619       }
01620   }
01621   if ( ! CkMyPe() ) iout << " 3..." << endi;
01622   if ( myTransPe >= 0 ) {
01623     for( int g=0; g<numGrids; g++)
01624       {
01625         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01626                                                  (fftwf_complex *)
01627                                                  (kgrid+qgrid_size*g),
01628                                                  NULL,
01629                                                  xStride,
01630                                                  1,
01631                                                  (fftwf_complex *)
01632                                                  (kgrid+qgrid_size*g),
01633                                                  NULL,
01634                                                  xStride,
01635                                                  1,
01636                                                  FFTW_BACKWARD, fftwFlags);
01637 
01638       }
01639   }
01640   if ( ! CkMyPe() ) iout << " 4..." << endi;
01641   if ( myGridPe >= 0 ) {
01642     for( int g=0; g<numGrids; g++)
01643       {
01644         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01645                                                       localInfo[myGridPe].nx,
01646                                                       (fftwf_complex *)
01647                                                       (qgrid + qgrid_size * g),
01648                                                       NULL,
01649                                                       1,
01650                                                       myGrid.dim2*(myGrid.dim3/2),
01651                                                       qgrid + qgrid_size * g,
01652                                                       NULL,
01653                                                       1,
01654                                                       myGrid.dim2 * myGrid.dim3,
01655                                                       fftwFlags);
01656       }
01657   }
01658   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01659 
01660 #else
01661   work = new fftw_complex[n[0]];
01662 
01663   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01664   if ( myGridPe >= 0 ) {
01665   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01666         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01667         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01668   }
01669   if ( ! CkMyPe() ) iout << " 2..." << endi;
01670   if ( myTransPe >= 0 ) {
01671       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01672         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01673         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01674         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01675   }
01676   if ( ! CkMyPe() ) iout << " 3..." << endi;
01677   if ( myTransPe >= 0 ) {
01678   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01679         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01680         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01681         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01682   }
01683   if ( ! CkMyPe() ) iout << " 4..." << endi;
01684   if ( myGridPe >= 0 ) {
01685   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01686         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01687         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01688   }
01689   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01690 #endif
01691   CmiUnlock(fftw_plan_lock);
01692 #else
01693   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01694 #endif
01695 
01696   if ( myGridPe >= 0 && numSources == 0 )
01697                 NAMD_bug("PME grid elements exist without sources.");
01698   grid_count = numSources;
01699   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01700   trans_count = numGridPes;
01701 }
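
The x-direction plans above use FFTW's advanced ("plan many") interface: howmany = xStride interleaved transforms of length n[0], each reading elements xStride apart, with consecutive transforms offset by one element, executed in place on the transposed k-space grid. A minimal standalone sketch of that layout (sizes and flags here are illustrative, not taken from NAMD):

#include <fftw3.h>
#include <cstring>

int main() {
  const int N = 8;          // transform length, playing the role of n[0]
  const int stride = 4;     // stands in for xStride = ny_after_transpose*(dim3/2)
  int n[1] = { N };

  fftwf_complex *data =
      (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex) * N * stride);
  std::memset(data, 0, sizeof(fftwf_complex) * N * stride);

  // "stride" interleaved transforms: elements of one transform are `stride`
  // apart, consecutive transforms start one element apart, done in place.
  fftwf_plan p = fftwf_plan_many_dft(1, n, stride,
                                     data, NULL, stride, 1,
                                     data, NULL, stride, 1,
                                     FFTW_FORWARD, FFTW_ESTIMATE);
  fftwf_execute(p);

  fftwf_destroy_plan(p);
  fftwf_free(data);
  return 0;
}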

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2741 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02741                                         {
02742 
02743   noWorkCount = 0;
02744   doWorkCount = 0;
02745   ungridForcesCount = 0;
02746 
02747   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02748 
02749   SimParameters *simParams = Node::Object()->simParameters;
02750 
02751   strayChargeErrors = 0;
02752 
02753 #ifdef NAMD_CUDA
02754  PatchMap *patchMap = PatchMap::Object();
02755  int pe = master_pe = CkNodeFirst(CkMyNode());
02756  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02757     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02758     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02759     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02760     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02761     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02762         && pe != deviceCUDA->getMasterPe() ) {
02763       master_pe = pe;
02764     }
02765  }
02766  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02767    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02768  }
02769 
02770  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02771  bool cudaFirst = 1;
02772  if ( offload ) {
02773   CmiLock(cuda_lock);
02774   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02775  }
02776 
02777  if ( cudaFirst ) {
02778   nodePmeMgr->master_pe = master_pe;
02779   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02780  }
02781 #endif
02782 
02783   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02784   fsize = myGrid.K1 * myGrid.dim2;
02785   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02786 #ifdef NAMD_CUDA
02787  if ( ! offload )
02788 #endif
02789  {
02790   q_arr = new float*[fsize*numGrids];
02791   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02792   q_list = new float*[fsize*numGrids];
02793   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02794   q_count = 0;
02795  }
02796 
02797 #ifdef NAMD_CUDA
02798  if ( cudaFirst || ! offload ) {
02799 #endif
02800   f_arr = new char[fsize*numGrids];
02801   // memset to non-zero value has race condition on BlueGene/Q
02802   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02803   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02804 
02805   for ( int g=0; g<numGrids; ++g ) {
02806     char *f = f_arr + g*fsize;
02807     if ( usePencils ) {
02808       int K1 = myGrid.K1;
02809       int K2 = myGrid.K2;
02810       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02811       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02812       int dim2 = myGrid.dim2;
02813       for (int ap=0; ap<numPencilsActive; ++ap) {
02814         int ib = activePencils[ap].i;
02815         int jb = activePencils[ap].j;
02816         int ibegin = ib*block1;
02817         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02818         int jbegin = jb*block2;
02819         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02820         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02821         for ( int i=ibegin; i<iend; ++i ) {
02822           for ( int j=jbegin; j<jend; ++j ) {
02823             f[i*dim2+j] = 0;
02824           }
02825         }
02826       }
02827     } else {
02828       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02829       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02830       for (int pe=0; pe<numGridPes; pe++) {
02831         if ( ! recipPeDest[pe] ) continue;
02832         int start = pe * bsize;
02833         int len = bsize;
02834         if ( start >= qsize ) { start = 0; len = 0; }
02835         if ( start + len > qsize ) { len = qsize - start; }
02836         int zdim = myGrid.dim3;
02837         int fstart = start / zdim;
02838         int flen = len / zdim;
02839         memset(f + fstart, 0, flen*sizeof(char));
02840         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02841       }
02842     }
02843   }
02844 #ifdef NAMD_CUDA
02845  }
02846  if ( offload ) {
02847  if ( cudaFirst ) {
02848 
02849   int f_alloc_count = 0;
02850   for ( int n=fsize, i=0; i<n; ++i ) {
02851     if ( f_arr[i] == 0 ) {
02852       ++f_alloc_count;
02853     }
02854   }
02855   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02856 
02857   q_arr = new float*[fsize*numGrids];
02858   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02859 
02860   float **q_arr_dev_host = new float*[fsize];
02861   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02862 
02863   float **v_arr_dev_host = new float*[fsize];
02864   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02865 
02866   int q_stride = myGrid.K3+myGrid.order-1;
02867   q_data_size = f_alloc_count * q_stride * sizeof(float);
02868   ffz_size = (fsize + q_stride) * sizeof(int);
02869 
02870   // tack ffz onto end of q_data to allow merged transfer
02871   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02872   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02873   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02874   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02875   cudaMalloc((void**) &v_data_dev, q_data_size);
02876   cuda_errcheck("malloc grid data for pme");
02877   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02878   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02879   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02880   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02881   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02882 
02883   f_alloc_count = 0;
02884   for ( int n=fsize, i=0; i<n; ++i ) {
02885     if ( f_arr[i] == 0 ) {
02886       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02887       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02888       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02889       ++f_alloc_count;
02890     } else {
02891       q_arr[i] = 0;
02892       q_arr_dev_host[i] = 0;
02893       v_arr_dev_host[i] = 0;
02894     }
02895   }
02896 
02897   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02898   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02899   delete [] q_arr_dev_host;
02900   delete [] v_arr_dev_host;
02901   delete [] f_arr;
02902   f_arr = new char[fsize + q_stride];
02903   fz_arr = f_arr + fsize;
02904   memset(f_arr, 0, fsize + q_stride);
02905   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02906 
02907   cuda_errcheck("initialize grid data for pme");
02908 
02909   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02910   cuda_errcheck("initialize bspline coefficients for pme");
02911 
02912 #define XCOPY(X) masterPmeMgr->X = X;
02913   XCOPY(bspline_coeffs_dev)
02914   XCOPY(bspline_dcoeffs_dev)
02915   XCOPY(q_arr)
02916   XCOPY(q_arr_dev)
02917   XCOPY(v_arr_dev)
02918   XCOPY(q_data_size)
02919   XCOPY(q_data_host)
02920   XCOPY(q_data_dev)
02921   XCOPY(v_data_dev)
02922   XCOPY(ffz_size)
02923   XCOPY(ffz_host)
02924   XCOPY(ffz_dev)
02925   XCOPY(f_arr)
02926   XCOPY(fz_arr)
02927 #undef XCOPY
02928   //CkPrintf("pe %d init first\n", CkMyPe());
02929  } else { // cudaFirst
02930   //CkPrintf("pe %d init later\n", CkMyPe());
02931 #define XCOPY(X) X = masterPmeMgr->X;
02932   XCOPY(bspline_coeffs_dev)
02933   XCOPY(bspline_dcoeffs_dev)
02934   XCOPY(q_arr)
02935   XCOPY(q_arr_dev)
02936   XCOPY(v_arr_dev)
02937   XCOPY(q_data_size)
02938   XCOPY(q_data_host)
02939   XCOPY(q_data_dev)
02940   XCOPY(v_data_dev)
02941   XCOPY(ffz_size)
02942   XCOPY(ffz_host)
02943   XCOPY(ffz_dev)
02944   XCOPY(f_arr)
02945   XCOPY(fz_arr)
02946 #undef XCOPY
02947  } // cudaFirst
02948   CmiUnlock(cuda_lock);
02949  } else // offload
02950 #endif // NAMD_CUDA
02951  {
02952   fz_arr = new char[myGrid.K3+myGrid.order-1];
02953  }
02954 
02955 #if 0 && USE_PERSISTENT
02956   recvGrid_handle = NULL;
02957 #endif
02958 }
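
Both the pencil branch and the slab branch above split K1 (or K2) grid lines across owners with the same ceil-division rule: block = (K + nBlocks - 1) / nBlocks, with the final range clamped to K and any surplus ranges left empty (as the slab branch does with start/len). A small standalone sketch of that decomposition (the helper name is made up):

#include <algorithm>
#include <cstdio>

void print_blocks(int K, int nBlocks) {
  int block = (K + nBlocks - 1) / nBlocks;   // ceil(K / nBlocks), as block1/block2
  for (int b = 0; b < nBlocks; ++b) {
    int begin = b * block;
    int end = std::min(begin + block, K);    // clamp, as "if ( iend > K1 ) iend = K1;"
    if (begin >= K) { begin = 0; end = 0; }  // surplus owner gets nothing
    std::printf("block %d: [%d, %d)\n", b, begin, end);
  }
}

For example, print_blocks(54, 10) yields nine blocks of six grid lines and one empty block.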

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1705 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01705                                                    {
01706   delete msg;
01707   if ( ! usePencils ) return;
01708 
01709   SimParameters *simParams = Node::Object()->simParameters;
01710 
01711   PatchMap *patchMap = PatchMap::Object();
01712   Lattice lattice = simParams->lattice;
01713   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01714   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01715   BigReal cutoff = simParams->cutoff;
01716   BigReal patchdim = simParams->patchDimension;
01717   int numPatches = patchMap->numPatches();
01718 
01719   pencilActive = new char[xBlocks*yBlocks];
01720   for ( int i=0; i<xBlocks; ++i ) {
01721     for ( int j=0; j<yBlocks; ++j ) {
01722       pencilActive[i*yBlocks+j] = 0;
01723     }
01724   }
01725 
01726   for ( int pid=0; pid < numPatches; ++pid ) {
01727     int pnode = patchMap->node(pid);
01728 #ifdef NAMD_CUDA
01729     if ( offload ) {
01730       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01731     } else
01732 #endif
01733     if ( pnode != CkMyPe() ) continue;
01734 
01735     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01736     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01737 
01738     BigReal minx = patchMap->min_a(pid);
01739     BigReal maxx = patchMap->max_a(pid);
01740     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01741     // min1 (max1) is smallest (largest) grid line for this patch
01742     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01743     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01744 
01745     BigReal miny = patchMap->min_b(pid);
01746     BigReal maxy = patchMap->max_b(pid);
01747     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01748     // min2 (max2) is smallest (largest) grid line for this patch
01749     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01750     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01751 
01752     for ( int i=min1; i<=max1; ++i ) {
01753       int ix = i;
01754       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01755       while ( ix < 0 ) ix += myGrid.K1;
01756       for ( int j=min2; j<=max2; ++j ) {
01757         int jy = j;
01758         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01759         while ( jy < 0 ) jy += myGrid.K2;
01760         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01761       }
01762     }
01763   }
01764 
01765   numPencilsActive = 0;
01766   for ( int i=0; i<xBlocks; ++i ) {
01767     for ( int j=0; j<yBlocks; ++j ) {
01768       if ( pencilActive[i*yBlocks+j] ) {
01769         ++numPencilsActive;
01770 #ifdef NAMD_CUDA
01771         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01772 #endif
01773         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01774       }
01775     }
01776   }
01777   activePencils = new ijpair[numPencilsActive];
01778   numPencilsActive = 0;
01779   for ( int i=0; i<xBlocks; ++i ) {
01780     for ( int j=0; j<yBlocks; ++j ) {
01781       if ( pencilActive[i*yBlocks+j] ) {
01782         activePencils[numPencilsActive++] = ijpair(i,j);
01783       }
01784     }
01785   }
01786   if ( simParams->PMESendOrder ) {
01787     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01788   } else {
01789     Random rand(CkMyPe());
01790     rand.reorder(activePencils,numPencilsActive);
01791   }
01792   //if ( numPencilsActive ) {
01793   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01794   //}
01795 
01796   ungrid_count = numPencilsActive;
01797 }
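
Because a patch near the cell boundary can spill past the grid, the loops above fold every candidate grid line back into [0, K) before deciding which pencil owns it. A minimal sketch of that folding and of the flattened pencilActive index (helper names are made up):

// Fold a possibly out-of-range grid index back into [0, K).
int wrap(int i, int K) {
  while (i >= K) i -= K;
  while (i < 0)  i += K;
  return i;
}

// Flattened index into pencilActive[] for grid point (i, j),
// using the same ceil-division block sizes as block1/block2 above.
int pencil_of(int i, int j, int K1, int K2,
              int block1, int block2, int yBlocks) {
  return (wrap(i, K1) / block1) * yBlocks + (wrap(j, K2) / block2);
}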

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3539 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03539                                         {
03540 #ifdef NAMD_CUDA
03541   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03542   CUDA_POLL(cuda_check_pme_charges,this);
03543 #else
03544   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03545 #endif
03546 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2654 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02654                                     {
02655 #ifdef NAMD_CUDA
02656   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02657   CUDA_POLL(cuda_check_pme_forces,this);
02658 #else
02659   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02660 #endif
02661 }
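
Both poll routines re-arm a timer-driven callback (via CUDA_POLL and CcdCallBacksReset) that asks whether the GPU has reached the recorded event. The check itself presumably comes down to a non-blocking event query; a minimal illustration using the CUDA runtime API (the helper below is not NAMD code):

#include <cuda_runtime.h>

// Returns true once all GPU work recorded before `ev` has completed,
// without blocking the calling (driver) thread.
bool event_done(cudaEvent_t ev) {
  cudaError_t err = cudaEventQuery(ev);
  if (err == cudaSuccess) return true;         // work finished
  if (err == cudaErrorNotReady) return false;  // still running; poll again later
  return false;                                // any other code is a real error
}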

void ComputePmeMgr::procTrans ( PmeTransMsg *  )

Definition at line 2060 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02060                                               {
02061   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02062   if ( trans_count == numGridPes ) {
02063     lattice = msg->lattice;
02064     grid_sequence = msg->sequence;
02065   }
02066 
02067  if ( msg->nx ) {
02068   int zdim = myGrid.dim3;
02069   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02070   int first_pe = nodeInfo.pe_start;
02071   int last_pe = first_pe+nodeInfo.npe-1;
02072   int y_skip = localInfo[myTransPe].y_start_after_transpose
02073              - localInfo[first_pe].y_start_after_transpose;
02074   int ny_msg = localInfo[last_pe].y_start_after_transpose
02075              + localInfo[last_pe].ny_after_transpose
02076              - localInfo[first_pe].y_start_after_transpose;
02077   int ny = localInfo[myTransPe].ny_after_transpose;
02078   int x_start = msg->x_start;
02079   int nx = msg->nx;
02080   for ( int g=0; g<numGrids; ++g ) {
02081     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02082         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02083         nx*ny*zdim*sizeof(float));
02084   }
02085  }
02086 
02087   --trans_count;
02088 
02089   if ( trans_count == 0 ) {
02090     pmeProxyDir[CkMyPe()].gridCalc2();
02091   }
02092 }

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  )

Definition at line 2314 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02314                                                   {
02315   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02316 
02317 #if CMK_BLUEGENEL
02318   CmiNetworkProgressAfter (0);
02319 #endif
02320 
02321   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02322   int first_pe = nodeInfo.pe_start;
02323   int g;
02324 
02325  if ( msg->ny ) {
02326   int zdim = myGrid.dim3;
02327   int last_pe = first_pe+nodeInfo.npe-1;
02328   int x_skip = localInfo[myGridPe].x_start
02329              - localInfo[first_pe].x_start;
02330   int nx_msg = localInfo[last_pe].x_start
02331              + localInfo[last_pe].nx
02332              - localInfo[first_pe].x_start;
02333   int nx = localInfo[myGridPe].nx;
02334   int y_start = msg->y_start;
02335   int ny = msg->ny;
02336   int slicelen = myGrid.K2 * zdim;
02337   int cpylen = ny * zdim;
02338   for ( g=0; g<numGrids; ++g ) {
02339     float *q = qgrid + qgrid_size * g + y_start * zdim;
02340     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02341     for ( int x = 0; x < nx; ++x ) {
02342       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02343       q += slicelen;
02344       qmsg += cpylen;
02345     }
02346   }
02347  }
02348 
02349   --untrans_count;
02350 
02351   if ( untrans_count == 0 ) {
02352     pmeProxyDir[CkMyPe()].gridCalc3();
02353   }
02354 }

void ComputePmeMgr::recvAck ( PmeAckMsg *  )

Definition at line 2457 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02457                                           {
02458   if ( msg ) delete msg;
02459 #ifdef NAMD_CUDA
02460   if ( offload ) {
02461     CmiLock(cuda_lock);
02462     if ( ungrid_count == 0 ) {
02463       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02464     }
02465     int uc = --ungrid_count;
02466     CmiUnlock(cuda_lock);
02467 
02468     if ( uc == 0 ) {
02469       pmeProxyDir[master_pe].ungridCalc();
02470     }
02471     return;
02472   }
02473 #endif
02474   --ungrid_count;
02475 
02476   if ( ungrid_count == 0 ) {
02477     pmeProxyDir[CkMyPe()].ungridCalc();
02478   }
02479 }

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 791 of file ComputePme.C.

00792                                                                        {
00793   xPencil = x;  yPencil = y;  zPencil = z;
00794   
00795     if(CmiMyRank()==0)
00796     {
00797       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00798       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00799       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00800     }
00801 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3548 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03548                                         {
03549   chargeGridReady(*saved_lattice,saved_sequence);
03550 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  )

Definition at line 1839 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01839                                             {
01840   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01841   if ( grid_count == 0 ) {
01842     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01843   }
01844   if ( grid_count == numSources ) {
01845     lattice = msg->lattice;
01846     grid_sequence = msg->sequence;
01847   }
01848 
01849   int zdim = myGrid.dim3;
01850   int zlistlen = msg->zlistlen;
01851   int *zlist = msg->zlist;
01852   float *qmsg = msg->qgrid;
01853   for ( int g=0; g<numGrids; ++g ) {
01854     char *f = msg->fgrid + fgrid_len * g;
01855     float *q = qgrid + qgrid_size * g;
01856     for ( int i=0; i<fgrid_len; ++i ) {
01857       if ( f[i] ) {
01858         for ( int k=0; k<zlistlen; ++k ) {
01859           q[zlist[k]] += *(qmsg++);
01860         }
01861       }
01862       q += zdim;
01863     }
01864   }
01865 
01866   gridmsg_reuse[numSources-grid_count] = msg;
01867   --grid_count;
01868 
01869   if ( grid_count == 0 ) {
01870     pmeProxyDir[CkMyPe()].gridCalc1();
01871     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01872   }
01873 }

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  )

Definition at line 3043 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03043                                                  {
03044   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03045   for ( int g=0; g<numGrids; ++g ) {
03046     evir[g] += msg->evir[g];
03047   }
03048   delete msg;
03049   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03050   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03051 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  )

Definition at line 2042 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02042                                                           {
02043   procTrans(msg->msg);
02044   CmiLock(msg->lock);
02045   int count = --(*msg->count);
02046   CmiUnlock(msg->lock);
02047   if ( count == 0 ) {
02048     CmiDestroyLock(msg->lock);
02049     delete msg->count;
02050     delete msg->msg;
02051   }
02052   delete msg;
02053 }
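
recvSharedTrans() above and recvSharedUntrans() below share one pattern: every PE on the node processes the same payload, and a lock-protected counter lets the last finisher free the shared state. A stripped-down sketch of that pattern, with placeholder types standing in for the actual NAMD message classes:

#include <converse.h>   // CmiNodeLock, CmiLock, CmiUnlock, CmiDestroyLock

struct SharedWork {      // stands in for PmeSharedTransMsg / PmeSharedUntransMsg
  void *payload;         // the shared message processed by every rank
  int *count;            // how many ranks still have to process it
  CmiNodeLock lock;      // protects *count
};

template <class ProcessFn>
void process_shared(SharedWork *w, ProcessFn process) {
  process(w->payload);             // e.g. procTrans() / procUntrans()
  CmiLock(w->lock);
  int remaining = --(*w->count);
  CmiUnlock(w->lock);
  if (remaining == 0) {            // last rank cleans up the shared state
    CmiDestroyLock(w->lock);
    delete w->count;
    // the real code also deletes the shared payload here
  }
  delete w;                        // each rank owns its own wrapper
}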

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  )

Definition at line 2296 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02296                                                               {
02297   procUntrans(msg->msg);
02298   CmiLock(msg->lock);
02299   int count = --(*msg->count);
02300   CmiUnlock(msg->lock);
02301   if ( count == 0 ) {
02302     CmiDestroyLock(msg->lock);
02303     delete msg->count;
02304     delete msg->msg;
02305   }
02306   delete msg;
02307 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  )

Definition at line 2055 of file ComputePme.C.

References procTrans().

02055                                               {
02056   procTrans(msg);
02057   delete msg;
02058 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  )

Definition at line 2442 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02442                                               {
02443   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02444 #ifdef NAMD_CUDA
02445   if ( ! offload )  // would need lock
02446 #endif
02447   if ( ungrid_count == 0 ) {
02448     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02449   }
02450 
02451   if ( usePencils ) copyPencils(msg);
02452   else copyResults(msg);
02453   delete msg;
02454   recvAck(0);
02455 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  )

Definition at line 2309 of file ComputePme.C.

References procUntrans().

02309                                                   {
02310   procUntrans(msg);
02311   delete msg;
02312 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3525 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03525                                         {
03526   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03527     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03528     int cs = mgr->pmeComputes.size();
03529     if ( cs ) {
03530       mgr->ungridForcesCount = cs;
03531       mgr->recipEvirCount = mgr->recipEvirClients;
03532       masterPmeMgr->chargeGridSubmittedCount++;
03533     }
03534   }
03535   pmeProxy[master_pe].recvChargeGridReady();
03536 }

void ComputePmeMgr::sendData ( Lattice & ,
int  sequence 
)

Definition at line 3962 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03962                                                            {
03963 
03964   sendDataHelper_lattice = &lattice;
03965   sendDataHelper_sequence = sequence;
03966   sendDataHelper_sourcepe = CkMyPe();
03967   sendDataHelper_errors = strayChargeErrors;
03968   strayChargeErrors = 0;
03969 
03970 #ifdef NAMD_CUDA
03971   if ( offload ) {
03972     for ( int i=0; i < numGridPes; ++i ) {
03973       int pe = gridPeOrder[i];  // different order
03974       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03975 #if CMK_MULTICORE
03976       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03977       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03978 #else
03979       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03980 #endif
03981     }
03982   } else
03983 #endif
03984   {
03985     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03986   }
03987  
03988 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3949 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03949                                            {
03950   nodePmeMgr->sendDataHelper(iter);
03951 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3840 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03840                                                                                                               {
03841 
03842   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03843 
03844   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03845 
03846   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03847   for (int j=first; j<=last; j++) {
03848     int pe = gridPeOrder[j];  // different order
03849     if ( ! recipPeDest[pe] && ! errors ) continue;
03850     int start = pe * bsize;
03851     int len = bsize;
03852     if ( start >= qsize ) { start = 0; len = 0; }
03853     if ( start + len > qsize ) { len = qsize - start; }
03854     int zdim = myGrid.dim3;
03855     int fstart = start / zdim;
03856     int flen = len / zdim;
03857     int fcount = 0;
03858     int i;
03859 
03860     int g;
03861     for ( g=0; g<numGrids; ++g ) {
03862       char *f = f_arr + fstart + g*fsize;
03863 #ifdef NAMD_CUDA
03864      if ( offload ) {
03865       int errcount = 0;
03866       for ( i=0; i<flen; ++i ) {
03867         f[i] = ffz_host[fstart+i];
03868         fcount += f[i];
03869         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03870       }
03871       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03872      } else
03873 #endif
03874       for ( i=0; i<flen; ++i ) {
03875         fcount += f[i];
03876       }
03877       if ( ! recipPeDest[pe] ) {
03878         int errfound = 0;
03879         for ( i=0; i<flen; ++i ) {
03880           if ( f[i] == 3 ) {
03881             errfound = 1;
03882             break;
03883           }
03884         }
03885         if ( errfound ) {
03886           iout << iERROR << "Stray PME grid charges detected: "
03887                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03888           int iz = -1;
03889           for ( i=0; i<flen; ++i ) {
03890             if ( f[i] == 3 ) {
03891               f[i] = 2;
03892               int jz = (i+fstart)/myGrid.K2;
03893               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03894             }
03895           }
03896           iout << "\n" << endi;
03897         }
03898       }
03899     }
03900 
03901 #ifdef NETWORK_PROGRESS
03902     CmiNetworkProgress();
03903 #endif
03904 
03905     if ( ! recipPeDest[pe] ) continue;
03906 
03907     int zlistlen = 0;
03908     for ( i=0; i<myGrid.K3; ++i ) {
03909       if ( fz_arr[i] ) ++zlistlen;
03910     }
03911 
03912     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03913                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03914 
03915     msg->sourceNode = sourcepe;
03916     msg->lattice = lattice;
03917     msg->start = fstart;
03918     msg->len = flen;
03919     msg->zlistlen = zlistlen;
03920     int *zlist = msg->zlist;
03921     zlistlen = 0;
03922     for ( i=0; i<myGrid.K3; ++i ) {
03923       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03924     }
03925     float *qmsg = msg->qgrid;
03926     for ( g=0; g<numGrids; ++g ) {
03927       char *f = f_arr + fstart + g*fsize;
03928       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03929       float **q = q_arr + fstart + g*fsize;
03930       for ( i=0; i<flen; ++i ) {
03931         if ( f[i] ) {
03932           for (int h=0; h<myGrid.order-1; ++h) {
03933             q[i][h] += q[i][myGrid.K3+h];
03934           }
03935           for ( int k=0; k<zlistlen; ++k ) {
03936             *(qmsg++) = q[i][zlist[k]];
03937           }
03938         }
03939       }
03940     }
03941 
03942     msg->sequence = compute_sequence;
03943     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03944     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03945   }
03946 
03947 }
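
The message built above is sparse in two ways: fgrid flags which (x,y) columns carry any charge, and zlist records which of the K3 z planes are occupied anywhere on this PE, so only those planes of each flagged column are shipped (after the order-1 wrap planes at K3+h are folded back in). A small sketch of the z-plane packing (helper names are made up):

#include <vector>

// Collect the indices of occupied z planes, as the fz_arr scan above does.
std::vector<int> build_zlist(const char *fz_arr, int K3) {
  std::vector<int> zlist;
  for (int k = 0; k < K3; ++k)
    if (fz_arr[k]) zlist.push_back(k);
  return zlist;
}

// Pack one flagged (x,y) column into the outgoing buffer, advancing the
// caller's write cursor just like *(qmsg++) in the loop above.
void pack_column(const float *q_column, const std::vector<int> &zlist,
                 float *&qmsg) {
  for (int k : zlist) *(qmsg++) = q_column[k];
}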

void ComputePmeMgr::sendPencils ( Lattice & ,
int  sequence 
)

Definition at line 3735 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03735                                                               {
03736 
03737   sendDataHelper_lattice = &lattice;
03738   sendDataHelper_sequence = sequence;
03739   sendDataHelper_sourcepe = CkMyPe();
03740 
03741 #ifdef NAMD_CUDA
03742   if ( offload ) {
03743     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03744 #if CMK_MULTICORE
03745       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03746       int ib = activePencils[ap].i;
03747       int jb = activePencils[ap].j;
03748       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03749       pmeProxy[destproc].sendPencilsHelper(ap);
03750 #else
03751       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03752 #endif
03753     }
03754   } else
03755 #endif
03756   {
03757     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03758   }
03759 
03760   if ( strayChargeErrors ) {
03761    strayChargeErrors = 0;
03762    iout << iERROR << "Stray PME grid charges detected: "
03763         << CkMyPe() << " sending to (x,y)";
03764    int K1 = myGrid.K1;
03765    int K2 = myGrid.K2;
03766    int dim2 = myGrid.dim2;
03767    int block1 = myGrid.block1;
03768    int block2 = myGrid.block2;
03769    for (int ib=0; ib<xBlocks; ++ib) {
03770     for (int jb=0; jb<yBlocks; ++jb) {
03771      int ibegin = ib*block1;
03772      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03773      int jbegin = jb*block2;
03774      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03775      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03776 
03777      for ( int g=0; g<numGrids; ++g ) {
03778        char *f = f_arr + g*fsize;
03779        if ( ! pencilActive[ib*yBlocks+jb] ) {
03780            for ( int i=ibegin; i<iend; ++i ) {
03781             for ( int j=jbegin; j<jend; ++j ) {
03782              if ( f[i*dim2+j] == 3 ) {
03783                f[i*dim2+j] = 2;
03784                iout << " (" << i << "," << j << ")";
03785              }
03786             }
03787            }
03788        }
03789      }
03790     }
03791    }
03792    iout << "\n" << endi;
03793   }
03794  
03795 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3722 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03722                                               {
03723   nodePmeMgr->sendPencilsHelper(iter);
03724 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe 
)

Definition at line 3580 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03580                                                                                                      {
03581 
03582   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03583 
03584 #if 0 && USE_PERSISTENT
03585     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03586 #endif
03587   int K1 = myGrid.K1;
03588   int K2 = myGrid.K2;
03589   int dim2 = myGrid.dim2;
03590   int dim3 = myGrid.dim3;
03591   int block1 = myGrid.block1;
03592   int block2 = myGrid.block2;
03593 
03594   // int savedMessages = 0;
03595   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03596 
03597   for (int ap=first; ap<=last; ++ap) {
03598     int ib = activePencils[ap].i;
03599     int jb = activePencils[ap].j;
03600     int ibegin = ib*block1;
03601     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03602     int jbegin = jb*block2;
03603     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03604     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03605 
03606     int fcount = 0;
03607     for ( int g=0; g<numGrids; ++g ) {
03608       char *f = f_arr + g*fsize;
03609 #ifdef NAMD_CUDA
03610      if ( offload ) {
03611       int errcount = 0;
03612       for ( int i=ibegin; i<iend; ++i ) {
03613        for ( int j=jbegin; j<jend; ++j ) {
03614         int k = i*dim2+j;
03615         f[k] = ffz_host[k];
03616         fcount += f[k];
03617         if ( ffz_host[k] & ~1 ) ++errcount;
03618        }
03619       }
03620       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03621      } else
03622 #endif
03623       for ( int i=ibegin; i<iend; ++i ) {
03624        for ( int j=jbegin; j<jend; ++j ) {
03625         fcount += f[i*dim2+j];
03626        }
03627       }
03628     }
03629 
03630 #ifdef NETWORK_PROGRESS
03631     CmiNetworkProgress();
03632 #endif
03633 
03634     if ( ! pencilActive[ib*yBlocks+jb] )
03635       NAMD_bug("PME activePencils list inconsistent");
03636 
03637     int zlistlen = 0;
03638     for ( int i=0; i<myGrid.K3; ++i ) {
03639       if ( fz_arr[i] ) ++zlistlen;
03640     }
03641 
03642     int hd = ( fcount? 1 : 0 );  // has data?
03643     // if ( ! hd ) ++savedMessages;
03644 
03645     
03646     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03647         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03648     msg->sourceNode = sourcepe;
03649     msg->hasData = hd;
03650     msg->lattice = lattice;
03651    if ( hd ) {
03652 #if 0
03653     msg->start = fstart;
03654     msg->len = flen;
03655 #else
03656     msg->start = -1;   // obsolete?
03657     msg->len = -1;   // obsolete?
03658 #endif
03659     msg->zlistlen = zlistlen;
03660     int *zlist = msg->zlist;
03661     zlistlen = 0;
03662     for ( int i=0; i<myGrid.K3; ++i ) {
03663       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03664     }
03665     char *fmsg = msg->fgrid;
03666     float *qmsg = msg->qgrid;
03667     for ( int g=0; g<numGrids; ++g ) {
03668       char *f = f_arr + g*fsize;
03669       float **q = q_arr + g*fsize;
03670       for ( int i=ibegin; i<iend; ++i ) {
03671        for ( int j=jbegin; j<jend; ++j ) {
03672         *(fmsg++) = f[i*dim2+j];
03673         if( f[i*dim2+j] ) {
03674           for (int h=0; h<myGrid.order-1; ++h) {
03675             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03676           }
03677           for ( int k=0; k<zlistlen; ++k ) {
03678             *(qmsg++) = q[i*dim2+j][zlist[k]];
03679           }
03680         }
03681        }
03682       }
03683     }
03684    }
03685 
03686     msg->sequence = compute_sequence;
03687     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03688     CmiEnableUrgentSend(1);
03689 #if USE_NODE_PAR_RECEIVE
03690     msg->destElem=CkArrayIndex3D(ib,jb,0);
03691     CProxy_PmePencilMap lzm = npMgr->zm;
03692     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03693     int destnode = CmiNodeOf(destproc);
03694     
03695 #if  0 
03696     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03697 #endif
03698     pmeNodeProxy[destnode].recvZGrid(msg);
03699 #if 0 
03700     CmiUsePersistentHandle(NULL, 0);
03701 #endif
03702 #else
03703 #if 0 
03704     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03705 #endif
03706     zPencil(ib,jb,0).recvGrid(msg);
03707 #if 0 
03708     CmiUsePersistentHandle(NULL, 0);
03709 #endif
03710 #endif
03711     CmiEnableUrgentSend(0);
03712   }
03713 
03714 
03715   // if ( savedMessages ) {
03716   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03717   // }
03718 
03719 }

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1951 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01951                                   {
01952 
01953   untrans_count = numTransPes;
01954 
01955 #if     CMK_SMP && USE_CKLOOP
01956   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01957   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01958     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01959   } else
01960 #endif
01961   {
01962     sendTransSubset(0, numTransNodes-1);
01963   }
01964 
01965 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1936 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01936                                          {
01937   sendTransBarrier_received += 1;
01938   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01939   if ( sendTransBarrier_received < numGridPes ) return;
01940   sendTransBarrier_received = 0;
01941   for ( int i=0; i<numGridPes; ++i ) {
01942     pmeProxyDir[gridPeMap[i]].sendTrans();
01943   }
01944 }

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1967 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01967                                                        {
01968   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01969 
01970   // send data for transpose
01971   int zdim = myGrid.dim3;
01972   int nx = localInfo[myGridPe].nx;
01973   int x_start = localInfo[myGridPe].x_start;
01974   int slicelen = myGrid.K2 * zdim;
01975 
01976   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01977 
01978 #if CMK_BLUEGENEL
01979   CmiNetworkProgressAfter (0);
01980 #endif
01981 
01982   for (int j=first; j<=last; j++) {
01983     int node = transNodeOrder[j];  // different order on each node
01984     int pe = transNodeInfo[node].pe_start;
01985     int npe = transNodeInfo[node].npe;
01986     int totlen = 0;
01987     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01988       LocalPmeInfo &li = localInfo[pe];
01989       int cpylen = li.ny_after_transpose * zdim;
01990       totlen += cpylen;
01991     }
01992     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01993                                 PRIORITY_SIZE) PmeTransMsg;
01994     newmsg->sourceNode = myGridPe;
01995     newmsg->lattice = lattice;
01996     newmsg->x_start = x_start;
01997     newmsg->nx = nx;
01998     for ( int g=0; g<numGrids; ++g ) {
01999       float *qmsg = newmsg->qgrid + nx * totlen * g;
02000       pe = transNodeInfo[node].pe_start;
02001       for (int i=0; i<npe; ++i, ++pe) {
02002         LocalPmeInfo &li = localInfo[pe];
02003         int cpylen = li.ny_after_transpose * zdim;
02004         if ( node == myTransNode ) {
02005           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
02006           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
02007         }
02008         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
02009         for ( int x = 0; x < nx; ++x ) {
02010           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02011           q += slicelen;
02012           qmsg += cpylen;
02013         }
02014       }
02015     }
02016     newmsg->sequence = grid_sequence;
02017     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02018     if ( node == myTransNode ) newmsg->nx = 0;
02019     if ( npe > 1 ) {
02020       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02021       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02022     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02023   }
02024 }
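
The inner copy loop above is a strided gather: out of a slab stored as nx rows of K2*dim3 floats, it pulls the contiguous ny*dim3-float y-range belonging to one transpose PE from each row and packs the rows back to back into the message. A minimal sketch of that gather (assuming the cpylen/slicelen definitions used above):

#include <cstring>

// dst: packed [nx][ny][zdim] message buffer
// src: start of this PE's y-range inside the [nx][K2][zdim] slab
// cpylen = ny*zdim, slicelen = K2*zdim
void gather_rows(float *dst, const float *src,
                 int nx, int cpylen, int slicelen) {
  for (int x = 0; x < nx; ++x) {
    std::memcpy(dst, src, cpylen * sizeof(float));
    src += slicelen;   // jump to the same y-range of the next x slice
    dst += cpylen;     // destination stays packed
  }
}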

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2382 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02382                                    {
02383 
02384 #if     CMK_SMP && USE_CKLOOP
02385   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02386   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02387     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02388   } else
02389 #endif
02390   {
02391     sendUngridSubset(0, numSources-1);
02392   }
02393 
02394   grid_count = numSources;
02395   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02396 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2398 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02398                                                         {
02399 
02400 #ifdef NAMD_CUDA
02401   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02402 #else
02403   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02404 #endif
02405 
02406   for ( int j=first; j<=last; ++j ) {
02407     // int msglen = qgrid_len;
02408     PmeGridMsg *newmsg = gridmsg_reuse[j];
02409     int pe = newmsg->sourceNode;
02410     int zdim = myGrid.dim3;
02411     int flen = newmsg->len;
02412     int fstart = newmsg->start;
02413     int zlistlen = newmsg->zlistlen;
02414     int *zlist = newmsg->zlist;
02415     float *qmsg = newmsg->qgrid;
02416     for ( int g=0; g<numGrids; ++g ) {
02417       char *f = newmsg->fgrid + fgrid_len * g;
02418       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02419       for ( int i=0; i<flen; ++i ) {
02420         if ( f[i] ) {
02421           for ( int k=0; k<zlistlen; ++k ) {
02422             *(qmsg++) = q[zlist[k]];
02423           }
02424         }
02425         q += zdim;
02426       }
02427     }
02428     newmsg->sourceNode = myGridPe;
02429 
02430     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02431     CmiEnableUrgentSend(1);
02432 #ifdef NAMD_CUDA
02433     if ( offload ) {
02434       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02435     } else
02436 #endif
02437     pmeProxyDir[pe].recvUngrid(newmsg);
02438     CmiEnableUrgentSend(0);
02439   }
02440 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2195 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02195                                     {
02196 
02197   trans_count = numGridPes;
02198 
02199   { // send energy and virial
02200     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02201     for ( int g=0; g<numGrids; ++g ) {
02202       newmsg->evir[g] = recip_evir2[g];
02203     }
02204     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02205     CmiEnableUrgentSend(1);
02206     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02207     CmiEnableUrgentSend(0);
02208   }
02209 
02210 #if     CMK_SMP && USE_CKLOOP
02211   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02212   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02213     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02214   } else
02215 #endif
02216   {
02217     sendUntransSubset(0, numGridNodes-1);
02218   }
02219 
02220 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2222 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02222                                                          {
02223 
02224   int zdim = myGrid.dim3;
02225   int y_start = localInfo[myTransPe].y_start_after_transpose;
02226   int ny = localInfo[myTransPe].ny_after_transpose;
02227   int slicelen = myGrid.K2 * zdim;
02228 
02229   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02230 
02231 #if CMK_BLUEGENEL
02232   CmiNetworkProgressAfter (0);
02233 #endif
02234 
02235   // send data for reverse transpose
02236   for (int j=first; j<=last; j++) {
02237     int node = gridNodeOrder[j];  // different order on each node
02238     int pe = gridNodeInfo[node].pe_start;
02239     int npe = gridNodeInfo[node].npe;
02240     int totlen = 0;
02241     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02242       LocalPmeInfo &li = localInfo[pe];
02243       int cpylen = li.nx * zdim;
02244       totlen += cpylen;
02245     }
02246     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02247     newmsg->sourceNode = myTransPe;
02248     newmsg->y_start = y_start;
02249     newmsg->ny = ny;
02250     for ( int g=0; g<numGrids; ++g ) {
02251       float *qmsg = newmsg->qgrid + ny * totlen * g;
02252       pe = gridNodeInfo[node].pe_start;
02253       for (int i=0; i<npe; ++i, ++pe) {
02254         LocalPmeInfo &li = localInfo[pe];
02255         if ( node == myGridNode ) {
02256           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02257           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02258           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02259           int cpylen = ny * zdim;
02260           for ( int x = 0; x < li.nx; ++x ) {
02261             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02262             q += cpylen;
02263             qmsg += slicelen;
02264           }
02265         } else {
02266           CmiMemcpy((void*)qmsg,
02267                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02268                 li.nx*ny*zdim*sizeof(float));
02269           qmsg += li.nx*ny*zdim;
02270         }
02271       }
02272     }
02273     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02274     if ( node == myGridNode ) newmsg->ny = 0;
02275     if ( npe > 1 ) {
02276       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02277       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02278     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02279   }
02280 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4222 of file ComputePme.C.

References NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04222                                      {
04223 
04224     SimParameters *simParams = Node::Object()->simParameters;
04225 
04226     for ( int g=0; g<numGrids; ++g ) {
04227       float scale = 1.;
04228       if (alchOn) {
04229         BigReal elecLambdaUp, elecLambdaDown;
04230         if( simParams->alchFepWhamOn ) {
04231           if( simParams->alchFepElecOn ) {
04232             elecLambdaUp = simParams->alchElecLambda;
04233             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04234           }
04235           else {
04236             elecLambdaUp = 0.0;
04237             elecLambdaDown = 1.0;
04238           }
04239         }
04240         else {
04241           // alchLambda set on each step in ComputePme::ungridForces()
04242           if ( alchLambda < 0 || alchLambda > 1 ) {
04243             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04244           }
04245           elecLambdaUp = simParams->getElecLambda(alchLambda);
04246           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04247         }
04248         if ( g == 0 ) scale = elecLambdaUp;
04249         else if ( g == 1 ) scale = elecLambdaDown;
04250         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04251         if (alchDecouple) {
04252           if ( g == 2 ) scale = 1-elecLambdaUp;
04253           else if ( g == 3 ) scale = 1-elecLambdaDown;
04254           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04255         }
04256       } else if ( lesOn ) {
04257         scale = 1.0 / lesFactor;
04258       } else if ( pairOn ) {
04259         scale = ( g == 0 ? 1. : -1. );
04260       }
04261       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04262       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04263       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04264       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04265       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04266       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04267       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04268       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04269       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04270       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04271 
04272       float scale2 = 0.;
04273 
04274       // why is this declared/defined again here?
04275       SimParameters *simParams = Node::Object()->simParameters;
04276 
04277       if (alchFepOn) {
04278         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04279         if(simParams->alchFepWhamOn) {
04280           if(simParams->alchFepElecOn) {
04281             elecLambda2Up = simParams->alchElecLambda;
04282             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04283           }
04284           else {
04285             elecLambda2Up = 0.0;
04286             elecLambda2Down =  1.0;
04287           }
04288         }
04289         else {
04290           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04291           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04292         }
04293         
04294         if ( g == 0 ) scale2 = elecLambda2Up;
04295         else if ( g == 1 ) scale2 = elecLambda2Down;
04296         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04297         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04298         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04299         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04300       }
04301       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04302         if( g==0 )      scale2 = scale + 1.0;
04303         else if( g==1 ) scale2 = scale - 1.0;
04304         else if( g==2 ) scale2 = scale - 1.0;
04305         else if( g==3 ) scale2 = scale + 1.0;
04306       }
04307       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04308       
04309       if (alchThermIntOn) {
04310         
04311         // no decoupling:
04312         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04313         // (interactions between all atoms [partition 0 OR partition 1], 
04314         // minus all [within partition 0])
04315         // U = elecLambdaUp * (U[0] - U[2])
04316         // dU/dl = U[0] - U[2];
04317         
04318         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04319         // (interactions between all atoms [partition 0 OR partition 2], 
04320         // minus all [within partition 0])
04321         // U = elecLambdaDown * (U[1] - U[2])
04322         // dU/dl = U[1] - U[2];
04323 
04324         // alchDecouple:
04325         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04326         // (interactions between all atoms [partition 0 OR partition 1]
04327         // minus all [within partition 1] minus all [within partition 0]
04328         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04329         // dU/dl = U[0] - U[2] - U[4];
04330 
04331         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04332         // (interactions between all atoms [partition 0 OR partition 2]
04333         // minus all [within partition 2] minus all [within partition 0]
04334         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04335         // dU/dl = U[1] - U[3] - U[4];
04336         
04337         
04338         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04339         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04340         if (!alchDecouple) {
04341           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04342           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04343         }
04344         else {  // alchDecouple
04345           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04346           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04347           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04348           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04349         }
04350       }
04351     }
04352 
04353     alchLambda = -1.;  // illegal value to catch if not updated
04354 
04355     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04356     reduction->submit();
04357 
04358   for ( int i=0; i<heldComputes.size(); ++i ) {
04359     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04360   }
04361   heldComputes.resize(0);
04362 }
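
The thermodynamic-integration bookkeeping spelled out in the comments above can be condensed into a short sketch. The snippet below is not NAMD source: the struct and function names are illustrative only, and U[g] stands for the per-grid PME energy evir[g][0] accumulated into REDUCTION_ELECT_ENERGY_PME_TI_1 / _TI_2 in the listing.

// Minimal sketch (not from ComputePme.C) of the TI derivative accumulation
// described in the comments above: given per-grid PME energies U[0..4],
// build dU/d(lambda_up) and dU/d(lambda_down) the same way the reduction
// items are filled in.
struct TIDerivatives { double dU_dlUp, dU_dlDown; };

TIDerivatives pmeTIDerivatives(const double U[5], bool alchDecouple) {
  TIDerivatives d = { U[0], U[1] };   // g == 0 feeds TI_1, g == 1 feeds TI_2
  if (!alchDecouple) {
    d.dU_dlUp   -= U[2];              // g == 2 is subtracted from both terms
    d.dU_dlDown -= U[2];
  } else {
    d.dU_dlUp   -= U[2] + U[4];       // alchDecouple: dU/dl = U[0] - U[2] - U[4]
    d.dU_dlDown -= U[3] + U[4];       //               dU/dl = U[1] - U[3] - U[4]
  }
  return d;
}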

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2526 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02526                                    {
02527   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02528 
02529   ungridForcesCount = pmeComputes.size();
02530 
02531 #ifdef NAMD_CUDA
02532  if ( offload ) {
02533   //CmiLock(cuda_lock);
02534 
02535   if ( this == masterPmeMgr ) {
02536     double before = CmiWallTimer();
02537     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02538     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02539     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02540 
02541     const int myrank = CkMyRank();
02542     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02543       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02544         nodePmeMgr->mgrObjects[i]->ungridCalc();
02545       }
02546     }
02547     if ( ! pmeComputes.size() ) return;
02548   }
02549 
02550   if ( ! end_forces ) {
02551     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02552     end_forces = new cudaEvent_t[n];
02553     for ( int i=0; i<n; ++i ) {
02554       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02555     }
02556   }
02557 
02558   const int pcsz = pmeComputes.size();
02559   if ( ! afn_host ) {
02560     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02561     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02562     cuda_errcheck("malloc params for pme");
02563   }
02564   int totn = 0;
02565   for ( int i=0; i<pcsz; ++i ) {
02566     int n = pmeComputes[i]->numGridAtoms[0];
02567     totn += n;
02568   }
02569   if ( totn > f_data_mgr_alloc ) {
02570     if ( f_data_mgr_alloc ) {
02571       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02572       cudaFree(f_data_mgr_dev);
02573       cudaFreeHost(f_data_mgr_host);
02574     }
02575     f_data_mgr_alloc = 1.2 * (totn + 100);
02576     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02577     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02578     cuda_errcheck("malloc forces for pme");
02579   }
02580   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02581   float *f_dev = f_data_mgr_dev;
02582   float *f_host = f_data_mgr_host;
02583   for ( int i=0; i<pcsz; ++i ) {
02584     int n = pmeComputes[i]->numGridAtoms[0];
02585     pmeComputes[i]->f_data_dev = f_dev;
02586     pmeComputes[i]->f_data_host = f_host;
02587     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02588     afn_host[3*i+1] = f_dev;
02589     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02590     f_dev += 3*n;
02591     f_host += 3*n;
02592   }
02593   //CmiLock(cuda_lock);
02594   double before = CmiWallTimer();
02595   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02596   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02597   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02598   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02599 
02600   for ( int i=0; i<pcsz; ++i ) {
02601     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02602     if ( i%EVENT_STRIDE == 0 ) {
02603       int dimy = pcsz - i;
02604       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02605       int maxn = 0;
02606       int subtotn = 0;
02607       for ( int j=0; j<dimy; ++j ) {
02608         int n = pmeComputes[i+j]->numGridAtoms[0];
02609         subtotn += n;
02610         if ( n > maxn ) maxn = n;
02611       }
02612       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02613       before = CmiWallTimer();
02614       cuda_pme_forces(
02615         bspline_coeffs_dev,
02616         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02617         pmeComputes[i]->a_data_dev,
02618         pmeComputes[i]->f_data_dev,
02619         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02620         streams[stream]);
02621       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02622       before = CmiWallTimer();
02623       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02624         cudaMemcpyDeviceToHost, streams[stream]);
02625       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02626       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02627       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02628     }
02629     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02630   }
02631   //CmiUnlock(cuda_lock);
02632  } else
02633 #endif // NAMD_CUDA
02634  {
02635   for ( int i=0; i<pmeComputes.size(); ++i ) {
02636     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02637     // pmeComputes[i]->ungridForces();
02638   }
02639  }
02640   // submitReductions();  // must follow all ungridForces()
02641 
02642 #ifdef NAMD_CUDA
02643  if ( offload ) {
02644   forces_time = CmiWallTimer();
02645   forces_count = ungridForcesCount;
02646   forces_done_count = 0;
02647   pmeProxy[this_pe].pollForcesReady();
02648  }
02649 #endif
02650 
02651   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02652 }
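
The ungridCalc() listing above issues force kernels and device-to-host copies in groups of EVENT_STRIDE computes, recording one cudaEvent_t per group so that the host-side poller (pollForcesReady() / cuda_check_pme_forces()) can detect finished batches without synchronizing the whole stream. A minimal stand-alone sketch of that pattern follows; it is not NAMD code, the helper names and the EVENT_STRIDE value shown are assumptions, and error checking is omitted.

// Sketch only: batch work on one stream, record an event per batch,
// and let the host advance a cursor past every completed batch.
#include <cuda_runtime.h>
#include <vector>

static const int EVENT_STRIDE = 5;   // assumed batch size; NAMD defines its own

void launch_batches(int nWorkItems, cudaStream_t stream,
                    std::vector<cudaEvent_t> &events) {
  int nBatches = (nWorkItems - 1) / EVENT_STRIDE + 1;
  events.resize(nBatches);
  for (int b = 0; b < nBatches; ++b) {
    cudaEventCreateWithFlags(&events[b], cudaEventDisableTiming);
    // ... enqueue up to EVENT_STRIDE kernels / async copies on `stream` here ...
    cudaEventRecord(events[b], stream);   // marks completion of this batch
  }
}

// Host-side poll: returns the index one past the last completed batch.
int poll_batches(const std::vector<cudaEvent_t> &events, int done_so_far) {
  while (done_so_far < (int)events.size() &&
         cudaEventQuery(events[done_so_far]) == cudaSuccess) {
    ++done_so_far;   // this batch's forces are ready to be consumed
  }
  return done_so_far;
}

Recording one lightweight (timing-disabled) event per batch rather than per compute keeps the number of cudaEventQuery() calls small while still letting forces be consumed as soon as each batch lands in pinned host memory.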


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 346 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 347 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 408 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 407 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 433 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 419 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 421 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 412 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 411 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 431 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 413 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 430 of file ComputePme.C.

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 415 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 416 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 410 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 409 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 403 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().

int ComputePmeMgr::forces_count

Definition at line 417 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 418 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 420 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe

Definition at line 423 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 443 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 404 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 436 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 437 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 362 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 359 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 360 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 361 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file:
ComputePme.C