ComputePmeMgr Class Reference

Inherits ComputePmeUtil.

Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 355 of file ComputePme.C.
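
ComputePmeMgr is the per-PE Charm++ group that manages the reciprocal-space (PME) portion of the force computation. Charge grids deposited by the client ComputePme objects arrive through recvGrid() (slab decomposition) or the pencil path, flow through the forward FFT and transpose stages (gridCalc1, sendTrans, gridCalc2), the reciprocal-space sum (gridCalc2R), and the backward FFT (gridCalc3), and the resulting potential grid is shipped back for force interpolation (sendUngrid, recvUngrid, copyResults/copyPencils, ungridCalc, submitReductions). The CUDA-related members (a_data_dev, end_charges, cuda_submit_charges(), chargeGridSubmitted(), ...) support offloading charge spreading and force interpolation to the GPU when PMEOffload is enabled.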


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 709 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00709                              : pmeProxy(thisgroup), 
00710                                  pmeProxyDir(thisgroup) {
00711 
00712   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00713   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00714   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00715 
00716   pmeNodeProxy.ckLocalBranch()->initialize();
00717 
00718   if ( CmiMyRank() == 0 ) {
00719     fftw_plan_lock = CmiCreateLock();
00720   }
00721   pmemgr_lock = CmiCreateLock();
00722 
00723   myKSpace = 0;
00724   kgrid = 0;
00725   work = 0;
00726   grid_count = 0;
00727   trans_count = 0;
00728   untrans_count = 0;
00729   ungrid_count = 0;
00730   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00731   useBarrier = 0;
00732   sendTransBarrier_received = 0;
00733   usePencils = 0;
00734 
00735 #ifdef NAMD_CUDA
00736  // offload has not been set so this happens on every run
00737   if ( CmiMyRank() == 0 ) {
00738     cuda_lock = CmiCreateLock();
00739   }
00740 
00741 #if CUDA_VERSION >= 5050
00742   int leastPriority, greatestPriority;
00743   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00744   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00745   //if ( CkMyNode() == 0 ) {
00746   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00747   //}
00748 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00749 #else
00750 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00751 #endif
00752 
00753   stream = 0;
00754   for ( int i=0; i<NUM_STREAMS; ++i ) {
00755 #if 1
00756     CUDA_STREAM_CREATE(&streams[i]);
00757     cuda_errcheck("cudaStreamCreate");
00758 #else
00759   streams[i] = 0;  // XXXX Testing!!!
00760 #endif
00761   }
00762 
00763   this_pe = CkMyPe();
00764  
00765   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00766   end_forces = 0;
00767   check_charges_count = 0;
00768   check_forces_count = 0;
00769   chargeGridSubmittedCount = 0;
00770 
00771   cuda_atoms_count = 0;
00772   cuda_atoms_alloc = 0;
00773 
00774   f_data_mgr_alloc = 0;
00775   f_data_mgr_host = 0;
00776   f_data_mgr_dev = 0;
00777   afn_host = 0;
00778   afn_dev = 0;
00779 
00780 #define CUDA_EVENT_ID_PME_CHARGES 80
00781 #define CUDA_EVENT_ID_PME_FORCES 81
00782 #define CUDA_EVENT_ID_PME_TICK 82
00783 #define CUDA_EVENT_ID_PME_COPY 83
00784 #define CUDA_EVENT_ID_PME_KERNEL 84
00785   if ( 0 == CkMyPe() ) {
00786     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00787     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00788     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00789     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00790     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00791   }
00792 #endif
00793   recipEvirCount = 0;
00794   recipEvirClients = 0;
00795   recipEvirPe = -999;
00796 }
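
The CUDA_STREAM_CREATE macro defined above creates each stream at the highest available priority when the runtime supports stream priorities (CUDA 5.5 and later) and falls back to a default-priority stream otherwise; note that the "greatest" priority is the numerically lowest value returned by cudaDeviceGetStreamPriorityRange. A minimal stand-alone sketch of the same pattern, with a placeholder NUM_STREAMS and a plain error check standing in for cuda_errcheck():

  #include <cuda_runtime.h>
  #include <cstdio>
  #include <cstdlib>

  #define NUM_STREAMS 1   // placeholder; the real value comes from ComputePme.C

  static void check(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
      exit(1);
    }
  }

  int main() {
    cudaStream_t streams[NUM_STREAMS];
    int leastPriority, greatestPriority;
    // Numerically lower values mean higher priority; greatestPriority is the highest.
    check(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority),
          "cudaDeviceGetStreamPriorityRange");
    for (int i = 0; i < NUM_STREAMS; ++i) {
      check(cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault,
                                         greatestPriority),
            "cudaStreamCreateWithPriority");
    }
    for (int i = 0; i < NUM_STREAMS; ++i) cudaStreamDestroy(streams[i]);
    return 0;
  }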

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1791 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01791                               {
01792 
01793   if ( CmiMyRank() == 0 ) {
01794     CmiDestroyLock(fftw_plan_lock);
01795   }
01796   CmiDestroyLock(pmemgr_lock);
01797 
01798   delete myKSpace;
01799   delete [] localInfo;
01800   delete [] gridNodeInfo;
01801   delete [] transNodeInfo;
01802   delete [] gridPeMap;
01803   delete [] transPeMap;
01804   delete [] recipPeDest;
01805   delete [] gridPeOrder;
01806   delete [] gridNodeOrder;
01807   delete [] transNodeOrder;
01808   delete [] qgrid;
01809   if ( kgrid != qgrid ) delete [] kgrid;
01810   delete [] work;
01811   delete [] gridmsg_reuse;
01812 
01813  if ( ! offload ) {
01814   for (int i=0; i<q_count; ++i) {
01815     delete [] q_list[i];
01816   }
01817   delete [] q_list;
01818   delete [] fz_arr;
01819  }
01820   delete [] f_arr;
01821   delete [] q_arr;
01822 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1785 of file ComputePme.C.

01785                                                  {
01786   if ( ! usePencils ) return;
01787   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01788 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3002 of file ComputePme.C.

03002                                        {
03003   ++recipEvirClients;
03004 }

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice, int  sequence )

Definition at line 3519 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03519                                                                   {
03520 
03521 #ifdef NAMD_CUDA
03522  if ( offload ) {
03523   int errcount = 0;
03524   int q_stride = myGrid.K3+myGrid.order-1;
03525   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03526     f_arr[j] = ffz_host[j];
03527     if ( ffz_host[j] & ~1 ) ++errcount;
03528   }
03529   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03530  }
03531 #endif
03532   recipEvirCount = recipEvirClients;
03533   ungridForcesCount = pmeComputes.size();
03534 
03535   for (int j=0; j<myGrid.order-1; ++j) {
03536     fz_arr[j] |= fz_arr[myGrid.K3+j];
03537   }
03538 
03539   if ( usePencils ) {
03540     sendPencils(lattice,sequence);
03541   } else {
03542     sendData(lattice,sequence);
03543   }
03544 }
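
On the CUDA path the loop over ffz_host checks that every flag copied back from the device is 0 or 1 (any other bit pattern triggers the NAMD_bug call), and the loop over j folds the order-1 wrap-around z planes back onto the first order-1 planes (fz_arr[j] |= fz_arr[K3+j]), since B-spline spreading near the periodic boundary writes past K3 into the padding region. A small self-contained sketch of that periodic fold, with illustrative sizes:

  #include <vector>

  int main() {
    const int K3 = 16, order = 4;              // illustrative sizes only
    std::vector<char> fz(K3 + order - 1, 0);   // K3 planes plus order-1 wrap planes
    fz[K3 + 1] = 1;                            // a spread charge touched wrap plane 1
    // Fold the wrap-around flags back into the first order-1 real planes.
    for (int j = 0; j < order - 1; ++j) fz[j] |= fz[K3 + j];
    return fz[1] == 1 ? 0 : 1;                 // plane 1 is now marked
  }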

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice, int  sequence )

Definition at line 3455 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03455                                                                       {
03456   saved_lattice = &lattice;
03457   saved_sequence = sequence;
03458 
03459   // cudaDeviceSynchronize();  //  XXXX TESTING
03460   //int q_stride = myGrid.K3+myGrid.order-1;
03461   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03462   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03463   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03464   //  }
03465   //}
03466   //CmiLock(cuda_lock);
03467 
03468  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03469   double before = CmiWallTimer();
03470   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03471   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03472   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03473                         cudaMemcpyDeviceToHost, streams[stream]);
03474   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03475   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03476   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03477   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03478   //CmiUnlock(cuda_lock);
03479   // cudaDeviceSynchronize();  //  XXXX TESTING
03480   // cuda_errcheck("after memcpy grid to host");
03481 
03482   SimParameters *simParams = Node::Object()->simParameters;
03483   if ( ! simParams->useCUDA2 ) {
03484     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03485     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03486   }
03487 
03488   pmeProxy[master_pe].pollChargeGridReady();
03489  }
03490 }
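
Every PE on the node calls chargeGridSubmitted(); only the call that drops the node-shared chargeGridSubmittedCount to zero records end_all_pme_kernels, queues the single device-to-host copy of the charge grid, and re-zeros the device grid for the next step. A stand-alone sketch of that countdown idiom, using std::atomic in place of the lock-protected counter:

  #include <atomic>
  #include <cstdio>

  std::atomic<int> submittedCount{4};   // hypothetical: four PEs share this node

  void chargeGridSubmitted(int pe) {
    if (--submittedCount == 0) {
      // Last submitter on the node: this is where the real code records
      // end_all_pme_kernels, issues one cudaMemcpyAsync of the whole grid
      // back to the host, and memsets the device grid for the next timestep.
      printf("pe %d performs the single copy-back for the node\n", pe);
    }
  }

  int main() {
    for (int pe = 0; pe < 4; ++pe) chargeGridSubmitted(pe);
    return 0;
  }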

void ComputePmeMgr::copyPencils ( PmeGridMsg *  msg ) 

Definition at line 3765 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03765                                                {
03766 
03767   int K1 = myGrid.K1;
03768   int K2 = myGrid.K2;
03769   int dim2 = myGrid.dim2;
03770   int dim3 = myGrid.dim3;
03771   int block1 = myGrid.block1;
03772   int block2 = myGrid.block2;
03773 
03774   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03775   int ib = msg->sourceNode / yBlocks;
03776   int jb = msg->sourceNode % yBlocks;
03777 
03778   int ibegin = ib*block1;
03779   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03780   int jbegin = jb*block2;
03781   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03782 
03783   int zlistlen = msg->zlistlen;
03784   int *zlist = msg->zlist;
03785   float *qmsg = msg->qgrid;
03786   int g;
03787   for ( g=0; g<numGrids; ++g ) {
03788     char *f = f_arr + g*fsize;
03789     float **q = q_arr + g*fsize;
03790     for ( int i=ibegin; i<iend; ++i ) {
03791      for ( int j=jbegin; j<jend; ++j ) {
03792       if( f[i*dim2+j] ) {
03793         f[i*dim2+j] = 0;
03794         for ( int k=0; k<zlistlen; ++k ) {
03795           q[i*dim2+j][zlist[k]] = *(qmsg++);
03796         }
03797         for (int h=0; h<myGrid.order-1; ++h) {
03798           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03799         }
03800       }
03801      }
03802     }
03803   }
03804 }
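
The sending pencil encodes its coordinates as sourceNode = x*yBlocks + y (see the comment in the listing), so the receiver recovers them with a divide and a modulo and clamps the last block to the grid size. A minimal sketch of that index arithmetic, with made-up sizes:

  #include <algorithm>
  #include <cstdio>

  int main() {
    // Illustrative values; the real ones come from PmeGrid and the pencil layout.
    const int K1 = 54, K2 = 54, yBlocks = 4;
    const int block1 = 14, block2 = 14;        // ceil(K/blocks)-style block sizes
    int sourceNode = 3 * yBlocks + 2;          // sender at pencil (x=3, y=2)

    int ib = sourceNode / yBlocks;             // 3
    int jb = sourceNode % yBlocks;             // 2
    int ibegin = ib * block1, iend = std::min(ibegin + block1, K1);
    int jbegin = jb * block2, jend = std::min(jbegin + block2, K2);
    printf("i in [%d,%d), j in [%d,%d)\n", ibegin, iend, jbegin, jend);
    return 0;
  }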

void ComputePmeMgr::copyResults ( PmeGridMsg *  msg ) 

Definition at line 3957 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03957                                                {
03958 
03959   int zdim = myGrid.dim3;
03960   int flen = msg->len;
03961   int fstart = msg->start;
03962   int zlistlen = msg->zlistlen;
03963   int *zlist = msg->zlist;
03964   float *qmsg = msg->qgrid;
03965   int g;
03966   for ( g=0; g<numGrids; ++g ) {
03967     char *f = msg->fgrid + g*flen;
03968     float **q = q_arr + fstart + g*fsize;
03969     for ( int i=0; i<flen; ++i ) {
03970       if ( f[i] ) {
03971         f[i] = 0;
03972         for ( int k=0; k<zlistlen; ++k ) {
03973           q[i][zlist[k]] = *(qmsg++);
03974         }
03975         for (int h=0; h<myGrid.order-1; ++h) {
03976           q[i][myGrid.K3+h] = q[i][h];
03977         }
03978       }
03979     }
03980   }
03981 }
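
Only grid points whose flag f[i] is set carry data, and for each such (x,y) column the message stores just the z planes listed in zlist. After scattering those values, the first order-1 z entries are duplicated past K3 so that interpolation can read across the periodic boundary without wrapping. A small self-contained sketch of that scatter for one column, with illustrative sizes:

  #include <vector>

  int main() {
    const int K3 = 8, order = 4;                             // illustrative sizes
    std::vector<float> column(K3 + order - 1, 0.0f);         // one (x,y) column plus wrap pad
    int   zlist[] = {0, 3, 7};                               // z planes present in the message
    float qmsg[]  = {1.0f, 2.0f, 3.0f};                      // their values, in zlist order

    for (int k = 0; k < 3; ++k) column[zlist[k]] = qmsg[k];  // scatter the compressed values
    for (int h = 0; h < order - 1; ++h) column[K3 + h] = column[h];   // periodic padding
    return column[K3] == column[0] ? 0 : 1;
  }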

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice, int  sequence )

Definition at line 3402 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03402                                                                       {
03403 
03404     int n = cuda_atoms_count;
03405     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03406     cuda_atoms_count = 0;
03407 
03408     const double before = CmiWallTimer();
03409     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03410                           cudaMemcpyHostToDevice, streams[stream]);
03411     const double after = CmiWallTimer();
03412 
03413     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03414 
03415     cuda_pme_charges(
03416       bspline_coeffs_dev,
03417       q_arr_dev, ffz_dev, ffz_dev + fsize,
03418       a_data_dev, n,
03419       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03420       streams[stream]);
03421     const double after2 = CmiWallTimer();
03422 
03423     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03424 
03425     masterPmeMgr->charges_time = before;
03426     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03427     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03428 }
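
Before launching the spreading kernel, the code makes its stream wait on end_charge_memset, the event recorded after the previous step zeroed the device grid, so the asynchronous memset, the host-to-device copy of the atom data, and the kernel are ordered without any host-side synchronization. A stand-alone sketch of that event-ordering idiom (buffer name and size are illustrative, and the kernel launch is left as a comment):

  #include <cuda_runtime.h>

  int main() {
    const size_t bytes = 1 << 20;               // illustrative buffer size
    float *dev = 0;  cudaMalloc(&dev, bytes);
    cudaStream_t s;  cudaStreamCreate(&s);
    cudaEvent_t memsetDone;
    cudaEventCreateWithFlags(&memsetDone, cudaEventDisableTiming);

    cudaMemsetAsync(dev, 0, bytes, s);          // "previous step" clears the grid
    cudaEventRecord(memsetDone, s);             // mark the point where the clear finishes

    // Later work (possibly enqueued from another stream) waits on the event
    // instead of blocking the host; only then are the copy and kernel enqueued.
    cudaStreamWaitEvent(s, memsetDone, 0);
    // cudaMemcpyAsync(...); kernel<<<grid,block,0,s>>>(...);   // as in cuda_submit_charges

    cudaStreamSynchronize(s);
    cudaEventDestroy(memsetDone);
    cudaStreamDestroy(s);
    cudaFree(dev);
    return 0;
  }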

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg *  msg ) 

Definition at line 2011 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02011                                                    {
02012   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02013   int pe = transNodeInfo[myTransNode].pe_start;
02014   int npe = transNodeInfo[myTransNode].npe;
02015   CmiNodeLock lock = CmiCreateLock();
02016   int *count = new int; *count = npe;
02017   for (int i=0; i<npe; ++i, ++pe) {
02018     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02019     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02020     shmsg->msg = msg;
02021     shmsg->count = count;
02022     shmsg->lock = lock;
02023     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02024   }
02025 }
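
fwdSharedTrans hands the same PmeTransMsg to every PE of the node, together with a freshly allocated counter and lock; each receiver processes the shared message, and the last one to decrement the counter is responsible for freeing the shared state. A minimal sketch of that reference-count idiom, using std::mutex in place of CmiNodeLock (the receiving side, recvSharedTrans, is not listed here, so treat this as an illustration of the pattern rather than its exact code):

  #include <mutex>

  struct SharedMsg { int payload; };

  struct SharedHandle {        // what each PE receives, cf. PmeSharedTransMsg
    SharedMsg *msg;
    int *count;
    std::mutex *lock;
  };

  void receive(SharedHandle h) {
    // ... use h.msg->payload (procTrans in the real code) ...
    h.lock->lock();
    bool last = (--(*h.count) == 0);
    h.lock->unlock();
    if (last) {                // the final receiver cleans up the shared state
      delete h.msg;
      delete h.count;
      delete h.lock;
    }
  }

  int main() {
    const int npe = 3;
    SharedHandle h{new SharedMsg{42}, new int(npe), new std::mutex};
    for (int i = 0; i < npe; ++i) receive(h);
    return 0;
  }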

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg *  msg ) 

Definition at line 2267 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02267                                                        {
02268   int pe = gridNodeInfo[myGridNode].pe_start;
02269   int npe = gridNodeInfo[myGridNode].npe;
02270   CmiNodeLock lock = CmiCreateLock();
02271   int *count = new int; *count = npe;
02272   for (int i=0; i<npe; ++i, ++pe) {
02273     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02274     shmsg->msg = msg;
02275     shmsg->count = count;
02276     shmsg->lock = lock;
02277     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02278   }
02279 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1903 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

Referenced by recvGrid().

01903                                   {
01904   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01905 
01906 #ifdef NAMD_FFTW
01907   for ( int g=0; g<numGrids; ++g ) {
01908 #ifdef NAMD_FFTW_3
01909     fftwf_execute(forward_plan_yz[g]);
01910 #else
01911     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01912         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01913 #endif
01914 
01915   }
01916 #endif
01917 
01918   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01919 }
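
gridCalc1 runs the y-z part of the forward real-to-complex FFT in place on each local x slab, which is why the grid is allocated with dim3 = 2*(K3/2+1) real slots per z line (see initialize() below): the K3/2+1 complex outputs overwrite the padded real input. A tiny self-contained FFTW3 (single-precision) illustration of that in-place layout on a single z line:

  #include <fftw3.h>
  #include <vector>

  int main() {
    const int K3 = 10, dim3 = 2 * (K3 / 2 + 1);   // 12 real slots hold 6 complex outputs
    std::vector<float> line(dim3, 0.0f);
    for (int k = 0; k < K3; ++k) line[k] = 1.0f;  // only the first K3 slots are real data

    fftwf_plan p = fftwf_plan_dft_r2c_1d(
        K3, line.data(), reinterpret_cast<fftwf_complex *>(line.data()),
        FFTW_ESTIMATE);
    fftwf_execute(p);                             // line now holds K3/2+1 complex values
    fftwf_destroy_plan(p);
    return 0;
  }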

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2079 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02079                                   {
02080   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02081 
02082 #if CMK_BLUEGENEL
02083   CmiNetworkProgressAfter (0);
02084 #endif
02085 
02086   int zdim = myGrid.dim3;
02087   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02088   int ny = localInfo[myTransPe].ny_after_transpose;
02089 
02090   for ( int g=0; g<numGrids; ++g ) {
02091     // finish forward FFT (x dimension)
02092 #ifdef NAMD_FFTW
02093 #ifdef NAMD_FFTW_3
02094     fftwf_execute(forward_plan_x[g]);
02095 #else
02096     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02097         ny * zdim / 2, 1, work, 1, 0);
02098 #endif
02099 #endif
02100   }
02101 
02102 #ifdef OPENATOM_VERSION
02103     if ( ! simParams -> openatomOn ) { 
02104 #endif // OPENATOM_VERSION
02105       gridCalc2R();
02106 #ifdef OPENATOM_VERSION
02107     } else {
02108       gridCalc2Moa();
02109     }
02110 #endif // OPENATOM_VERSION
02111 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2139 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02139                                    {
02140 
02141   int useCkLoop = 0;
02142 #if CMK_SMP && USE_CKLOOP
02143   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02144        && CkNumPes() >= 2 * numTransPes ) {
02145     useCkLoop = 1;
02146   }
02147 #endif
02148 
02149   int zdim = myGrid.dim3;
02150   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02151   int ny = localInfo[myTransPe].ny_after_transpose;
02152 
02153   for ( int g=0; g<numGrids; ++g ) {
02154     // reciprocal space portion of PME
02155     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02156     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02157                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02158     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02159 
02160     // start backward FFT (x dimension)
02161 
02162 #ifdef NAMD_FFTW
02163 #ifdef NAMD_FFTW_3
02164     fftwf_execute(backward_plan_x[g]);
02165 #else
02166     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02167         ny * zdim / 2, 1, work, 1, 0);
02168 #endif
02169 #endif
02170   }
02171   
02172   pmeProxyDir[CkMyPe()].sendUntrans();
02173 }
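
compute_energy evaluates the reciprocal-space contribution on the fully transformed grid and, in the usual PME implementation, multiplies the grid in place by the influence function so that the backward FFT started immediately afterwards yields the convolved potential. Assuming it follows the standard smooth-PME formulation (Essmann et al., 1995), the energy accumulated into recip_evir2[g][0] is

  E_rec = \frac{1}{2\pi V} \sum_{\mathbf{m} \neq 0} \frac{\exp(-\pi^2 m^2 / \beta^2)}{m^2} \, B(m_1,m_2,m_3) \, \lvert F(Q)(m_1,m_2,m_3) \rvert^2

where beta is the Ewald coefficient (ewaldcof above), V the unit-cell volume, B the B-spline Euler exponential factor, and F(Q) the discrete Fourier transform of the charge grid; the remaining entries of recip_evir2[g] receive the virial.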

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2341 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02341                                   {
02342   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02343 
02344   // finish backward FFT
02345 #ifdef NAMD_FFTW
02346   for ( int g=0; g<numGrids; ++g ) {
02347 #ifdef NAMD_FFTW_3
02348     fftwf_execute(backward_plan_yz[g]);
02349 #else
02350     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02351         (fftw_complex *) (qgrid + qgrid_size * g),
02352         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02353 #endif
02354   }
02355 
02356 #endif
02357 
02358   pmeProxyDir[CkMyPe()].sendUngrid();
02359 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 861 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00861                                            {
00862   delete msg;
00863 
00864   localInfo = new LocalPmeInfo[CkNumPes()];
00865   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00866   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   gridPeMap = new int[CkNumPes()];
00868   transPeMap = new int[CkNumPes()];
00869   recipPeDest = new int[CkNumPes()];
00870   gridPeOrder = new int[CkNumPes()];
00871   gridNodeOrder = new int[CkNumNodes()];
00872   transNodeOrder = new int[CkNumNodes()];
00873 
00874   if (CkMyRank() == 0) {
00875     pencilPMEProcessors = new char [CkNumPes()];
00876     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00877   }
00878 
00879   SimParameters *simParams = Node::Object()->simParameters;
00880   PatchMap *patchMap = PatchMap::Object();
00881 
00882   offload = simParams->PMEOffload;
00883 #ifdef NAMD_CUDA
00884   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00885     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00886   }
00887   if ( offload ) {
00888     int dev;
00889     cudaGetDevice(&dev);
00890     cuda_errcheck("in cudaGetDevice");
00891     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00892     cudaDeviceProp deviceProp;
00893     cudaGetDeviceProperties(&deviceProp, dev);
00894     cuda_errcheck("in cudaGetDeviceProperties");
00895     if ( deviceProp.major < 2 )
00896       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00897   }
00898 #endif
00899 
00900   alchLambda = -1.;  // illegal value to catch if not updated
00901   useBarrier = simParams->PMEBarrier;
00902 
00903   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00904   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00905   else {
00906     int nrps = simParams->PMEProcessors;
00907     if ( nrps <= 0 ) nrps = CkNumPes();
00908     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00909     int dimx = simParams->PMEGridSizeX;
00910     int dimy = simParams->PMEGridSizeY;
00911     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00912     if ( maxslabs > nrps ) maxslabs = nrps;
00913     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00914                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00915     if ( maxpencils > nrps ) maxpencils = nrps;
00916     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00917     else usePencils = 0;
00918   }
00919 
00920   if ( usePencils ) {
00921     int nrps = simParams->PMEProcessors;
00922     if ( nrps <= 0 ) nrps = CkNumPes();
00923     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00924     if ( simParams->PMEPencils > 1 &&
00925          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00926       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00927     } else {
00928       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00929                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00930       if ( nb2 > nrps ) nb2 = nrps;
00931       if ( nb2 < 1 ) nb2 = 1;
00932       int nb = (int) sqrt((float)nb2);
00933       if ( nb < 1 ) nb = 1;
00934       xBlocks = zBlocks = nb;
00935       yBlocks = nb2 / nb;
00936     }
00937 
00938     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00939     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00940     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00941 
00942     int dimx = simParams->PMEGridSizeX;
00943     int bx = 1 + ( dimx - 1 ) / xBlocks;
00944     xBlocks = 1 + ( dimx - 1 ) / bx;
00945 
00946     int dimy = simParams->PMEGridSizeY;
00947     int by = 1 + ( dimy - 1 ) / yBlocks;
00948     yBlocks = 1 + ( dimy - 1 ) / by;
00949 
00950     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00951     int bz = 1 + ( dimz - 1 ) / zBlocks;
00952     zBlocks = 1 + ( dimz - 1 ) / bz;
00953 
00954     if ( xBlocks * yBlocks > CkNumPes() ) {
00955       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00956     }
00957     if ( xBlocks * zBlocks > CkNumPes() ) {
00958       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00959     }
00960     if ( yBlocks * zBlocks > CkNumPes() ) {
00961       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00962     }
00963 
00964     if ( ! CkMyPe() ) {
00965       iout << iINFO << "PME using " << xBlocks << " x " <<
00966         yBlocks << " x " << zBlocks <<
00967         " pencil grid for FFT and reciprocal sum.\n" << endi;
00968     }
00969   } else { // usePencils
00970 
00971   {  // decide how many pes to use for reciprocal sum
00972 
00973     // rules based on work available
00974     int minslices = simParams->PMEMinSlices;
00975     int dimx = simParams->PMEGridSizeX;
00976     int nrpx = ( dimx + minslices - 1 ) / minslices;
00977     int dimy = simParams->PMEGridSizeY;
00978     int nrpy = ( dimy + minslices - 1 ) / minslices;
00979 
00980     // rules based on processors available
00981     int nrpp = CkNumPes();
00982     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00983     if ( nrpp < nrpx ) nrpx = nrpp;
00984     if ( nrpp < nrpy ) nrpy = nrpp;
00985 
00986     // user override
00987     int nrps = simParams->PMEProcessors;
00988     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00989     if ( nrps > 0 ) nrpx = nrps;
00990     if ( nrps > 0 ) nrpy = nrps;
00991 
00992     // make sure there aren't any totally empty processors
00993     int bx = ( dimx + nrpx - 1 ) / nrpx;
00994     nrpx = ( dimx + bx - 1 ) / bx;
00995     int by = ( dimy + nrpy - 1 ) / nrpy;
00996     nrpy = ( dimy + by - 1 ) / by;
00997     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
00998       NAMD_bug("Error in selecting number of PME processors.");
00999     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001 
01002     numGridPes = nrpx;
01003     numTransPes = nrpy;
01004   }
01005   if ( ! CkMyPe() ) {
01006     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01007       " processors for FFT and reciprocal sum.\n" << endi;
01008   }
01009 
01010   int sum_npes = numTransPes + numGridPes;
01011   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01012 
01013 #if 0 // USE_TOPOMAP
01014   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01015   PatchMap * pmap = PatchMap::Object();
01016   
01017   int patch_pes = pmap->numNodesWithPatches();
01018   TopoManager tmgr;
01019   if(tmgr.hasMultipleProcsPerNode())
01020     patch_pes *= 2;
01021 
01022   bool done = false;
01023   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01024     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01025     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01026   }
01027   else 
01028     if(CkNumPes() > 2 *max_npes + patch_pes) {
01029       done = generateBGLORBPmePeList(transPeMap, max_npes);
01030       gridPeMap = transPeMap;
01031     }
01032 
01033   if (!done)
01034 #endif
01035     {
01036       //generatePmePeList(transPeMap, max_npes);
01037       //gridPeMap = transPeMap;
01038       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01039     }
01040   
01041   if ( ! CkMyPe() ) {
01042     iout << iINFO << "PME GRID LOCATIONS:";
01043     int i;
01044     for ( i=0; i<numGridPes && i<10; ++i ) {
01045       iout << " " << gridPeMap[i];
01046     }
01047     if ( i < numGridPes ) iout << " ...";
01048     iout << "\n" << endi;
01049     iout << iINFO << "PME TRANS LOCATIONS:";
01050     for ( i=0; i<numTransPes && i<10; ++i ) {
01051       iout << " " << transPeMap[i];
01052     }
01053     if ( i < numTransPes ) iout << " ...";
01054     iout << "\n" << endi;
01055   }
01056 
01057   // sort based on nodes and physical nodes
01058   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01059 
01060   myGridPe = -1;
01061   myGridNode = -1;
01062   int i = 0;
01063   int node = -1;
01064   int real_node = -1;
01065   for ( i=0; i<numGridPes; ++i ) {
01066     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01067     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01068     int real_node_i = CkNodeOf(gridPeMap[i]);
01069     if ( real_node_i == real_node ) {
01070       gridNodeInfo[node].npe += 1;
01071     } else {
01072       real_node = real_node_i;
01073       ++node;
01074       gridNodeInfo[node].real_node = real_node;
01075       gridNodeInfo[node].pe_start = i;
01076       gridNodeInfo[node].npe = 1;
01077     }
01078     if ( CkMyNode() == real_node_i ) myGridNode = node;
01079   }
01080   numGridNodes = node + 1;
01081   myTransPe = -1;
01082   myTransNode = -1;
01083   node = -1;
01084   real_node = -1;
01085   for ( i=0; i<numTransPes; ++i ) {
01086     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01087     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01088     int real_node_i = CkNodeOf(transPeMap[i]);
01089     if ( real_node_i == real_node ) {
01090       transNodeInfo[node].npe += 1;
01091     } else {
01092       real_node = real_node_i;
01093       ++node;
01094       transNodeInfo[node].real_node = real_node;
01095       transNodeInfo[node].pe_start = i;
01096       transNodeInfo[node].npe = 1;
01097     }
01098     if ( CkMyNode() == real_node_i ) myTransNode = node;
01099   }
01100   numTransNodes = node + 1;
01101 
01102   if ( ! CkMyPe() ) {
01103     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01104          << numTransNodes << " TRANS NODES\n" << endi;
01105   }
01106 
01107   { // generate random orderings for grid and trans messages
01108     int i;
01109     for ( i = 0; i < numGridPes; ++i ) {
01110       gridPeOrder[i] = i;
01111     }
01112     Random rand(CkMyPe());
01113     if ( myGridPe < 0 ) {
01114       rand.reorder(gridPeOrder,numGridPes);
01115     } else {  // self last
01116       gridPeOrder[myGridPe] = numGridPes-1;
01117       gridPeOrder[numGridPes-1] = myGridPe;
01118       rand.reorder(gridPeOrder,numGridPes-1);
01119     } 
01120     for ( i = 0; i < numGridNodes; ++i ) {
01121       gridNodeOrder[i] = i;
01122     }
01123     if ( myGridNode < 0 ) {
01124       rand.reorder(gridNodeOrder,numGridNodes);
01125     } else {  // self last
01126       gridNodeOrder[myGridNode] = numGridNodes-1;
01127       gridNodeOrder[numGridNodes-1] = myGridNode;
01128       rand.reorder(gridNodeOrder,numGridNodes-1);
01129     }
01130     for ( i = 0; i < numTransNodes; ++i ) {
01131       transNodeOrder[i] = i;
01132     }
01133     if ( myTransNode < 0 ) {
01134       rand.reorder(transNodeOrder,numTransNodes);
01135     } else {  // self last
01136       transNodeOrder[myTransNode] = numTransNodes-1;
01137       transNodeOrder[numTransNodes-1] = myTransNode;
01138       rand.reorder(transNodeOrder,numTransNodes-1);
01139     }
01140   }
01141   
01142   } // ! usePencils
01143 
01144   myGrid.K1 = simParams->PMEGridSizeX;
01145   myGrid.K2 = simParams->PMEGridSizeY;
01146   myGrid.K3 = simParams->PMEGridSizeZ;
01147   myGrid.order = simParams->PMEInterpOrder;
01148   myGrid.dim2 = myGrid.K2;
01149   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01150 
01151   if ( ! usePencils ) {
01152     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01153     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01154     myGrid.block3 = myGrid.dim3 / 2;  // complex
01155   }
01156 
01157   if ( usePencils ) {
01158     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01159     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01160     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01161 
01162 
01163       int pe = 0;
01164       int x,y,z;
01165 
01166                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01167                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01168                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01169       
01170                 // decide which pes to use by bit reversal and patch use
01171                 int i;
01172                 int ncpus = CkNumPes();
01173                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01174                 PatchMap *pmap = PatchMap::Object();
01175                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01176                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01177                         if ( ri ) { // keep 0 for special case
01178                                 if ( pmap->numPatchesOnNode(ri) ) patches.add(ri);
01179                                 else nopatches.add(ri);
01180                         }
01181                 }
01182 
01183 #if USE_RANDOM_TOPO
01184             Random rand(CkMyPe());
01185             int *tmp = new int[patches.size()];
01186             int nn = patches.size();
01187             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01188             rand.reorder(tmp, nn);
01189             patches.resize(0);
01190             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01191             delete [] tmp;
01192             tmp = new int[nopatches.size()];
01193             nn = nopatches.size();
01194             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01195             rand.reorder(tmp, nn);
01196             nopatches.resize(0);
01197             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01198             delete [] tmp;
01199 #endif
01200 
01201                 // only use zero if it eliminates overloading or has patches
01202                 int useZero = 0;
01203                 int npens = xBlocks*yBlocks;
01204                 if ( npens % ncpus == 0 ) useZero = 1;
01205                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01206                 npens += xBlocks*zBlocks;
01207                 if ( npens % ncpus == 0 ) useZero = 1;
01208                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01209                 npens += yBlocks*zBlocks;
01210                 if ( npens % ncpus == 0 ) useZero = 1;
01211                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01212 
01213                 // add nopatches then patches in reversed order
01214                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01215                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01216                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01217                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01218   
01219                 int npes = pmeprocs.size();
01220                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01221                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01222 #if !USE_RANDOM_TOPO
01223                 zprocs.sort();
01224 #endif
01225                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01226                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01227 #if !USE_RANDOM_TOPO
01228                 yprocs.sort();
01229 #endif
01230       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01231       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01232 #if !USE_RANDOM_TOPO
01233       xprocs.sort();
01234 #endif
01235 
01236 #if USE_TOPO_SFC
01237   CmiLock(tmgr_lock);
01238   //{
01239   TopoManager tmgr;
01240   int xdim = tmgr.getDimNX();
01241   int ydim = tmgr.getDimNY();
01242   int zdim = tmgr.getDimNZ();
01243   int xdim1 = find_level_grid(xdim);
01244   int ydim1 = find_level_grid(ydim);
01245   int zdim1 = find_level_grid(zdim);
01246   if(CkMyPe() == 0)
01247       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01248 
01249   vector<Coord> result;
01250   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01251   sort_sfc(xprocs, tmgr, result);
01252   sort_sfc(yprocs, tmgr, result);
01253   sort_sfc(zprocs, tmgr, result);
01254   //}
01255   CmiUnlock(tmgr_lock);
01256 #endif
01257 
01258 
01259                 if(CkMyPe() == 0){  
01260               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01261           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01262 #if USE_TOPO_SFC
01263               int x,y,z,t;
01264               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01265               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01266 #else
01267               iout << " " << zprocs[i];
01268 #endif
01269           }
01270           if ( i < zprocs.size() ) iout << " ...";
01271               iout << "\n" << endi;
01272                 }
01273 
01274     if (CkMyRank() == 0) {
01275       for (pe=0, x = 0; x < xBlocks; ++x)
01276         for (y = 0; y < yBlocks; ++y, ++pe ) {
01277           pencilPMEProcessors[zprocs[pe]] = 1;
01278         }
01279     }
01280      
01281                 if(CkMyPe() == 0){  
01282               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01283           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01284 #if USE_TOPO_SFC
01285               int x,y,z,t;
01286               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01287               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01288 #else
01289               iout << " " << yprocs[i];
01290 #endif
01291           }
01292           if ( i < yprocs.size() ) iout << " ...";
01293               iout << "\n" << endi;
01294                 }
01295 
01296     if (CkMyRank() == 0) {
01297       for (pe=0, z = 0; z < zBlocks; ++z )
01298         for (x = 0; x < xBlocks; ++x, ++pe ) {
01299           pencilPMEProcessors[yprocs[pe]] = 1;
01300         }
01301     }
01302     
01303                 if(CkMyPe() == 0){  
01304                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01305                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01306 #if USE_TOPO_SFC
01307                 int x,y,z,t;
01308                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01309                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01310 #else
01311                 iout << " " << xprocs[i];
01312 #endif
01313             }
01314                 if ( i < xprocs.size() ) iout << " ...";
01315                 iout << "\n" << endi;
01316                 }
01317 
01318     if (CkMyRank() == 0) {
01319       for (pe=0, y = 0; y < yBlocks; ++y )      
01320         for (z = 0; z < zBlocks; ++z, ++pe ) {
01321           pencilPMEProcessors[xprocs[pe]] = 1;
01322         }
01323     }
01324         
01325 
01326         // creating the pencil arrays
01327         if ( CkMyPe() == 0 ){
01328 #if !USE_RANDOM_TOPO
01329         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01330         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01331         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01332         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01333 #endif
01334 #if 1
01335         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01336         CProxy_PmePencilMap ym;
01337         if ( simParams->PMEPencilsYLayout )
01338           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01339         else
01340           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01341         CProxy_PmePencilMap xm;
01342         if ( simParams->PMEPencilsXLayout )
01343           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01344         else
01345           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01346         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01347         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01348         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01349         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01350         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01351         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01352         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01353         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01354         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01355         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01356 #else
01357         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01358         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01359         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01360 
01361                 for (pe=0, x = 0; x < xBlocks; ++x)
01362                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01363                                 zPencil(x,y,0).insert(zprocs[pe]);
01364                         }
01365         zPencil.doneInserting();
01366 
01367                 for (pe=0, x = 0; x < xBlocks; ++x)
01368                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01369                                 yPencil(x,0,z).insert(yprocs[pe]);
01370                         }
01371         yPencil.doneInserting();
01372 
01373 
01374                 for (pe=0, y = 0; y < yBlocks; ++y )    
01375                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01376                                 xPencil(0,y,z).insert(xprocs[pe]);
01377                         }
01378                 xPencil.doneInserting();     
01379 #endif
01380 
01381                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01382                 PmePencilInitMsgData msgdata;
01383                 msgdata.grid = myGrid;
01384                 msgdata.xBlocks = xBlocks;
01385                 msgdata.yBlocks = yBlocks;
01386                 msgdata.zBlocks = zBlocks;
01387                 msgdata.xPencil = xPencil;
01388                 msgdata.yPencil = yPencil;
01389                 msgdata.zPencil = zPencil;
01390                 msgdata.pmeProxy = pmeProxyDir;
01391         msgdata.pmeNodeProxy = pmeNodeProxy;
01392         msgdata.xm = xm;
01393         msgdata.ym = ym;
01394         msgdata.zm = zm;
01395                 xPencil.init(new PmePencilInitMsg(msgdata));
01396                 yPencil.init(new PmePencilInitMsg(msgdata));
01397                 zPencil.init(new PmePencilInitMsg(msgdata));
01398         }
01399 
01400     return;  // continue in initialize_pencils() at next startup stage
01401   }
01402 
01403 
01404   int pe;
01405   int nx = 0;
01406   for ( pe = 0; pe < numGridPes; ++pe ) {
01407     localInfo[pe].x_start = nx;
01408     nx += myGrid.block1;
01409     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01410     localInfo[pe].nx = nx - localInfo[pe].x_start;
01411   }
01412   int ny = 0;
01413   for ( pe = 0; pe < numTransPes; ++pe ) {
01414     localInfo[pe].y_start_after_transpose = ny;
01415     ny += myGrid.block2;
01416     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01417     localInfo[pe].ny_after_transpose =
01418                         ny - localInfo[pe].y_start_after_transpose;
01419   }
01420 
01421   {  // decide how many pes this node exchanges charges with
01422 
01423   PatchMap *patchMap = PatchMap::Object();
01424   Lattice lattice = simParams->lattice;
01425   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01426   BigReal cutoff = simParams->cutoff;
01427   BigReal patchdim = simParams->patchDimension;
01428   int numPatches = patchMap->numPatches();
01429   int numNodes = CkNumPes();
01430   int *source_flags = new int[numNodes];
01431   int node;
01432   for ( node=0; node<numNodes; ++node ) {
01433     source_flags[node] = 0;
01434     recipPeDest[node] = 0;
01435   }
01436 
01437   // // make sure that we don't get ahead of ourselves on this node
01438   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01439   //   source_flags[CkMyPe()] = 1;
01440   //   recipPeDest[myRecipPe] = 1;
01441   // }
01442 
01443   for ( int pid=0; pid < numPatches; ++pid ) {
01444     int pnode = patchMap->node(pid);
01445 #ifdef NAMD_CUDA
01446     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01447 #endif
01448     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01449     BigReal minx = patchMap->min_a(pid);
01450     BigReal maxx = patchMap->max_a(pid);
01451     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01452     // min1 (max1) is smallest (largest) grid line for this patch
01453     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01454     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01455     for ( int i=min1; i<=max1; ++i ) {
01456       int ix = i;
01457       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01458       while ( ix < 0 ) ix += myGrid.K1;
01459       // set source_flags[pnode] if this patch sends to our node
01460       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01461            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01462         source_flags[pnode] = 1;
01463       }
01464       // set dest_flags[] for node that our patch sends to
01465 #ifdef NAMD_CUDA
01466       if ( offload ) {
01467         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01468           recipPeDest[ix / myGrid.block1] = 1;
01469         }
01470       } else
01471 #endif
01472       if ( pnode == CkMyPe() ) {
01473         recipPeDest[ix / myGrid.block1] = 1;
01474       }
01475     }
01476   }
01477 
01478   int numSourcesSamePhysicalNode = 0;
01479   numSources = 0;
01480   numDestRecipPes = 0;
01481   for ( node=0; node<numNodes; ++node ) {
01482     if ( source_flags[node] ) ++numSources;
01483     if ( recipPeDest[node] ) ++numDestRecipPes;
01484     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01485   }
01486 
01487 #if 0
01488   if ( numSources ) {
01489     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01490             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01491     iout << iINFO << "PME " << CkMyPe() << " sources:";
01492     for ( node=0; node<numNodes; ++node ) {
01493       if ( source_flags[node] ) iout << " " << node;
01494     }
01495     iout << "\n" << endi;
01496   }
01497 #endif
01498 
01499   delete [] source_flags;
01500 
01501   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01502   //           CkMyPe(), numSources, numDestRecipPes);
01503 
01504   }  // decide how many pes this node exchanges charges with (end)
01505 
01506   ungrid_count = numDestRecipPes;
01507 
01508   sendTransBarrier_received = 0;
01509 
01510   if ( myGridPe < 0 && myTransPe < 0 ) return;
01511   // the following only for nodes doing reciprocal sum
01512 
01513   if ( myTransPe >= 0 ) {
01514     recipEvirPe = findRecipEvirPe();
01515     pmeProxy[recipEvirPe].addRecipEvirClient();
01516   }
01517 
01518   if ( myTransPe >= 0 ) {
01519       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01520       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01521       #ifdef OPENATOM_VERSION
01522       if ( simParams->openatomOn ) { 
01523         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01524         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01525       } else {
01526         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01527       }
01528       #else  // OPENATOM_VERSION
01529       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01530       #endif // OPENATOM_VERSION
01531   }
01532 
01533   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01534   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01535   if ( local_size < local_size_2 ) local_size = local_size_2;
01536   qgrid = new float[local_size*numGrids];
01537   if ( numGridPes > 1 || numTransPes > 1 ) {
01538     kgrid = new float[local_size*numGrids];
01539   } else {
01540     kgrid = qgrid;
01541   }
01542   qgrid_size = local_size;
01543 
01544   if ( myGridPe >= 0 ) {
01545   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01546   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01547   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01548   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01549   }
01550 
01551   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01552 #ifdef NAMD_FFTW
01553   CmiLock(fftw_plan_lock);
01554 #ifdef NAMD_FFTW_3
01555   work = new fftwf_complex[n[0]];
01556   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01557   if ( myGridPe >= 0 ) {
01558     forward_plan_yz=new fftwf_plan[numGrids];
01559     backward_plan_yz=new fftwf_plan[numGrids];
01560   }
01561   if ( myTransPe >= 0 ) {
01562     forward_plan_x=new fftwf_plan[numGrids];
01563     backward_plan_x=new fftwf_plan[numGrids];
01564   }
01565   /* need one plan per grid */
01566   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01567   if ( myGridPe >= 0 ) {
01568     for( int g=0; g<numGrids; g++)
01569       {
01570         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01571                                                      localInfo[myGridPe].nx,
01572                                                      qgrid + qgrid_size * g,
01573                                                      NULL,
01574                                                      1,
01575                                                      myGrid.dim2 * myGrid.dim3,
01576                                                      (fftwf_complex *) 
01577                                                      (qgrid + qgrid_size * g),
01578                                                      NULL,
01579                                                      1,
01580                                                      myGrid.dim2 * (myGrid.dim3/2),
01581                                                      fftwFlags);
01582       }
01583   }
01584   int zdim = myGrid.dim3;
01585   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01586   if ( ! CkMyPe() ) iout << " 2..." << endi;
01587   if ( myTransPe >= 0 ) {
01588     for( int g=0; g<numGrids; g++)
01589       {
01590 
01591         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01592                                                 (fftwf_complex *)
01593                                                 (kgrid+qgrid_size*g),
01594                                                 NULL,
01595                                                 xStride,
01596                                                 1,
01597                                                 (fftwf_complex *)
01598                                                 (kgrid+qgrid_size*g),
01599                                                 NULL,
01600                                                 xStride,
01601                                                 1,
01602                                                 FFTW_FORWARD,fftwFlags);
01603         
01604       }
01605   }
01606   if ( ! CkMyPe() ) iout << " 3..." << endi;
01607   if ( myTransPe >= 0 ) {
01608     for( int g=0; g<numGrids; g++)
01609       {
01610         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01611                                                  (fftwf_complex *)
01612                                                  (kgrid+qgrid_size*g),
01613                                                  NULL,
01614                                                  xStride,
01615                                                  1,
01616                                                  (fftwf_complex *)
01617                                                  (kgrid+qgrid_size*g),
01618                                                  NULL,
01619                                                  xStride,
01620                                                  1,
01621                                                  FFTW_BACKWARD, fftwFlags);
01622 
01623       }
01624   }
01625   if ( ! CkMyPe() ) iout << " 4..." << endi;
01626   if ( myGridPe >= 0 ) {
01627     for( int g=0; g<numGrids; g++)
01628       {
01629         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01630                                                       localInfo[myGridPe].nx,
01631                                                       (fftwf_complex *)
01632                                                       (qgrid + qgrid_size * g),
01633                                                       NULL,
01634                                                       1,
01635                                                       myGrid.dim2*(myGrid.dim3/2),
01636                                                       qgrid + qgrid_size * g,
01637                                                       NULL,
01638                                                       1,
01639                                                       myGrid.dim2 * myGrid.dim3,
01640                                                       fftwFlags);
01641       }
01642   }
01643   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01644 
01645 #else
01646   work = new fftw_complex[n[0]];
01647 
01648   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01649   if ( myGridPe >= 0 ) {
01650   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01651         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01652         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01653   }
01654   if ( ! CkMyPe() ) iout << " 2..." << endi;
01655   if ( myTransPe >= 0 ) {
01656       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01657         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01658         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01659         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01660   }
01661   if ( ! CkMyPe() ) iout << " 3..." << endi;
01662   if ( myTransPe >= 0 ) {
01663   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01664         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01665         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01666         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01667   }
01668   if ( ! CkMyPe() ) iout << " 4..." << endi;
01669   if ( myGridPe >= 0 ) {
01670   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01671         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01672         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01673   }
01674   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01675 #endif
01676   CmiUnlock(fftw_plan_lock);
01677 #else
01678   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01679 #endif
01680 
01681   if ( myGridPe >= 0 && numSources == 0 )
01682                 NAMD_bug("PME grid elements exist without sources.");
01683   grid_count = numSources;
01684   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01685   trans_count = numGridPes;
01686 }

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2703 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02703                                         {
02704 
02705   noWorkCount = 0;
02706   doWorkCount = 0;
02707   ungridForcesCount = 0;
02708 
02709   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02710 
02711   SimParameters *simParams = Node::Object()->simParameters;
02712 
02713   strayChargeErrors = 0;
02714 
02715 #ifdef NAMD_CUDA
02716  PatchMap *patchMap = PatchMap::Object();
02717  int pe = master_pe = CkNodeFirst(CkMyNode());
02718  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02719     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02720     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02721     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02722     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02723     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02724         && pe != deviceCUDA->getMasterPe() ) {
02725       master_pe = pe;
02726     }
02727  }
02728  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02729    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02730  }
02731 
02732  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02733  bool cudaFirst = 1;
02734  if ( offload ) {
02735   CmiLock(cuda_lock);
02736   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02737  }
02738 
02739  if ( cudaFirst ) {
02740   nodePmeMgr->master_pe = master_pe;
02741   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02742  }
02743 #endif
02744 
02745   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02746   fsize = myGrid.K1 * myGrid.dim2;
02747   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02748 #ifdef NAMD_CUDA
02749  if ( ! offload )
02750 #endif
02751  {
02752   q_arr = new float*[fsize*numGrids];
02753   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02754   q_list = new float*[fsize*numGrids];
02755   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02756   q_count = 0;
02757  }
02758 
02759 #ifdef NAMD_CUDA
02760  if ( cudaFirst || ! offload ) {
02761 #endif
02762   f_arr = new char[fsize*numGrids];
02763   // memset to non-zero value has race condition on BlueGene/Q
02764   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02765   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02766 
02767   for ( int g=0; g<numGrids; ++g ) {
02768     char *f = f_arr + g*fsize;
02769     if ( usePencils ) {
02770       int K1 = myGrid.K1;
02771       int K2 = myGrid.K2;
02772       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02773       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02774       int dim2 = myGrid.dim2;
02775       for (int ap=0; ap<numPencilsActive; ++ap) {
02776         int ib = activePencils[ap].i;
02777         int jb = activePencils[ap].j;
02778         int ibegin = ib*block1;
02779         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02780         int jbegin = jb*block2;
02781         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02782         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02783         for ( int i=ibegin; i<iend; ++i ) {
02784           for ( int j=jbegin; j<jend; ++j ) {
02785             f[i*dim2+j] = 0;
02786           }
02787         }
02788       }
02789     } else {
02790       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02791       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02792       for (int pe=0; pe<numGridPes; pe++) {
02793         if ( ! recipPeDest[pe] ) continue;
02794         int start = pe * bsize;
02795         int len = bsize;
02796         if ( start >= qsize ) { start = 0; len = 0; }
02797         if ( start + len > qsize ) { len = qsize - start; }
02798         int zdim = myGrid.dim3;
02799         int fstart = start / zdim;
02800         int flen = len / zdim;
02801         memset(f + fstart, 0, flen*sizeof(char));
02802         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02803       }
02804     }
02805   }
02806 #ifdef NAMD_CUDA
02807  }
02808  if ( offload ) {
02809  cudaSetDevice(deviceCUDA->getDeviceID());
02810  if ( cudaFirst ) {
02811 
02812   int f_alloc_count = 0;
02813   for ( int n=fsize, i=0; i<n; ++i ) {
02814     if ( f_arr[i] == 0 ) {
02815       ++f_alloc_count;
02816     }
02817   }
02818   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02819 
02820   q_arr = new float*[fsize*numGrids];
02821   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02822 
02823   float **q_arr_dev_host = new float*[fsize];
02824   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02825 
02826   float **v_arr_dev_host = new float*[fsize];
02827   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02828 
02829   int q_stride = myGrid.K3+myGrid.order-1;
02830   q_data_size = f_alloc_count * q_stride * sizeof(float);
02831   ffz_size = (fsize + q_stride) * sizeof(int);
02832 
02833   // tack ffz onto end of q_data to allow merged transfer
02834   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02835   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02836   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02837   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02838   cudaMalloc((void**) &v_data_dev, q_data_size);
02839   cuda_errcheck("malloc grid data for pme");
02840   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02841   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02842   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02843   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02844   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02845 
02846   f_alloc_count = 0;
02847   for ( int n=fsize, i=0; i<n; ++i ) {
02848     if ( f_arr[i] == 0 ) {
02849       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02850       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02851       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02852       ++f_alloc_count;
02853     } else {
02854       q_arr[i] = 0;
02855       q_arr_dev_host[i] = 0;
02856       v_arr_dev_host[i] = 0;
02857     }
02858   }
02859 
02860   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02861   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02862   delete [] q_arr_dev_host;
02863   delete [] v_arr_dev_host;
02864   delete [] f_arr;
02865   f_arr = new char[fsize + q_stride];
02866   fz_arr = f_arr + fsize;
02867   memset(f_arr, 0, fsize + q_stride);
02868   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02869 
02870   cuda_errcheck("initialize grid data for pme");
02871 
02872   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02873   cuda_errcheck("initialize bspline coefficients for pme");
02874 
02875 #define XCOPY(X) masterPmeMgr->X = X;
02876   XCOPY(bspline_coeffs_dev)
02877   XCOPY(bspline_dcoeffs_dev)
02878   XCOPY(q_arr)
02879   XCOPY(q_arr_dev)
02880   XCOPY(v_arr_dev)
02881   XCOPY(q_data_size)
02882   XCOPY(q_data_host)
02883   XCOPY(q_data_dev)
02884   XCOPY(v_data_dev)
02885   XCOPY(ffz_size)
02886   XCOPY(ffz_host)
02887   XCOPY(ffz_dev)
02888   XCOPY(f_arr)
02889   XCOPY(fz_arr)
02890 #undef XCOPY
02891   //CkPrintf("pe %d init first\n", CkMyPe());
02892  } else { // cudaFirst
02893   //CkPrintf("pe %d init later\n", CkMyPe());
02894 #define XCOPY(X) X = masterPmeMgr->X;
02895   XCOPY(bspline_coeffs_dev)
02896   XCOPY(bspline_dcoeffs_dev)
02897   XCOPY(q_arr)
02898   XCOPY(q_arr_dev)
02899   XCOPY(v_arr_dev)
02900   XCOPY(q_data_size)
02901   XCOPY(q_data_host)
02902   XCOPY(q_data_dev)
02903   XCOPY(v_data_dev)
02904   XCOPY(ffz_size)
02905   XCOPY(ffz_host)
02906   XCOPY(ffz_dev)
02907   XCOPY(f_arr)
02908   XCOPY(fz_arr)
02909 #undef XCOPY
02910  } // cudaFirst
02911   CmiUnlock(cuda_lock);
02912  } else // offload
02913 #endif // NAMD_CUDA
02914  {
02915   fz_arr = new char[myGrid.K3+myGrid.order-1];
02916  }
02917 
02918 #if 0 && USE_PERSISTENT
02919   recvGrid_handle = NULL;
02920 #endif
02921 }
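
One detail worth noting in the offload branch above is the storage scheme: q_arr holds one pointer per (x,y) column (fsize of them), but only columns whose f_arr flag marks them as locally written receive q_stride floats inside a single packed host/device buffer, which keeps the host-to-device copy contiguous. A simplified host-only sketch of that packing, with made-up sizes and no CUDA:

    #include <cstddef>
    #include <vector>

    int main() {
      const int fsize = 16, q_stride = 12;       // hypothetical column count / depth
      std::vector<char> f_arr(fsize, 2);         // 2 = not written here, 0 = local
      for (int i : {1, 4, 5, 9}) f_arr[i] = 0;   // pretend these columns are local

      int alloc_count = 0;
      for (int i = 0; i < fsize; ++i) if (f_arr[i] == 0) ++alloc_count;

      // One packed buffer; q_arr[i] is either null or points at its column slot.
      std::vector<float>  q_data((std::size_t) alloc_count * q_stride, 0.f);
      std::vector<float*> q_arr(fsize, nullptr);
      int k = 0;
      for (int i = 0; i < fsize; ++i)
        if (f_arr[i] == 0) q_arr[i] = q_data.data() + (std::size_t) (k++) * q_stride;
      return 0;
    }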

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1690 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01690                                                    {
01691   delete msg;
01692   if ( ! usePencils ) return;
01693 
01694   SimParameters *simParams = Node::Object()->simParameters;
01695 
01696   PatchMap *patchMap = PatchMap::Object();
01697   Lattice lattice = simParams->lattice;
01698   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01699   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01700   BigReal cutoff = simParams->cutoff;
01701   BigReal patchdim = simParams->patchDimension;
01702   int numPatches = patchMap->numPatches();
01703 
01704   pencilActive = new char[xBlocks*yBlocks];
01705   for ( int i=0; i<xBlocks; ++i ) {
01706     for ( int j=0; j<yBlocks; ++j ) {
01707       pencilActive[i*yBlocks+j] = 0;
01708     }
01709   }
01710 
01711   for ( int pid=0; pid < numPatches; ++pid ) {
01712     int pnode = patchMap->node(pid);
01713 #ifdef NAMD_CUDA
01714     if ( offload ) {
01715       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01716     } else
01717 #endif
01718     if ( pnode != CkMyPe() ) continue;
01719 
01720     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01721     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01722 
01723     BigReal minx = patchMap->min_a(pid);
01724     BigReal maxx = patchMap->max_a(pid);
01725     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01726     // min1 (max1) is smallest (largest) grid line for this patch
01727     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01728     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01729 
01730     BigReal miny = patchMap->min_b(pid);
01731     BigReal maxy = patchMap->max_b(pid);
01732     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01733     // min2 (max2) is smallest (largest) grid line for this patch
01734     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01735     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01736 
01737     for ( int i=min1; i<=max1; ++i ) {
01738       int ix = i;
01739       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01740       while ( ix < 0 ) ix += myGrid.K1;
01741       for ( int j=min2; j<=max2; ++j ) {
01742         int jy = j;
01743         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01744         while ( jy < 0 ) jy += myGrid.K2;
01745         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01746       }
01747     }
01748   }
01749 
01750   numPencilsActive = 0;
01751   for ( int i=0; i<xBlocks; ++i ) {
01752     for ( int j=0; j<yBlocks; ++j ) {
01753       if ( pencilActive[i*yBlocks+j] ) {
01754         ++numPencilsActive;
01755 #ifdef NAMD_CUDA
01756         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01757 #endif
01758         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01759       }
01760     }
01761   }
01762   activePencils = new ijpair[numPencilsActive];
01763   numPencilsActive = 0;
01764   for ( int i=0; i<xBlocks; ++i ) {
01765     for ( int j=0; j<yBlocks; ++j ) {
01766       if ( pencilActive[i*yBlocks+j] ) {
01767         activePencils[numPencilsActive++] = ijpair(i,j);
01768       }
01769     }
01770   }
01771   if ( simParams->PMESendOrder ) {
01772     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01773   } else {
01774     Random rand(CkMyPe());
01775     rand.reorder(activePencils,numPencilsActive);
01776   }
01777   //if ( numPencilsActive ) {
01778   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01779   //}
01780 
01781   ungrid_count = numPencilsActive;
01782 }
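
The marking loop above maps each grid line touched by a local patch, after a periodic wrap into [0,K1) x [0,K2), onto the (x,y) pencil block that owns it. A standalone sketch of just that index arithmetic, with made-up grid and block sizes:

    #include <cstdio>
    #include <vector>

    int main() {
      const int K1 = 32, K2 = 32, xBlocks = 4, yBlocks = 4;
      const int block1 = (K1 + xBlocks - 1) / xBlocks;   // grid lines per x block
      const int block2 = (K2 + yBlocks - 1) / yBlocks;
      std::vector<char> pencilActive(xBlocks * yBlocks, 0);

      int min1 = -3, max1 = 5, min2 = 28, max2 = 36;     // hypothetical patch extent
      for (int i = min1; i <= max1; ++i) {
        int ix = i;
        while (ix >= K1) ix -= K1;    // periodic wrap, exactly as in the code above
        while (ix < 0)   ix += K1;
        for (int j = min2; j <= max2; ++j) {
          int jy = j;
          while (jy >= K2) jy -= K2;
          while (jy < 0)   jy += K2;
          pencilActive[(ix / block1) * yBlocks + (jy / block2)] = 1;
        }
      }

      int numPencilsActive = 0;
      for (char a : pencilActive) numPencilsActive += a;
      std::printf("active pencils: %d\n", numPencilsActive);
      return 0;
    }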

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3506 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03506                                         {
03507 #ifdef NAMD_CUDA
03508   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03509   CUDA_POLL(cuda_check_pme_charges,this);
03510 #else
03511   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03512 #endif
03513 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2639 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02639                                     {
02640 #ifdef NAMD_CUDA
02641   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02642   CUDA_POLL(cuda_check_pme_forces,this);
02643 #else
02644   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02645 #endif
02646 }
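
Both poll routines hand the check function back to the scheduler through the CUDA_POLL macro (defined earlier in ComputePme.C) so it runs again on a later pass instead of blocking a PE. A rough standalone analogue of the underlying non-blocking check, using plain CUDA runtime calls rather than the Charm++/Converse machinery; the busy loop in main merely stands in for the scheduler:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Illustrative only: query a recorded event without blocking the host thread.
    bool pmeWorkDone(cudaEvent_t ev) {
      cudaError_t err = cudaEventQuery(ev);
      if (err == cudaSuccess)       return true;    // GPU has reached the event
      if (err == cudaErrorNotReady) return false;   // poll again later
      std::printf("CUDA error: %s\n", cudaGetErrorString(err));
      return false;
    }

    int main() {
      cudaEvent_t ev;
      cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
      cudaEventRecord(ev, 0);
      while (!pmeWorkDone(ev)) { /* NAMD would run other scheduler work here */ }
      cudaEventDestroy(ev);
      return 0;
    }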

void ComputePmeMgr::procTrans ( PmeTransMsg *  ) 

Definition at line 2045 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02045                                               {
02046   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02047   if ( trans_count == numGridPes ) {
02048     lattice = msg->lattice;
02049     grid_sequence = msg->sequence;
02050   }
02051 
02052  if ( msg->nx ) {
02053   int zdim = myGrid.dim3;
02054   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02055   int first_pe = nodeInfo.pe_start;
02056   int last_pe = first_pe+nodeInfo.npe-1;
02057   int y_skip = localInfo[myTransPe].y_start_after_transpose
02058              - localInfo[first_pe].y_start_after_transpose;
02059   int ny_msg = localInfo[last_pe].y_start_after_transpose
02060              + localInfo[last_pe].ny_after_transpose
02061              - localInfo[first_pe].y_start_after_transpose;
02062   int ny = localInfo[myTransPe].ny_after_transpose;
02063   int x_start = msg->x_start;
02064   int nx = msg->nx;
02065   for ( int g=0; g<numGrids; ++g ) {
02066     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02067         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02068         nx*ny*zdim*sizeof(float));
02069   }
02070  }
02071 
02072   --trans_count;
02073 
02074   if ( trans_count == 0 ) {
02075     pmeProxyDir[CkMyPe()].gridCalc2();
02076   }
02077 }

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  ) 

Definition at line 2299 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02299                                                   {
02300   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02301 
02302 #if CMK_BLUEGENEL
02303   CmiNetworkProgressAfter (0);
02304 #endif
02305 
02306   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02307   int first_pe = nodeInfo.pe_start;
02308   int g;
02309 
02310  if ( msg->ny ) {
02311   int zdim = myGrid.dim3;
02312   int last_pe = first_pe+nodeInfo.npe-1;
02313   int x_skip = localInfo[myGridPe].x_start
02314              - localInfo[first_pe].x_start;
02315   int nx_msg = localInfo[last_pe].x_start
02316              + localInfo[last_pe].nx
02317              - localInfo[first_pe].x_start;
02318   int nx = localInfo[myGridPe].nx;
02319   int y_start = msg->y_start;
02320   int ny = msg->ny;
02321   int slicelen = myGrid.K2 * zdim;
02322   int cpylen = ny * zdim;
02323   for ( g=0; g<numGrids; ++g ) {
02324     float *q = qgrid + qgrid_size * g + y_start * zdim;
02325     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02326     for ( int x = 0; x < nx; ++x ) {
02327       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02328       q += slicelen;
02329       qmsg += cpylen;
02330     }
02331   }
02332  }
02333 
02334   --untrans_count;
02335 
02336   if ( untrans_count == 0 ) {
02337     pmeProxyDir[CkMyPe()].gridCalc3();
02338   }
02339 }
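
The inner loop above copies one packed message row at a time into the local charge grid: the message stores this block as nx rows of cpylen = ny*zdim contiguous floats, while the destination slab keeps full rows of slicelen = K2*zdim floats, so the two pointers advance by different strides. A self-contained sketch of that copy, with made-up sizes:

    #include <cstring>
    #include <vector>

    int main() {
      const int nx = 3, ny = 2, K2 = 8, zdim = 4;
      const int cpylen   = ny * zdim;          // packed row length in the message
      const int slicelen = K2 * zdim;          // full row length in the local slab
      const int y_start  = 2;                  // where this block lands in y

      std::vector<float> qmsg(nx * cpylen, 1.f);       // stands in for msg->qgrid
      std::vector<float> qgrid(nx * slicelen, 0.f);    // local slab for this x-range

      float *q = qgrid.data() + y_start * zdim;
      const float *m = qmsg.data();
      for (int x = 0; x < nx; ++x) {
        std::memcpy(q, m, cpylen * sizeof(float));
        q += slicelen;                         // skip to the same y offset of next row
        m += cpylen;                           // message rows are densely packed
      }
      return 0;
    }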

void ComputePmeMgr::recvAck ( PmeAckMsg *  ) 

Definition at line 2441 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02441                                           {
02442   if ( msg ) delete msg;
02443 #ifdef NAMD_CUDA
02444   if ( offload ) {
02445     CmiLock(cuda_lock);
02446     if ( ungrid_count == 0 ) {
02447       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02448     }
02449     int uc = --ungrid_count;
02450     CmiUnlock(cuda_lock);
02451 
02452     if ( uc == 0 ) {
02453       pmeProxyDir[master_pe].ungridCalc();
02454     }
02455     return;
02456   }
02457 #endif
02458   --ungrid_count;
02459 
02460   if ( ungrid_count == 0 ) {
02461     pmeProxyDir[CkMyPe()].ungridCalc();
02462   }
02463 }

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 799 of file ComputePme.C.

00800                                                                        {
00801   xPencil = x;  yPencil = y;  zPencil = z;
00802   
00803     if(CmiMyRank()==0)
00804     {
00805       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00806       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00807       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00808     }
00809 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3515 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03515                                         {
03516   chargeGridReady(*saved_lattice,saved_sequence);
03517 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  ) 

Definition at line 1824 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01824                                             {
01825   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01826   if ( grid_count == 0 ) {
01827     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01828   }
01829   if ( grid_count == numSources ) {
01830     lattice = msg->lattice;
01831     grid_sequence = msg->sequence;
01832   }
01833 
01834   int zdim = myGrid.dim3;
01835   int zlistlen = msg->zlistlen;
01836   int *zlist = msg->zlist;
01837   float *qmsg = msg->qgrid;
01838   for ( int g=0; g<numGrids; ++g ) {
01839     char *f = msg->fgrid + fgrid_len * g;
01840     float *q = qgrid + qgrid_size * g;
01841     for ( int i=0; i<fgrid_len; ++i ) {
01842       if ( f[i] ) {
01843         for ( int k=0; k<zlistlen; ++k ) {
01844           q[zlist[k]] += *(qmsg++);
01845         }
01846       }
01847       q += zdim;
01848     }
01849   }
01850 
01851   gridmsg_reuse[numSources-grid_count] = msg;
01852   --grid_count;
01853 
01854   if ( grid_count == 0 ) {
01855     pmeProxyDir[CkMyPe()].gridCalc1();
01856     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01857   }
01858 }
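
The accumulation loop above exploits the sparsity of the sent data twice: only (x,y) columns flagged in fgrid contribute values, and within each such column only the z planes listed in zlist were transmitted, so qmsg is walked linearly while its values are scatter-added into full-depth columns of qgrid. A small standalone sketch of the same unpacking, with invented sizes:

    #include <cstddef>
    #include <vector>

    int main() {
      const int ncolumns = 4, zdim = 8;
      std::vector<float> qgrid(ncolumns * zdim, 0.f);

      std::vector<char>  fgrid = {1, 0, 1, 1};          // which columns carry data
      std::vector<int>   zlist = {0, 3, 5};             // which z entries were sent
      std::vector<float> qmsg(3 * zlist.size(), 2.f);   // 3 flagged columns of data

      const float *m = qmsg.data();
      float *q = qgrid.data();
      for (int i = 0; i < ncolumns; ++i) {
        if (fgrid[i]) {
          for (std::size_t k = 0; k < zlist.size(); ++k) q[zlist[k]] += *(m++);
        }
        q += zdim;                    // next column, whether or not it had data
      }
      return 0;
    }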

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  ) 

Definition at line 3006 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03006                                                  {
03007   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03008   for ( int g=0; g<numGrids; ++g ) {
03009     evir[g] += msg->evir[g];
03010   }
03011   delete msg;
03012   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03013   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03014 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  ) 

Definition at line 2027 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02027                                                           {
02028   procTrans(msg->msg);
02029   CmiLock(msg->lock);
02030   int count = --(*msg->count);
02031   CmiUnlock(msg->lock);
02032   if ( count == 0 ) {
02033     CmiDestroyLock(msg->lock);
02034     delete msg->count;
02035     delete msg->msg;
02036   }
02037   delete msg;
02038 }
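
recvSharedTrans() (and recvSharedUntrans() below) implement a node-level fan-out: one PmeTransMsg is shared by several managers on the node, each processes it, and the last one to decrement the shared counter under the lock destroys the shared pieces. A generic plain-C++ analogue of that ownership pattern, using std::mutex in place of CmiNodeLock; this is a sketch of the idea, not Charm++ code:

    #include <mutex>

    struct SharedMsg {
      int        *count;    // consumers still to run
      std::mutex *lock;
      int        *payload;  // stands in for the shared PmeTransMsg
    };

    void consumeShared(SharedMsg *m) {
      /* ... process *m->payload, as procTrans() does ... */
      m->lock->lock();
      int remaining = --(*m->count);
      m->lock->unlock();
      if (remaining == 0) {        // last consumer releases the shared state
        delete m->lock;
        delete m->count;
        delete m->payload;
      }
      delete m;                    // each consumer owns its own wrapper
    }

    int main() {
      int        *count   = new int(2);
      std::mutex *lock    = new std::mutex;
      int        *payload = new int(42);
      consumeShared(new SharedMsg{count, lock, payload});
      consumeShared(new SharedMsg{count, lock, payload});
      return 0;
    }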

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  ) 

Definition at line 2281 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02281                                                               {
02282   procUntrans(msg->msg);
02283   CmiLock(msg->lock);
02284   int count = --(*msg->count);
02285   CmiUnlock(msg->lock);
02286   if ( count == 0 ) {
02287     CmiDestroyLock(msg->lock);
02288     delete msg->count;
02289     delete msg->msg;
02290   }
02291   delete msg;
02292 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  ) 

Definition at line 2040 of file ComputePme.C.

References procTrans().

02040                                               {
02041   procTrans(msg);
02042   delete msg;
02043 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  ) 

Definition at line 2426 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02426                                               {
02427   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02428 #ifdef NAMD_CUDA
02429   if ( ! offload )  // would need lock
02430 #endif
02431   if ( ungrid_count == 0 ) {
02432     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02433   }
02434 
02435   if ( usePencils ) copyPencils(msg);
02436   else copyResults(msg);
02437   delete msg;
02438   recvAck(0);
02439 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  ) 

Definition at line 2294 of file ComputePme.C.

References procUntrans().

02294                                                   {
02295   procUntrans(msg);
02296   delete msg;
02297 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3492 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03492                                         {
03493   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03494     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03495     int cs = mgr->pmeComputes.size();
03496     if ( cs ) {
03497       mgr->ungridForcesCount = cs;
03498       mgr->recipEvirCount = mgr->recipEvirClients;
03499       masterPmeMgr->chargeGridSubmittedCount++;
03500     }
03501   }
03502   pmeProxy[master_pe].recvChargeGridReady();
03503 }

void ComputePmeMgr::sendData ( Lattice &  ,
int  sequence 
)

Definition at line 3929 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03929                                                            {
03930 
03931   sendDataHelper_lattice = &lattice;
03932   sendDataHelper_sequence = sequence;
03933   sendDataHelper_sourcepe = CkMyPe();
03934   sendDataHelper_errors = strayChargeErrors;
03935   strayChargeErrors = 0;
03936 
03937 #ifdef NAMD_CUDA
03938   if ( offload ) {
03939     for ( int i=0; i < numGridPes; ++i ) {
03940       int pe = gridPeOrder[i];  // different order
03941       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03942 #if CMK_MULTICORE
03943       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03944       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03945 #else
03946       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03947 #endif
03948     }
03949   } else
03950 #endif
03951   {
03952     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03953   }
03954  
03955 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3916 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03916                                            {
03917   nodePmeMgr->sendDataHelper(iter);
03918 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice &  ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3807 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03807                                                                                                               {
03808 
03809   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03810 
03811   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03812 
03813   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03814   for (int j=first; j<=last; j++) {
03815     int pe = gridPeOrder[j];  // different order
03816     if ( ! recipPeDest[pe] && ! errors ) continue;
03817     int start = pe * bsize;
03818     int len = bsize;
03819     if ( start >= qsize ) { start = 0; len = 0; }
03820     if ( start + len > qsize ) { len = qsize - start; }
03821     int zdim = myGrid.dim3;
03822     int fstart = start / zdim;
03823     int flen = len / zdim;
03824     int fcount = 0;
03825     int i;
03826 
03827     int g;
03828     for ( g=0; g<numGrids; ++g ) {
03829       char *f = f_arr + fstart + g*fsize;
03830 #ifdef NAMD_CUDA
03831      if ( offload ) {
03832       int errcount = 0;
03833       for ( i=0; i<flen; ++i ) {
03834         f[i] = ffz_host[fstart+i];
03835         fcount += f[i];
03836         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03837       }
03838       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03839      } else
03840 #endif
03841       for ( i=0; i<flen; ++i ) {
03842         fcount += f[i];
03843       }
03844       if ( ! recipPeDest[pe] ) {
03845         int errfound = 0;
03846         for ( i=0; i<flen; ++i ) {
03847           if ( f[i] == 3 ) {
03848             errfound = 1;
03849             break;
03850           }
03851         }
03852         if ( errfound ) {
03853           iout << iERROR << "Stray PME grid charges detected: "
03854                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03855           int iz = -1;
03856           for ( i=0; i<flen; ++i ) {
03857             if ( f[i] == 3 ) {
03858               f[i] = 2;
03859               int jz = (i+fstart)/myGrid.K2;
03860               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03861             }
03862           }
03863           iout << "\n" << endi;
03864         }
03865       }
03866     }
03867 
03868 #ifdef NETWORK_PROGRESS
03869     CmiNetworkProgress();
03870 #endif
03871 
03872     if ( ! recipPeDest[pe] ) continue;
03873 
03874     int zlistlen = 0;
03875     for ( i=0; i<myGrid.K3; ++i ) {
03876       if ( fz_arr[i] ) ++zlistlen;
03877     }
03878 
03879     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03880                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03881 
03882     msg->sourceNode = sourcepe;
03883     msg->lattice = lattice;
03884     msg->start = fstart;
03885     msg->len = flen;
03886     msg->zlistlen = zlistlen;
03887     int *zlist = msg->zlist;
03888     zlistlen = 0;
03889     for ( i=0; i<myGrid.K3; ++i ) {
03890       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03891     }
03892     float *qmsg = msg->qgrid;
03893     for ( g=0; g<numGrids; ++g ) {
03894       char *f = f_arr + fstart + g*fsize;
03895       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03896       float **q = q_arr + fstart + g*fsize;
03897       for ( i=0; i<flen; ++i ) {
03898         if ( f[i] ) {
03899           for (int h=0; h<myGrid.order-1; ++h) {
03900             q[i][h] += q[i][myGrid.K3+h];
03901           }
03902           for ( int k=0; k<zlistlen; ++k ) {
03903             *(qmsg++) = q[i][zlist[k]];
03904           }
03905         }
03906       }
03907     }
03908 
03909     msg->sequence = compute_sequence;
03910     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03911     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03912   }
03913 
03914 }
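
The packing loop above is the sending-side counterpart of recvGrid(): it first builds zlist from the non-zero entries of fz_arr, folds the order-1 periodic wrap-around planes at the end of each column back onto its front, and then emits only the listed z values for columns whose flag is set. A standalone sketch of that per-column treatment, with invented sizes:

    #include <vector>

    int main() {
      const int K3 = 8, order = 4;                      // hypothetical depth and order
      std::vector<float> column(K3 + order - 1, 1.f);   // one flagged (x,y) column
      std::vector<char>  fz(K3, 0);
      fz[0] = fz[3] = fz[5] = 1;                        // z planes that hold data

      std::vector<int> zlist;                           // compressed z index list
      for (int i = 0; i < K3; ++i) if (fz[i]) zlist.push_back(i);

      for (int h = 0; h < order - 1; ++h)               // wrap-around fold, as above
        column[h] += column[K3 + h];

      std::vector<float> qmsg;                          // packed message payload
      for (int k = 0; k < (int) zlist.size(); ++k)
        qmsg.push_back(column[zlist[k]]);
      return 0;
    }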

void ComputePmeMgr::sendPencils ( Lattice &  ,
int  sequence 
)

Definition at line 3702 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03702                                                               {
03703 
03704   sendDataHelper_lattice = &lattice;
03705   sendDataHelper_sequence = sequence;
03706   sendDataHelper_sourcepe = CkMyPe();
03707 
03708 #ifdef NAMD_CUDA
03709   if ( offload ) {
03710     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03711 #if CMK_MULTICORE
03712       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03713       int ib = activePencils[ap].i;
03714       int jb = activePencils[ap].j;
03715       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03716       pmeProxy[destproc].sendPencilsHelper(ap);
03717 #else
03718       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03719 #endif
03720     }
03721   } else
03722 #endif
03723   {
03724     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03725   }
03726 
03727   if ( strayChargeErrors ) {
03728    strayChargeErrors = 0;
03729    iout << iERROR << "Stray PME grid charges detected: "
03730         << CkMyPe() << " sending to (x,y)";
03731    int K1 = myGrid.K1;
03732    int K2 = myGrid.K2;
03733    int dim2 = myGrid.dim2;
03734    int block1 = myGrid.block1;
03735    int block2 = myGrid.block2;
03736    for (int ib=0; ib<xBlocks; ++ib) {
03737     for (int jb=0; jb<yBlocks; ++jb) {
03738      int ibegin = ib*block1;
03739      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03740      int jbegin = jb*block2;
03741      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03742      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03743 
03744      for ( int g=0; g<numGrids; ++g ) {
03745        char *f = f_arr + g*fsize;
03746        if ( ! pencilActive[ib*yBlocks+jb] ) {
03747            for ( int i=ibegin; i<iend; ++i ) {
03748             for ( int j=jbegin; j<jend; ++j ) {
03749              if ( f[i*dim2+j] == 3 ) {
03750                f[i*dim2+j] = 2;
03751                iout << " (" << i << "," << j << ")";
03752              }
03753             }
03754            }
03755        }
03756      }
03757     }
03758    }
03759    iout << "\n" << endi;
03760   }
03761  
03762 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3689 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03689                                               {
03690   nodePmeMgr->sendPencilsHelper(iter);
03691 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice &  ,
int  sequence,
int  sourcepe 
)

Definition at line 3547 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03547                                                                                                      {
03548 
03549   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03550 
03551 #if 0 && USE_PERSISTENT
03552     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03553 #endif
03554   int K1 = myGrid.K1;
03555   int K2 = myGrid.K2;
03556   int dim2 = myGrid.dim2;
03557   int dim3 = myGrid.dim3;
03558   int block1 = myGrid.block1;
03559   int block2 = myGrid.block2;
03560 
03561   // int savedMessages = 0;
03562   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03563 
03564   for (int ap=first; ap<=last; ++ap) {
03565     int ib = activePencils[ap].i;
03566     int jb = activePencils[ap].j;
03567     int ibegin = ib*block1;
03568     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03569     int jbegin = jb*block2;
03570     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03571     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03572 
03573     int fcount = 0;
03574     for ( int g=0; g<numGrids; ++g ) {
03575       char *f = f_arr + g*fsize;
03576 #ifdef NAMD_CUDA
03577      if ( offload ) {
03578       int errcount = 0;
03579       for ( int i=ibegin; i<iend; ++i ) {
03580        for ( int j=jbegin; j<jend; ++j ) {
03581         int k = i*dim2+j;
03582         f[k] = ffz_host[k];
03583         fcount += f[k];
03584         if ( ffz_host[k] & ~1 ) ++errcount;
03585        }
03586       }
03587       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03588      } else
03589 #endif
03590       for ( int i=ibegin; i<iend; ++i ) {
03591        for ( int j=jbegin; j<jend; ++j ) {
03592         fcount += f[i*dim2+j];
03593        }
03594       }
03595     }
03596 
03597 #ifdef NETWORK_PROGRESS
03598     CmiNetworkProgress();
03599 #endif
03600 
03601     if ( ! pencilActive[ib*yBlocks+jb] )
03602       NAMD_bug("PME activePencils list inconsistent");
03603 
03604     int zlistlen = 0;
03605     for ( int i=0; i<myGrid.K3; ++i ) {
03606       if ( fz_arr[i] ) ++zlistlen;
03607     }
03608 
03609     int hd = ( fcount? 1 : 0 );  // has data?
03610     // if ( ! hd ) ++savedMessages;
03611 
03612     
03613     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03614         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03615     msg->sourceNode = sourcepe;
03616     msg->hasData = hd;
03617     msg->lattice = lattice;
03618    if ( hd ) {
03619 #if 0
03620     msg->start = fstart;
03621     msg->len = flen;
03622 #else
03623     msg->start = -1;   // obsolete?
03624     msg->len = -1;   // obsolete?
03625 #endif
03626     msg->zlistlen = zlistlen;
03627     int *zlist = msg->zlist;
03628     zlistlen = 0;
03629     for ( int i=0; i<myGrid.K3; ++i ) {
03630       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03631     }
03632     char *fmsg = msg->fgrid;
03633     float *qmsg = msg->qgrid;
03634     for ( int g=0; g<numGrids; ++g ) {
03635       char *f = f_arr + g*fsize;
03636       float **q = q_arr + g*fsize;
03637       for ( int i=ibegin; i<iend; ++i ) {
03638        for ( int j=jbegin; j<jend; ++j ) {
03639         *(fmsg++) = f[i*dim2+j];
03640         if( f[i*dim2+j] ) {
03641           for (int h=0; h<myGrid.order-1; ++h) {
03642             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03643           }
03644           for ( int k=0; k<zlistlen; ++k ) {
03645             *(qmsg++) = q[i*dim2+j][zlist[k]];
03646           }
03647         }
03648        }
03649       }
03650     }
03651    }
03652 
03653     msg->sequence = compute_sequence;
03654     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03655     CmiEnableUrgentSend(1);
03656 #if USE_NODE_PAR_RECEIVE
03657     msg->destElem=CkArrayIndex3D(ib,jb,0);
03658     CProxy_PmePencilMap lzm = npMgr->zm;
03659     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03660     int destnode = CmiNodeOf(destproc);
03661     
03662 #if  0 
03663     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03664 #endif
03665     pmeNodeProxy[destnode].recvZGrid(msg);
03666 #if 0 
03667     CmiUsePersistentHandle(NULL, 0);
03668 #endif
03669 #else
03670 #if 0 
03671     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03672 #endif
03673     zPencil(ib,jb,0).recvGrid(msg);
03674 #if 0 
03675     CmiUsePersistentHandle(NULL, 0);
03676 #endif
03677 #endif
03678     CmiEnableUrgentSend(0);
03679   }
03680 
03681 
03682   // if ( savedMessages ) {
03683   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03684   // }
03685 
03686 }
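
For reference, the (ib,jb) pencil indices used above map to grid-line ranges through the same block arithmetic as in initialize_computes(): each pencil covers a block1-by-block2 tile of (x,y) lines, with the last block clamped at the grid edge. A trivial sketch of that mapping, computing the block size as the ceiling division used elsewhere in this file; the grid sizes are made up:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int K1 = 54, xBlocks = 4;
      const int block1 = (K1 + xBlocks - 1) / xBlocks;     // 14 grid lines per block
      for (int ib = 0; ib < xBlocks; ++ib) {
        int ibegin = ib * block1;
        int iend   = std::min(ibegin + block1, K1);        // final block is clamped
        std::printf("x-block %d covers grid lines [%d,%d)\n", ib, ibegin, iend);
      }
      return 0;
    }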

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1936 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01936                                   {
01937 
01938   untrans_count = numTransPes;
01939 
01940 #if     CMK_SMP && USE_CKLOOP
01941   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01942   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01943     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01944   } else
01945 #endif
01946   {
01947     sendTransSubset(0, numTransNodes-1);
01948   }
01949 
01950 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1921 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01921                                          {
01922   sendTransBarrier_received += 1;
01923   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01924   if ( sendTransBarrier_received < numGridPes ) return;
01925   sendTransBarrier_received = 0;
01926   for ( int i=0; i<numGridPes; ++i ) {
01927     pmeProxyDir[gridPeMap[i]].sendTrans();
01928   }
01929 }

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1952 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01952                                                        {
01953   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01954 
01955   // send data for transpose
01956   int zdim = myGrid.dim3;
01957   int nx = localInfo[myGridPe].nx;
01958   int x_start = localInfo[myGridPe].x_start;
01959   int slicelen = myGrid.K2 * zdim;
01960 
01961   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01962 
01963 #if CMK_BLUEGENEL
01964   CmiNetworkProgressAfter (0);
01965 #endif
01966 
01967   for (int j=first; j<=last; j++) {
01968     int node = transNodeOrder[j];  // different order on each node
01969     int pe = transNodeInfo[node].pe_start;
01970     int npe = transNodeInfo[node].npe;
01971     int totlen = 0;
01972     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01973       LocalPmeInfo &li = localInfo[pe];
01974       int cpylen = li.ny_after_transpose * zdim;
01975       totlen += cpylen;
01976     }
01977     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01978                                 PRIORITY_SIZE) PmeTransMsg;
01979     newmsg->sourceNode = myGridPe;
01980     newmsg->lattice = lattice;
01981     newmsg->x_start = x_start;
01982     newmsg->nx = nx;
01983     for ( int g=0; g<numGrids; ++g ) {
01984       float *qmsg = newmsg->qgrid + nx * totlen * g;
01985       pe = transNodeInfo[node].pe_start;
01986       for (int i=0; i<npe; ++i, ++pe) {
01987         LocalPmeInfo &li = localInfo[pe];
01988         int cpylen = li.ny_after_transpose * zdim;
01989         if ( node == myTransNode ) {
01990           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01991           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01992         }
01993         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01994         for ( int x = 0; x < nx; ++x ) {
01995           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01996           q += slicelen;
01997           qmsg += cpylen;
01998         }
01999       }
02000     }
02001     newmsg->sequence = grid_sequence;
02002     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02003     if ( node == myTransNode ) newmsg->nx = 0;
02004     if ( npe > 1 ) {
02005       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02006       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02007     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02008   }
02009 }

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2366 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02366                                    {
02367 
02368 #if     CMK_SMP && USE_CKLOOP
02369   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02370   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02371     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02372   } else
02373 #endif
02374   {
02375     sendUngridSubset(0, numSources-1);
02376   }
02377 
02378   grid_count = numSources;
02379   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02380 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2382 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02382                                                         {
02383 
02384 #ifdef NAMD_CUDA
02385   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02386 #else
02387   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02388 #endif
02389 
02390   for ( int j=first; j<=last; ++j ) {
02391     // int msglen = qgrid_len;
02392     PmeGridMsg *newmsg = gridmsg_reuse[j];
02393     int pe = newmsg->sourceNode;
02394     int zdim = myGrid.dim3;
02395     int flen = newmsg->len;
02396     int fstart = newmsg->start;
02397     int zlistlen = newmsg->zlistlen;
02398     int *zlist = newmsg->zlist;
02399     float *qmsg = newmsg->qgrid;
02400     for ( int g=0; g<numGrids; ++g ) {
02401       char *f = newmsg->fgrid + fgrid_len * g;
02402       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02403       for ( int i=0; i<flen; ++i ) {
02404         if ( f[i] ) {
02405           for ( int k=0; k<zlistlen; ++k ) {
02406             *(qmsg++) = q[zlist[k]];
02407           }
02408         }
02409         q += zdim;
02410       }
02411     }
02412     newmsg->sourceNode = myGridPe;
02413 
02414     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02415     CmiEnableUrgentSend(1);
02416 #ifdef NAMD_CUDA
02417     if ( offload ) {
02418       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02419     } else
02420 #endif
02421     pmeProxyDir[pe].recvUngrid(newmsg);
02422     CmiEnableUrgentSend(0);
02423   }
02424 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2180 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02180                                     {
02181 
02182   trans_count = numGridPes;
02183 
02184   { // send energy and virial
02185     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02186     for ( int g=0; g<numGrids; ++g ) {
02187       newmsg->evir[g] = recip_evir2[g];
02188     }
02189     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02190     CmiEnableUrgentSend(1);
02191     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02192     CmiEnableUrgentSend(0);
02193   }
02194 
02195 #if     CMK_SMP && USE_CKLOOP
02196   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02197   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02198     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02199   } else
02200 #endif
02201   {
02202     sendUntransSubset(0, numGridNodes-1);
02203   }
02204 
02205 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2207 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02207                                                          {
02208 
02209   int zdim = myGrid.dim3;
02210   int y_start = localInfo[myTransPe].y_start_after_transpose;
02211   int ny = localInfo[myTransPe].ny_after_transpose;
02212   int slicelen = myGrid.K2 * zdim;
02213 
02214   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02215 
02216 #if CMK_BLUEGENEL
02217   CmiNetworkProgressAfter (0);
02218 #endif
02219 
02220   // send data for reverse transpose
02221   for (int j=first; j<=last; j++) {
02222     int node = gridNodeOrder[j];  // different order on each node
02223     int pe = gridNodeInfo[node].pe_start;
02224     int npe = gridNodeInfo[node].npe;
02225     int totlen = 0;
02226     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02227       LocalPmeInfo &li = localInfo[pe];
02228       int cpylen = li.nx * zdim;
02229       totlen += cpylen;
02230     }
02231     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02232     newmsg->sourceNode = myTransPe;
02233     newmsg->y_start = y_start;
02234     newmsg->ny = ny;
02235     for ( int g=0; g<numGrids; ++g ) {
02236       float *qmsg = newmsg->qgrid + ny * totlen * g;
02237       pe = gridNodeInfo[node].pe_start;
02238       for (int i=0; i<npe; ++i, ++pe) {
02239         LocalPmeInfo &li = localInfo[pe];
02240         if ( node == myGridNode ) {
02241           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02242           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02243           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02244           int cpylen = ny * zdim;
02245           for ( int x = 0; x < li.nx; ++x ) {
02246             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02247             q += cpylen;
02248             qmsg += slicelen;
02249           }
02250         } else {
02251           CmiMemcpy((void*)qmsg,
02252                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02253                 li.nx*ny*zdim*sizeof(float));
02254           qmsg += li.nx*ny*zdim;
02255         }
02256       }
02257     }
02258     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02259     if ( node == myGridNode ) newmsg->ny = 0;
02260     if ( npe > 1 ) {
02261       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02262       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02263     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02264   }
02265 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4190 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchOn, NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04190                                      {
04191 
04192     SimParameters *simParams = Node::Object()->simParameters;
04193 
04194     for ( int g=0; g<numGrids; ++g ) {
04195       float scale = 1.;
04196       if (alchOn) {
04197         BigReal elecLambdaUp, elecLambdaDown;
04198         if( simParams->alchFepWhamOn ) {
04199           if( simParams->alchFepElecOn ) {
04200             elecLambdaUp = simParams->alchElecLambda;
04201             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04202           }
04203           else {
04204             elecLambdaUp = 0.0;
04205             elecLambdaDown = 1.0;
04206           }
04207         }
04208         else {
04209           // alchLambda set on each step in ComputePme::ungridForces()
04210           if ( alchLambda < 0 || alchLambda > 1 ) {
04211             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04212           }
04213           elecLambdaUp = simParams->getElecLambda(alchLambda);
04214           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04215         }
04216         if ( g == 0 ) scale = elecLambdaUp;
04217         else if ( g == 1 ) scale = elecLambdaDown;
04218         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04219         if (alchDecouple) {
04220           if ( g == 2 ) scale = 1-elecLambdaUp;
04221           else if ( g == 3 ) scale = 1-elecLambdaDown;
04222           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04223         }
04224       } else if ( lesOn ) {
04225         scale = 1.0 / lesFactor;
04226       } else if ( pairOn ) {
04227         scale = ( g == 0 ? 1. : -1. );
04228       }
04229       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04230       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04231       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04232       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04233       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04234       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04235       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04236       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04237       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04238       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04239 
04240       float scale2 = 0.;
04241 
04242       // why is this declared/defined again here?
04243       SimParameters *simParams = Node::Object()->simParameters;
04244 
04245       if (alchFepOn) {
04246         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04247         if(simParams->alchFepWhamOn) {
04248           if(simParams->alchFepElecOn) {
04249             elecLambda2Up = simParams->alchElecLambda;
04250             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04251           }
04252           else {
04253             elecLambda2Up = 0.0;
04254             elecLambda2Down =  1.0;
04255           }
04256         }
04257         else {
04258           elecLambda2Up = simParams->getElecLambda(simParams->alchLambda2);
04259           elecLambda2Down = simParams->getElecLambda(1.-simParams->alchLambda2);
04260         }
04261         
04262         if ( g == 0 ) scale2 = elecLambda2Up;
04263         else if ( g == 1 ) scale2 = elecLambda2Down;
04264         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04265         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04266         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04267         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04268       }
04269       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04270         if( g==0 )      scale2 = scale + 1.0;
04271         else if( g==1 ) scale2 = scale - 1.0;
04272         else if( g==2 ) scale2 = scale - 1.0;
04273         else if( g==3 ) scale2 = scale + 1.0;
04274       }
04275       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04276       
04277       if (alchThermIntOn) {
04278         
04279         // no decoupling:
04280         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04281         // (interactions between all atoms [partition 0 OR partition 1], 
04282         // minus all [within partition 0])
04283         // U = elecLambdaUp * (U[0] - U[2])
04284         // dU/dl = U[0] - U[2];
04285         
04286         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04287         // (interactions between all atoms [partition 0 OR partition 2], 
04288         // minus all [within partition 0])
04289         // U = elecLambdaDown * (U[1] - U[2])
04290         // dU/dl = U[1] - U[2];
04291 
04292         // alchDecouple:
04293         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04294         // (interactions between all atoms [partition 0 OR partition 1]
04295         // minus all [within partition 1] minus all [within partition 0]
04296         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04297         // dU/dl = U[0] - U[2] - U[4];
04298 
04299         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04300         // (interactions between all atoms [partition 0 OR partition 2]
04301         // minus all [within partition 2] minus all [within partition 0]
04302         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04303         // dU/dl = U[1] - U[3] - U[4];
04304         
04305         
04306         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04307         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04308         if (!alchDecouple) {
04309           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04310           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04311         }
04312         else {  // alchDecouple
04313           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04314           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04315           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04316           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04317         }
04318       }
04319     }
04320 
04321     alchLambda = -1.;  // illegal value to catch if not updated
04322 
04323     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04324     reduction->submit();
04325 
04326   for ( int i=0; i<heldComputes.size(); ++i ) {
04327     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04328   }
04329   heldComputes.resize(0);
04330 }
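
The alchemical bookkeeping in the loop above can be summarized in one place. Writing U_g for the raw slow electrostatic energy of grid g (evir[g][0]) and lambda_up, lambda_down for elecLambda2Up and elecLambda2Down, the energy accumulated into REDUCTION_ELECT_ENERGY_SLOW_F and the TI derivatives spelled out in the in-code comments combine as follows (the alchFepWhamOn branch substitutes its own factors):

  U_{\mathrm{FEP}} = \lambda_{\mathrm{up}} U_0 + \lambda_{\mathrm{down}} U_1 - (\lambda_{\mathrm{up}} + \lambda_{\mathrm{down}} - 1)\,U_2                                   (no decoupling)
  U_{\mathrm{FEP}} = \lambda_{\mathrm{up}} U_0 + \lambda_{\mathrm{down}} U_1 + (1-\lambda_{\mathrm{up}})\,U_2 + (1-\lambda_{\mathrm{down}})\,U_3 - (\lambda_{\mathrm{up}} + \lambda_{\mathrm{down}} - 1)\,U_4   (alchDecouple)

  \partial U/\partial\lambda_{\mathrm{up}} = U_0 - U_2, \qquad \partial U/\partial\lambda_{\mathrm{down}} = U_1 - U_2                                                       (no decoupling)
  \partial U/\partial\lambda_{\mathrm{up}} = U_0 - U_2 - U_4, \qquad \partial U/\partial\lambda_{\mathrm{down}} = U_1 - U_3 - U_4                                           (alchDecouple)

Here, without decoupling, U_2 is the intra-partition-0 energy; with alchDecouple, U_2 and U_3 are the intra-partition-1 and intra-partition-2 energies and U_4 the intra-partition-0 energy, exactly as listed in the comments. The TI branch accumulates these derivative terms into REDUCTION_ELECT_ENERGY_PME_TI_1 and REDUCTION_ELECT_ENERGY_PME_TI_2.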

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2510 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02510                                    {
02511   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02512 
02513   ungridForcesCount = pmeComputes.size();
02514 
02515 #ifdef NAMD_CUDA
02516  if ( offload ) {
02517   //CmiLock(cuda_lock);
02518   cudaSetDevice(deviceCUDA->getDeviceID());
02519 
02520   if ( this == masterPmeMgr ) {
02521     double before = CmiWallTimer();
02522     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02523     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02524     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02525 
02526     const int myrank = CkMyRank();
02527     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02528       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02529         nodePmeMgr->mgrObjects[i]->ungridCalc();
02530       }
02531     }
02532     if ( ! pmeComputes.size() ) return;
02533   }
02534 
02535   if ( ! end_forces ) {
02536     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02537     end_forces = new cudaEvent_t[n];
02538     for ( int i=0; i<n; ++i ) {
02539       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02540     }
02541   }
02542 
02543   const int pcsz = pmeComputes.size();
02544   if ( ! afn_host ) {
02545     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02546     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02547     cuda_errcheck("malloc params for pme");
02548   }
02549   int totn = 0;
02550   for ( int i=0; i<pcsz; ++i ) {
02551     int n = pmeComputes[i]->numGridAtoms[0];
02552     totn += n;
02553   }
02554   if ( totn > f_data_mgr_alloc ) {
02555     if ( f_data_mgr_alloc ) {
02556       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02557       cudaFree(f_data_mgr_dev);
02558       cudaFreeHost(f_data_mgr_host);
02559     }
02560     f_data_mgr_alloc = 1.2 * (totn + 100);
02561     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02562     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02563     cuda_errcheck("malloc forces for pme");
02564   }
02565   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02566   float *f_dev = f_data_mgr_dev;
02567   float *f_host = f_data_mgr_host;
02568   for ( int i=0; i<pcsz; ++i ) {
02569     int n = pmeComputes[i]->numGridAtoms[0];
02570     pmeComputes[i]->f_data_dev = f_dev;
02571     pmeComputes[i]->f_data_host = f_host;
02572     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02573     afn_host[3*i+1] = f_dev;
02574     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02575     f_dev += 3*n;
02576     f_host += 3*n;
02577   }
02578   //CmiLock(cuda_lock);
02579   double before = CmiWallTimer();
02580   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02581   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02582   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02583   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02584 
02585   for ( int i=0; i<pcsz; ++i ) {
02586     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02587     if ( i%EVENT_STRIDE == 0 ) {
02588       int dimy = pcsz - i;
02589       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02590       int maxn = 0;
02591       int subtotn = 0;
02592       for ( int j=0; j<dimy; ++j ) {
02593         int n = pmeComputes[i+j]->numGridAtoms[0];
02594         subtotn += n;
02595         if ( n > maxn ) maxn = n;
02596       }
02597       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02598       before = CmiWallTimer();
02599       cuda_pme_forces(
02600         bspline_coeffs_dev,
02601         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02602         pmeComputes[i]->a_data_dev,
02603         pmeComputes[i]->f_data_dev,
02604         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02605         streams[stream]);
02606       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02607       before = CmiWallTimer();
02608       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02609         cudaMemcpyDeviceToHost, streams[stream]);
02610       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02611       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02612       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02613     }
02614     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02615   }
02616   //CmiUnlock(cuda_lock);
02617  } else
02618 #endif // NAMD_CUDA
02619  {
02620   for ( int i=0; i<pmeComputes.size(); ++i ) {
02621     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02622     // pmeComputes[i]->ungridForces();
02623   }
02624  }
02625   // submitReductions();  // must follow all ungridForces()
02626 
02627 #ifdef NAMD_CUDA
02628  if ( offload ) {
02629   forces_time = CmiWallTimer();
02630   forces_count = ungridForcesCount;
02631   forces_done_count = 0;
02632   pmeProxy[this_pe].pollForcesReady();
02633  }
02634 #endif
02635 
02636   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02637 }
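
In the offload branch above, force kernels are issued in batches of EVENT_STRIDE computes: each batch launches cuda_pme_forces(), queues the asynchronous device-to-host copy of its block of forces, and records one entry of end_forces[] so completion can later be polled rather than synchronized. A minimal standalone sketch of that batching pattern follows; launch_batch() and EVENT_STRIDE_SKETCH are hypothetical stand-ins for the kernel launch plus async copy and for the real EVENT_STRIDE constant.

// Sketch of the per-batch event pattern used by ungridCalc() when offloading.
// Assumptions: launch_batch() stands in for cuda_pme_forces() plus the
// device-to-host force copy; the real code also brackets trace events and
// waits on end_potential_memcpy before the first batch.
#include <cuda_runtime.h>
#include <vector>

static const int EVENT_STRIDE_SKETCH = 4;   // the real EVENT_STRIDE is defined in ComputePme.C

struct Batch { int first, count; };

void launch_batches(int numComputes, cudaStream_t stream,
                    std::vector<cudaEvent_t> &end_forces,
                    void (*launch_batch)(const Batch &, cudaStream_t)) {
  // one event per group of EVENT_STRIDE computes, created lazily as in ungridCalc()
  int nEvents = (numComputes - 1) / EVENT_STRIDE_SKETCH + 1;
  if ((int)end_forces.size() < nEvents) {
    end_forces.resize(nEvents);
    for (int i = 0; i < nEvents; ++i)
      cudaEventCreateWithFlags(&end_forces[i], cudaEventDisableTiming);
  }
  for (int i = 0; i < numComputes; i += EVENT_STRIDE_SKETCH) {
    Batch b;
    b.first = i;
    b.count = (numComputes - i < EVENT_STRIDE_SKETCH) ? numComputes - i
                                                      : EVENT_STRIDE_SKETCH;
    launch_batch(b, stream);                                         // kernel + async copy for this batch
    cudaEventRecord(end_forces[i / EVENT_STRIDE_SKETCH], stream);    // marks this batch as done on the stream
  }
}

Recording one event per batch rather than one per compute keeps the number of host-side queries small while still letting forces stream back to the host incrementally.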


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 357 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 358 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 419 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 418 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 444 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 430 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 432 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 423 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 442 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 441 of file ComputePme.C.

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 426 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 427 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 421 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 420 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 414 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().

int ComputePmeMgr::forces_count

Definition at line 428 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 429 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 431 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().
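
Together with end_forces and forces_count, forces_done_count and forces_time support a non-blocking completion check: ungridCalc() records one event per batch and arms the counters, and pollForcesReady()/cuda_check_pme_forces() then poll until the outstanding work has drained. The following is a minimal sketch of that polling idiom under the assumption that events are queried in order; poll_force_events() is an illustrative helper, not the actual body of cuda_check_pme_forces().

// Hypothetical poll loop over the per-batch events recorded in ungridCalc().
// Query the next outstanding event; if it is not ready yet, return and let
// the caller retry later instead of blocking the host.
#include <cuda_runtime.h>

bool poll_force_events(cudaEvent_t *end_forces, int num_events, int &next_event) {
  while (next_event < num_events) {
    cudaError_t err = cudaEventQuery(end_forces[next_event]);
    if (err == cudaErrorNotReady)
      return false;                 // batch not finished; poll again later
    // (a real implementation would route other error codes through cuda_errcheck())
    ++next_event;
  }
  return true;                      // all batches complete; forces can be delivered
}

A not-ready event simply returns control to the scheduler so the check can be retried later, avoiding a blocking cudaEventSynchronize() on the host.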

int ComputePmeMgr::master_pe

Definition at line 434 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 415 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 447 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 448 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 373 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 370 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 371 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 372 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 435 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file: ComputePme.C
Generated on Thu Feb 22 01:17:18 2018 for NAMD by doxygen 1.4.7