ComputePmeMgr Class Reference

Inheritance diagram for ComputePmeMgr: inherits ComputePmeUtil.

Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 355 of file ComputePme.C.


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 710 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00710                              : pmeProxy(thisgroup), 
00711                                  pmeProxyDir(thisgroup) {
00712 
00713   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00714   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00715   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00716 
00717   pmeNodeProxy.ckLocalBranch()->initialize();
00718 
00719   if ( CmiMyRank() == 0 ) {
00720     fftw_plan_lock = CmiCreateLock();
00721   }
00722   pmemgr_lock = CmiCreateLock();
00723 
00724   myKSpace = 0;
00725   kgrid = 0;
00726   work = 0;
00727   grid_count = 0;
00728   trans_count = 0;
00729   untrans_count = 0;
00730   ungrid_count = 0;
00731   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00732   useBarrier = 0;
00733   sendTransBarrier_received = 0;
00734   usePencils = 0;
00735 
00736 #ifdef NAMD_CUDA
00737  // offload has not been set so this happens on every run
00738   if ( CmiMyRank() == 0 ) {
00739     cuda_lock = CmiCreateLock();
00740   }
00741 
00742 #if CUDA_VERSION >= 5050
00743   int leastPriority, greatestPriority;
00744   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00745   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00746   //if ( CkMyNode() == 0 ) {
00747   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00748   //}
00749 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00750 #else
00751 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00752 #endif
00753 
00754   stream = 0;
00755   for ( int i=0; i<NUM_STREAMS; ++i ) {
00756 #if 1
00757     CUDA_STREAM_CREATE(&streams[i]);
00758     cuda_errcheck("cudaStreamCreate");
00759 #else
00760   streams[i] = 0;  // XXXX Testing!!!
00761 #endif
00762   }
00763 
00764   this_pe = CkMyPe();
00765  
00766   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00767   end_forces = 0;
00768   check_charges_count = 0;
00769   check_forces_count = 0;
00770   chargeGridSubmittedCount = 0;
00771 
00772   cuda_atoms_count = 0;
00773   cuda_atoms_alloc = 0;
00774 
00775   f_data_mgr_alloc = 0;
00776   f_data_mgr_host = 0;
00777   f_data_mgr_dev = 0;
00778   afn_host = 0;
00779   afn_dev = 0;
00780 
00781 #define CUDA_EVENT_ID_PME_CHARGES 80
00782 #define CUDA_EVENT_ID_PME_FORCES 81
00783 #define CUDA_EVENT_ID_PME_TICK 82
00784 #define CUDA_EVENT_ID_PME_COPY 83
00785 #define CUDA_EVENT_ID_PME_KERNEL 84
00786   if ( 0 == CkMyPe() ) {
00787     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00788     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00789     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00790     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00791     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00792   }
00793 #endif
00794   recipEvirCount = 0;
00795   recipEvirClients = 0;
00796   recipEvirPe = -999;
00797 }
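
The stream setup above selects the highest available stream priority when built against CUDA 5.5 or later, and creates the end_charges event with timing disabled. A self-contained sketch of that pattern follows; the stream count and names are illustrative, not NAMD's NUM_STREAMS or streams[], and the sketch checks CUDART_VERSION from the runtime header rather than CUDA_VERSION.

// Sketch: create streams at the highest available priority when supported,
// plus a timing-disabled event, mirroring the constructor above.
#include <cuda_runtime.h>

int main() {
  const int kNumStreams = 4;                 // illustrative, not NAMD's NUM_STREAMS
  cudaStream_t streams[kNumStreams];

#if CUDART_VERSION >= 5050
  int leastPriority, greatestPriority;       // lower number = higher priority
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  for (int i = 0; i < kNumStreams; ++i)
    cudaStreamCreateWithPriority(&streams[i], cudaStreamDefault, greatestPriority);
#else
  for (int i = 0; i < kNumStreams; ++i)
    cudaStreamCreate(&streams[i]);
#endif

  cudaEvent_t end_charges;                   // timing disabled, as in the constructor
  cudaEventCreateWithFlags(&end_charges, cudaEventDisableTiming);

  cudaEventDestroy(end_charges);
  for (int i = 0; i < kNumStreams; ++i) cudaStreamDestroy(streams[i]);
  return 0;
}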

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1794 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01794                               {
01795 
01796   if ( CmiMyRank() == 0 ) {
01797     CmiDestroyLock(fftw_plan_lock);
01798   }
01799   CmiDestroyLock(pmemgr_lock);
01800 
01801   delete myKSpace;
01802   delete [] localInfo;
01803   delete [] gridNodeInfo;
01804   delete [] transNodeInfo;
01805   delete [] gridPeMap;
01806   delete [] transPeMap;
01807   delete [] recipPeDest;
01808   delete [] gridPeOrder;
01809   delete [] gridNodeOrder;
01810   delete [] transNodeOrder;
01811   delete [] qgrid;
01812   if ( kgrid != qgrid ) delete [] kgrid;
01813   delete [] work;
01814   delete [] gridmsg_reuse;
01815 
01816  if ( ! offload ) {
01817   for (int i=0; i<q_count; ++i) {
01818     delete [] q_list[i];
01819   }
01820   delete [] q_list;
01821   delete [] fz_arr;
01822  }
01823   delete [] f_arr;
01824   delete [] q_arr;
01825 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1788 of file ComputePme.C.

01788                                                  {
01789   if ( ! usePencils ) return;
01790   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01791 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3019 of file ComputePme.C.

03019                                        {
03020   ++recipEvirClients;
03021 }

void ComputePmeMgr::chargeGridReady ( Lattice & lattice, int sequence )

Definition at line 3538 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03538                                                                   {
03539 
03540 #ifdef NAMD_CUDA
03541  if ( offload ) {
03542   int errcount = 0;
03543   int q_stride = myGrid.K3+myGrid.order-1;
03544   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03545     f_arr[j] = ffz_host[j];
03546     if ( ffz_host[j] & ~1 ) ++errcount;
03547   }
03548   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03549  }
03550 #endif
03551   recipEvirCount = recipEvirClients;
03552   ungridForcesCount = pmeComputes.size();
03553 
03554   for (int j=0; j<myGrid.order-1; ++j) {
03555     fz_arr[j] |= fz_arr[myGrid.K3+j];
03556   }
03557 
03558   if ( usePencils ) {
03559     sendPencils(lattice,sequence);
03560   } else {
03561     sendData(lattice,sequence);
03562   }
03563 }
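
The final loop above folds the (order-1) periodic wrap-around planes in z back onto the start of fz_arr before the grid is shipped to the slab or pencil objects. A minimal sketch of that folding on a padded flag array; the grid size and spline order here are made up for illustration.

// Sketch: fold the (order-1) wrap-around z planes of a padded flag array
// back onto the first planes, as chargeGridReady() does for fz_arr.
#include <vector>
#include <cstdio>

int main() {
  const int K3 = 8, order = 4;               // illustrative grid size / spline order
  std::vector<char> fz(K3 + order - 1, 0);   // K3 real planes plus (order-1) padding
  fz[K3 + 1] = 1;                            // a flag that landed in the padding

  for (int j = 0; j < order - 1; ++j)
    fz[j] |= fz[K3 + j];                     // wrap the padding back onto plane j

  std::printf("plane 1 flag after folding: %d\n", (int)fz[1]);   // prints 1
  return 0;
}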

void ComputePmeMgr::chargeGridSubmitted ( Lattice & lattice, int sequence )

Definition at line 3474 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03474                                                                       {
03475   saved_lattice = &lattice;
03476   saved_sequence = sequence;
03477 
03478   // cudaDeviceSynchronize();  //  XXXX TESTING
03479   //int q_stride = myGrid.K3+myGrid.order-1;
03480   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03481   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03482   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03483   //  }
03484   //}
03485   //CmiLock(cuda_lock);
03486 
03487  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03488   double before = CmiWallTimer();
03489   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03490   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03491   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03492                         cudaMemcpyDeviceToHost, streams[stream]);
03493   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03494   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03495   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03496   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03497   //CmiUnlock(cuda_lock);
03498   // cudaDeviceSynchronize();  //  XXXX TESTING
03499   // cuda_errcheck("after memcpy grid to host");
03500 
03501   SimParameters *simParams = Node::Object()->simParameters;
03502   if ( ! simParams->useCUDA2 ) {
03503     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03504     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03505   }
03506 
03507   pmeProxy[master_pe].pollChargeGridReady();
03508  }
03509 }
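
The block above chains work with CUDA events: record an event when the last charge kernel on any stream finishes, make the copy stream wait on it, start the asynchronous device-to-host copy of the grid, record end_charges for the host to poll, and finally clear the device grid for the next step. A self-contained sketch of that record / wait / copy / memset ordering; the buffer names and sizes are placeholders rather than the q_data_* members used above.

// Sketch: event-ordered async copy-back followed by a memset for the next step,
// in the spirit of the record/wait/memcpy/memset chain in chargeGridSubmitted().
#include <cuda_runtime.h>
#include <vector>

int main() {
  const size_t bytes = 1 << 20;                    // placeholder grid size
  float *d_grid;  cudaMalloc(&d_grid, bytes);
  std::vector<float> h_grid(bytes / sizeof(float));

  cudaStream_t compute, copy;
  cudaStreamCreate(&compute);
  cudaStreamCreate(&copy);

  cudaEvent_t kernels_done, charges_on_host;
  cudaEventCreateWithFlags(&kernels_done, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&charges_on_host, cudaEventDisableTiming);

  // ... charge-spreading kernels would be launched on `compute` here ...

  cudaEventRecord(kernels_done, compute);          // all kernels complete
  cudaStreamWaitEvent(copy, kernels_done, 0);      // copy stream waits for them
  cudaMemcpyAsync(h_grid.data(), d_grid, bytes, cudaMemcpyDeviceToHost, copy);
  cudaEventRecord(charges_on_host, copy);          // host polls this like end_charges
  cudaMemsetAsync(d_grid, 0, bytes, copy);         // zero the grid for the next step

  cudaEventSynchronize(charges_on_host);           // here: just wait, then use h_grid

  cudaFree(d_grid);
  cudaStreamDestroy(compute);  cudaStreamDestroy(copy);
  cudaEventDestroy(kernels_done);  cudaEventDestroy(charges_on_host);
  return 0;
}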

void ComputePmeMgr::copyPencils ( PmeGridMsg * msg )

Definition at line 3784 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03784                                                {
03785 
03786   int K1 = myGrid.K1;
03787   int K2 = myGrid.K2;
03788   int dim2 = myGrid.dim2;
03789   int dim3 = myGrid.dim3;
03790   int block1 = myGrid.block1;
03791   int block2 = myGrid.block2;
03792 
03793   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03794   int ib = msg->sourceNode / yBlocks;
03795   int jb = msg->sourceNode % yBlocks;
03796 
03797   int ibegin = ib*block1;
03798   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03799   int jbegin = jb*block2;
03800   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03801 
03802   int zlistlen = msg->zlistlen;
03803   int *zlist = msg->zlist;
03804   float *qmsg = msg->qgrid;
03805   int g;
03806   for ( g=0; g<numGrids; ++g ) {
03807     char *f = f_arr + g*fsize;
03808     float **q = q_arr + g*fsize;
03809     for ( int i=ibegin; i<iend; ++i ) {
03810      for ( int j=jbegin; j<jend; ++j ) {
03811       if( f[i*dim2+j] ) {
03812         f[i*dim2+j] = 0;
03813         for ( int k=0; k<zlistlen; ++k ) {
03814           q[i*dim2+j][zlist[k]] = *(qmsg++);
03815         }
03816         for (int h=0; h<myGrid.order-1; ++h) {
03817           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03818         }
03819       }
03820      }
03821     }
03822   }
03823 }
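
The receive side recovers the sending pencil's (x,y) block from the flattened sourceNode id (packed as x*yBlocks + y, per the comment in the code) and clips the block extents to the grid. A tiny sketch of that decode and clipping with made-up values:

// Sketch: recover a pencil's (ib, jb) block indices from the flattened
// sourceNode id and clip the block ranges to the grid, as copyPencils() does.
#include <algorithm>
#include <cstdio>

int main() {
  const int K1 = 48, K2 = 50;            // illustrative grid dimensions
  const int yBlocks = 4;
  const int block1 = 13, block2 = 13;    // per-pencil block sizes (ceiling divisions)
  const int sourceNode = 7;              // packed by the sender as ib*yBlocks + jb

  int ib = sourceNode / yBlocks;         // x block index (= 1)
  int jb = sourceNode % yBlocks;         // y block index (= 3)

  int ibegin = ib * block1, iend = std::min(ibegin + block1, K1);
  int jbegin = jb * block2, jend = std::min(jbegin + block2, K2);

  std::printf("x range [%d,%d)  y range [%d,%d)\n", ibegin, iend, jbegin, jend);
  return 0;
}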

void ComputePmeMgr::copyResults ( PmeGridMsg * msg )

Definition at line 3976 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03976                                                {
03977 
03978   int zdim = myGrid.dim3;
03979   int flen = msg->len;
03980   int fstart = msg->start;
03981   int zlistlen = msg->zlistlen;
03982   int *zlist = msg->zlist;
03983   float *qmsg = msg->qgrid;
03984   int g;
03985   for ( g=0; g<numGrids; ++g ) {
03986     char *f = msg->fgrid + g*flen;
03987     float **q = q_arr + fstart + g*fsize;
03988     for ( int i=0; i<flen; ++i ) {
03989       if ( f[i] ) {
03990         f[i] = 0;
03991         for ( int k=0; k<zlistlen; ++k ) {
03992           q[i][zlist[k]] = *(qmsg++);
03993         }
03994         for (int h=0; h<myGrid.order-1; ++h) {
03995           q[i][myGrid.K3+h] = q[i][h];
03996         }
03997       }
03998     }
03999   }
04000 }
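
Both copy routines unpack a compressed message: only grid columns whose flag byte is set carry data, and within a flagged column only the z planes listed in zlist are present. A minimal sketch of that flag-plus-index-list scatter; all sizes and values below are made up.

// Sketch: scatter a packed message into a padded grid using per-column flags
// and a shared z-index list, in the spirit of copyResults()/copyPencils().
#include <vector>
#include <cstdio>

int main() {
  const int ncols = 4, zdim = 10;                   // illustrative sizes
  std::vector<char>  flag  = {1, 0, 1, 0};          // columns covered by the message
  std::vector<int>   zlist = {2, 5, 7};             // z planes actually present
  std::vector<float> qmsg  = {1, 2, 3, 4, 5, 6};    // packed: 3 values per flagged column

  std::vector<std::vector<float>> q(ncols, std::vector<float>(zdim, 0.0f));

  const float *p = qmsg.data();
  for (int i = 0; i < ncols; ++i) {
    if (!flag[i]) continue;
    flag[i] = 0;                                    // consume the flag, as the code does
    for (int k = 0; k < (int)zlist.size(); ++k)
      q[i][zlist[k]] = *p++;                        // scatter into the listed z planes
  }

  std::printf("q[2][5] = %g\n", q[2][5]);           // prints 5
  return 0;
}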

void ComputePmeMgr::cuda_submit_charges ( Lattice & lattice, int sequence )

Definition at line 3419 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03419                                                                       {
03420 
03421     int n = cuda_atoms_count;
03422     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03423     cuda_atoms_count = 0;
03424 
03425     const double before = CmiWallTimer();
03426     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03427                           cudaMemcpyHostToDevice, streams[stream]);
03428     const double after = CmiWallTimer();
03429 
03430     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03431 
03432     cuda_pme_charges(
03433       bspline_coeffs_dev,
03434       q_arr_dev, ffz_dev, ffz_dev + fsize,
03435       a_data_dev, n,
03436       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03437       streams[stream]);
03438     const double after2 = CmiWallTimer();
03439 
03440     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03441 
03442     masterPmeMgr->charges_time = before;
03443     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03444     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03445 }

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg * msg )

Definition at line 2014 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02014                                                    {
02015   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02016   int pe = transNodeInfo[myTransNode].pe_start;
02017   int npe = transNodeInfo[myTransNode].npe;
02018   CmiNodeLock lock = CmiCreateLock();
02019   int *count = new int; *count = npe;
02020   for (int i=0; i<npe; ++i, ++pe) {
02021     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02022     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02023     shmsg->msg = msg;
02024     shmsg->count = count;
02025     shmsg->lock = lock;
02026     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02027   }
02028 }
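
fwdSharedTrans hands a single heap-allocated message to every PE on the node together with a freshly created node lock and a shared counter; whichever receiver decrements the counter to zero performs the cleanup. A plain-threads sketch of that last-one-out-frees pattern, with std::thread and std::mutex standing in for Charm++ messages and CmiNodeLock (and, as a simplification, the lock owned by main rather than shipped with the message):

// Sketch: share one heap-allocated payload among N node-local consumers with a
// lock-guarded reference count; the last consumer frees it, as in
// fwdSharedTrans() / recvSharedTrans().
#include <mutex>
#include <thread>
#include <vector>
#include <cstdio>

struct Payload { int data = 42; };

int main() {
  const int npe = 4;                      // number of node-local consumers
  Payload *msg   = new Payload;           // shared message handed to every consumer
  int     *count = new int(npe);          // consumers still holding the message
  std::mutex lock;                        // guards the shared counter

  std::vector<std::thread> pes;
  for (int i = 0; i < npe; ++i) {
    pes.emplace_back([&lock, msg, count] {
      // ... each consumer would read msg->data here ...
      bool last;
      {
        std::lock_guard<std::mutex> guard(lock);
        last = (--(*count) == 0);
      }
      if (last) {                         // last one out frees the shared state
        delete msg;
        delete count;
        std::printf("shared message freed by last consumer\n");
      }
    });
  }
  for (auto &t : pes) t.join();
  return 0;
}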

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg * msg )

Definition at line 2270 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02270                                                        {
02271   int pe = gridNodeInfo[myGridNode].pe_start;
02272   int npe = gridNodeInfo[myGridNode].npe;
02273   CmiNodeLock lock = CmiCreateLock();
02274   int *count = new int; *count = npe;
02275   for (int i=0; i<npe; ++i, ++pe) {
02276     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02277     shmsg->msg = msg;
02278     shmsg->count = count;
02279     shmsg->lock = lock;
02280     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02281   }
02282 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1906 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

Referenced by recvGrid().

01906                                   {
01907   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01908 
01909 #ifdef NAMD_FFTW
01910   for ( int g=0; g<numGrids; ++g ) {
01911 #ifdef NAMD_FFTW_3
01912     fftwf_execute(forward_plan_yz[g]);
01913 #else
01914     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01915         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01916 #endif
01917 
01918   }
01919 #endif
01920 
01921   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01922 }
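
With FFTW3 the y-z stage executes a pre-planned in-place real-to-complex transform over the local x slab; the padded third dimension (dim3 = 2*(K3/2 + 1)) leaves room for the complex output. A minimal single-slice sketch using plain FFTW3 calls; the grid sizes are illustrative and one 2-D transform is planned here instead of NAMD's batched fftwf_plan_many variants. Link with -lfftw3f.

// Sketch: in-place 2-D real-to-complex FFT over a padded y-z slice, the basic
// operation behind gridCalc1()'s forward_plan_yz (and gridCalc3()'s inverse).
#include <fftw3.h>

int main() {
  const int K2 = 20, K3 = 18;                  // illustrative grid dimensions
  const int dim3 = 2 * (K3 / 2 + 1);           // padded z dimension for in-place r2c

  float *q = (float *) fftwf_malloc(sizeof(float) * K2 * dim3);
  for (int i = 0; i < K2 * dim3; ++i) q[i] = 0.0f;   // charge grid would go here

  fftwf_plan fwd = fftwf_plan_dft_r2c_2d(K2, K3, q, (fftwf_complex *) q,
                                         FFTW_ESTIMATE);
  fftwf_plan bwd = fftwf_plan_dft_c2r_2d(K2, K3, (fftwf_complex *) q, q,
                                         FFTW_ESTIMATE);

  fftwf_execute(fwd);                          // forward y-z transform (gridCalc1)
  // ... reciprocal-space work on the K2 x (K3/2+1) complex values ...
  fftwf_execute(bwd);                          // backward transform (gridCalc3);
                                               // FFTW leaves it scaled by K2*K3

  fftwf_destroy_plan(fwd);
  fftwf_destroy_plan(bwd);
  fftwf_free(q);
  return 0;
}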

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2082 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02082                                   {
02083   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02084 
02085 #if CMK_BLUEGENEL
02086   CmiNetworkProgressAfter (0);
02087 #endif
02088 
02089   int zdim = myGrid.dim3;
02090   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02091   int ny = localInfo[myTransPe].ny_after_transpose;
02092 
02093   for ( int g=0; g<numGrids; ++g ) {
02094     // finish forward FFT (x dimension)
02095 #ifdef NAMD_FFTW
02096 #ifdef NAMD_FFTW_3
02097     fftwf_execute(forward_plan_x[g]);
02098 #else
02099     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02100         ny * zdim / 2, 1, work, 1, 0);
02101 #endif
02102 #endif
02103   }
02104 
02105 #ifdef OPENATOM_VERSION
02106     if ( ! simParams -> openatomOn ) { 
02107 #endif // OPENATOM_VERSION
02108       gridCalc2R();
02109 #ifdef OPENATOM_VERSION
02110     } else {
02111       gridCalc2Moa();
02112     }
02113 #endif // OPENATOM_VERSION
02114 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2142 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02142                                    {
02143 
02144   int useCkLoop = 0;
02145 #if CMK_SMP && USE_CKLOOP
02146   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02147        && CkNumPes() >= 2 * numTransPes ) {
02148     useCkLoop = 1;
02149   }
02150 #endif
02151 
02152   int zdim = myGrid.dim3;
02153   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02154   int ny = localInfo[myTransPe].ny_after_transpose;
02155 
02156   for ( int g=0; g<numGrids; ++g ) {
02157     // reciprocal space portion of PME
02158     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02159     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02160                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02161     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02162 
02163     // start backward FFT (x dimension)
02164 
02165 #ifdef NAMD_FFTW
02166 #ifdef NAMD_FFTW_3
02167     fftwf_execute(backward_plan_x[g]);
02168 #else
02169     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02170         ny * zdim / 2, 1, work, 1, 0);
02171 #endif
02172 #endif
02173   }
02174   
02175   pmeProxyDir[CkMyPe()].sendUntrans();
02176 }
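
For reference, the quantity compute_energy() returns per grid is the reciprocal-space sum of smooth PME. The textbook form (Essmann et al., J. Chem. Phys. 103, 8577, 1995) is reproduced below as a guide; it is not transcribed from PmeKSpace itself:

E_{\mathrm{rec}} \;=\; \frac{1}{2\pi V} \sum_{\mathbf{m}\neq 0}
    \frac{\exp\!\left(-\pi^2 \mathbf{m}^2 / \beta^2\right)}{\mathbf{m}^2}\,
    B(m_1,m_2,m_3)\,\bigl|F(Q)(m_1,m_2,m_3)\bigr|^2

where V is the unit-cell volume, beta is the Ewald coefficient (ewaldcof above), B collects the B-spline Euler factors, and F(Q) is the discrete Fourier transform of the interpolated charge grid, i.e. the kgrid data this routine operates on.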

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2344 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02344                                   {
02345   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02346 
02347   // finish backward FFT
02348 #ifdef NAMD_FFTW
02349   for ( int g=0; g<numGrids; ++g ) {
02350 #ifdef NAMD_FFTW_3
02351     fftwf_execute(backward_plan_yz[g]);
02352 #else
02353     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02354         (fftw_complex *) (qgrid + qgrid_size * g),
02355         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02356 #endif
02357   }
02358 
02359 #endif
02360 
02361   pmeProxyDir[CkMyPe()].sendUngrid();
02362 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 862 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00862                                            {
00863   delete msg;
00864 
00865   localInfo = new LocalPmeInfo[CkNumPes()];
00866   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00868   gridPeMap = new int[CkNumPes()];
00869   transPeMap = new int[CkNumPes()];
00870   recipPeDest = new int[CkNumPes()];
00871   gridPeOrder = new int[CkNumPes()];
00872   gridNodeOrder = new int[CkNumNodes()];
00873   transNodeOrder = new int[CkNumNodes()];
00874 
00875   if (CkMyRank() == 0) {
00876     pencilPMEProcessors = new char [CkNumPes()];
00877     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00878   }
00879 
00880   SimParameters *simParams = Node::Object()->simParameters;
00881   PatchMap *patchMap = PatchMap::Object();
00882 
00883   offload = simParams->PMEOffload;
00884 #ifdef NAMD_CUDA
00885   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00886     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00887   }
00888   if ( offload ) {
00889     int dev;
00890     cudaGetDevice(&dev);
00891     cuda_errcheck("in cudaGetDevice");
00892     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00893     cudaDeviceProp deviceProp;
00894     cudaGetDeviceProperties(&deviceProp, dev);
00895     cuda_errcheck("in cudaGetDeviceProperties");
00896     if ( deviceProp.major < 2 )
00897       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00898   }
00899 #endif
00900 
00901   alchLambda = -1.;  // illegal value to catch if not updated
00902   alchLambda2 = -1.;
00903   useBarrier = simParams->PMEBarrier;
00904 
00905   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00906   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00907   else {
00908     int nrps = simParams->PMEProcessors;
00909     if ( nrps <= 0 ) nrps = CkNumPes();
00910     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00911     int dimx = simParams->PMEGridSizeX;
00912     int dimy = simParams->PMEGridSizeY;
00913     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00914     if ( maxslabs > nrps ) maxslabs = nrps;
00915     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00916                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00917     if ( maxpencils > nrps ) maxpencils = nrps;
00918     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00919     else usePencils = 0;
00920   }
00921 
00922   if ( usePencils ) {
00923     int nrps = simParams->PMEProcessors;
00924     if ( nrps <= 0 ) nrps = CkNumPes();
00925     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00926     if ( simParams->PMEPencils > 1 &&
00927          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00928       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00929     } else {
00930       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00931                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00932       if ( nb2 > nrps ) nb2 = nrps;
00933       if ( nb2 < 1 ) nb2 = 1;
00934       int nb = (int) sqrt((float)nb2);
00935       if ( nb < 1 ) nb = 1;
00936       xBlocks = zBlocks = nb;
00937       yBlocks = nb2 / nb;
00938     }
00939 
00940     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00941     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00942     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00943 
00944     int dimx = simParams->PMEGridSizeX;
00945     int bx = 1 + ( dimx - 1 ) / xBlocks;
00946     xBlocks = 1 + ( dimx - 1 ) / bx;
00947 
00948     int dimy = simParams->PMEGridSizeY;
00949     int by = 1 + ( dimy - 1 ) / yBlocks;
00950     yBlocks = 1 + ( dimy - 1 ) / by;
00951 
00952     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00953     int bz = 1 + ( dimz - 1 ) / zBlocks;
00954     zBlocks = 1 + ( dimz - 1 ) / bz;
00955 
00956     if ( xBlocks * yBlocks > CkNumPes() ) {
00957       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00958     }
00959     if ( xBlocks * zBlocks > CkNumPes() ) {
00960       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00961     }
00962     if ( yBlocks * zBlocks > CkNumPes() ) {
00963       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00964     }
00965 
00966     if ( ! CkMyPe() ) {
00967       iout << iINFO << "PME using " << xBlocks << " x " <<
00968         yBlocks << " x " << zBlocks <<
00969         " pencil grid for FFT and reciprocal sum.\n" << endi;
00970     }
00971   } else { // usePencils
00972 
00973   {  // decide how many pes to use for reciprocal sum
00974 
00975     // rules based on work available
00976     int minslices = simParams->PMEMinSlices;
00977     int dimx = simParams->PMEGridSizeX;
00978     int nrpx = ( dimx + minslices - 1 ) / minslices;
00979     int dimy = simParams->PMEGridSizeY;
00980     int nrpy = ( dimy + minslices - 1 ) / minslices;
00981 
00982     // rules based on processors available
00983     int nrpp = CkNumPes();
00984     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00985     if ( nrpp < nrpx ) nrpx = nrpp;
00986     if ( nrpp < nrpy ) nrpy = nrpp;
00987 
00988     // user override
00989     int nrps = simParams->PMEProcessors;
00990     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00991     if ( nrps > 0 ) nrpx = nrps;
00992     if ( nrps > 0 ) nrpy = nrps;
00993 
00994     // make sure there aren't any totally empty processors
00995     int bx = ( dimx + nrpx - 1 ) / nrpx;
00996     nrpx = ( dimx + bx - 1 ) / bx;
00997     int by = ( dimy + nrpy - 1 ) / nrpy;
00998     nrpy = ( dimy + by - 1 ) / by;
00999     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01002       NAMD_bug("Error in selecting number of PME processors.");
01003 
01004     numGridPes = nrpx;
01005     numTransPes = nrpy;
01006   }
01007   if ( ! CkMyPe() ) {
01008     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01009       " processors for FFT and reciprocal sum.\n" << endi;
01010   }
01011 
01012   int sum_npes = numTransPes + numGridPes;
01013   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01014 
01015 #if 0 // USE_TOPOMAP
01016   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01017   PatchMap * pmap = PatchMap::Object();
01018   
01019   int patch_pes = pmap->numNodesWithPatches();
01020   TopoManager tmgr;
01021   if(tmgr.hasMultipleProcsPerNode())
01022     patch_pes *= 2;
01023 
01024   bool done = false;
01025   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01026     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01027     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01028   }
01029   else 
01030     if(CkNumPes() > 2 *max_npes + patch_pes) {
01031       done = generateBGLORBPmePeList(transPeMap, max_npes);
01032       gridPeMap = transPeMap;
01033     }
01034 
01035   if (!done)
01036 #endif
01037     {
01038       //generatePmePeList(transPeMap, max_npes);
01039       //gridPeMap = transPeMap;
01040       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01041     }
01042   
01043   if ( ! CkMyPe() ) {
01044     iout << iINFO << "PME GRID LOCATIONS:";
01045     int i;
01046     for ( i=0; i<numGridPes && i<10; ++i ) {
01047       iout << " " << gridPeMap[i];
01048     }
01049     if ( i < numGridPes ) iout << " ...";
01050     iout << "\n" << endi;
01051     iout << iINFO << "PME TRANS LOCATIONS:";
01052     for ( i=0; i<numTransPes && i<10; ++i ) {
01053       iout << " " << transPeMap[i];
01054     }
01055     if ( i < numTransPes ) iout << " ...";
01056     iout << "\n" << endi;
01057   }
01058 
01059   // sort based on nodes and physical nodes
01060   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01061 
01062   myGridPe = -1;
01063   myGridNode = -1;
01064   int i = 0;
01065   int node = -1;
01066   int real_node = -1;
01067   for ( i=0; i<numGridPes; ++i ) {
01068     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01069     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01070     int real_node_i = CkNodeOf(gridPeMap[i]);
01071     if ( real_node_i == real_node ) {
01072       gridNodeInfo[node].npe += 1;
01073     } else {
01074       real_node = real_node_i;
01075       ++node;
01076       gridNodeInfo[node].real_node = real_node;
01077       gridNodeInfo[node].pe_start = i;
01078       gridNodeInfo[node].npe = 1;
01079     }
01080     if ( CkMyNode() == real_node_i ) myGridNode = node;
01081   }
01082   numGridNodes = node + 1;
01083   myTransPe = -1;
01084   myTransNode = -1;
01085   node = -1;
01086   real_node = -1;
01087   for ( i=0; i<numTransPes; ++i ) {
01088     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01089     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01090     int real_node_i = CkNodeOf(transPeMap[i]);
01091     if ( real_node_i == real_node ) {
01092       transNodeInfo[node].npe += 1;
01093     } else {
01094       real_node = real_node_i;
01095       ++node;
01096       transNodeInfo[node].real_node = real_node;
01097       transNodeInfo[node].pe_start = i;
01098       transNodeInfo[node].npe = 1;
01099     }
01100     if ( CkMyNode() == real_node_i ) myTransNode = node;
01101   }
01102   numTransNodes = node + 1;
01103 
01104   if ( ! CkMyPe() ) {
01105     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01106          << numTransNodes << " TRANS NODES\n" << endi;
01107   }
01108 
01109   { // generate random orderings for grid and trans messages
01110     int i;
01111     for ( i = 0; i < numGridPes; ++i ) {
01112       gridPeOrder[i] = i;
01113     }
01114     Random rand(CkMyPe());
01115     if ( myGridPe < 0 ) {
01116       rand.reorder(gridPeOrder,numGridPes);
01117     } else {  // self last
01118       gridPeOrder[myGridPe] = numGridPes-1;
01119       gridPeOrder[numGridPes-1] = myGridPe;
01120       rand.reorder(gridPeOrder,numGridPes-1);
01121     } 
01122     for ( i = 0; i < numGridNodes; ++i ) {
01123       gridNodeOrder[i] = i;
01124     }
01125     if ( myGridNode < 0 ) {
01126       rand.reorder(gridNodeOrder,numGridNodes);
01127     } else {  // self last
01128       gridNodeOrder[myGridNode] = numGridNodes-1;
01129       gridNodeOrder[numGridNodes-1] = myGridNode;
01130       rand.reorder(gridNodeOrder,numGridNodes-1);
01131     }
01132     for ( i = 0; i < numTransNodes; ++i ) {
01133       transNodeOrder[i] = i;
01134     }
01135     if ( myTransNode < 0 ) {
01136       rand.reorder(transNodeOrder,numTransNodes);
01137     } else {  // self last
01138       transNodeOrder[myTransNode] = numTransNodes-1;
01139       transNodeOrder[numTransNodes-1] = myTransNode;
01140       rand.reorder(transNodeOrder,numTransNodes-1);
01141     }
01142   }
01143   
01144   } // ! usePencils
01145 
01146   myGrid.K1 = simParams->PMEGridSizeX;
01147   myGrid.K2 = simParams->PMEGridSizeY;
01148   myGrid.K3 = simParams->PMEGridSizeZ;
01149   myGrid.order = simParams->PMEInterpOrder;
01150   myGrid.dim2 = myGrid.K2;
01151   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01152 
01153   if ( ! usePencils ) {
01154     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01155     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01156     myGrid.block3 = myGrid.dim3 / 2;  // complex
01157   }
01158 
01159   if ( usePencils ) {
01160     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01161     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01162     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01163 
01164 
01165       int pe = 0;
01166       int x,y,z;
01167 
01168                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01169                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01170                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01171       
01172                 // decide which pes to use by bit reversal and patch use
01173                 int i;
01174                 int ncpus = CkNumPes();
01175                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01176                 PatchMap *pmap = PatchMap::Object();
01177                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01178                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01179                         if ( ri ) { // keep 0 for special case
01180                                 // pretend pe 1 has patches to avoid placing extra PME load on node
01181                                 if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
01182                                 else nopatches.add(ri);
01183                         }
01184                 }
01185 
01186 #if USE_RANDOM_TOPO
01187             Random rand(CkMyPe());
01188             int *tmp = new int[patches.size()];
01189             int nn = patches.size();
01190             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01191             rand.reorder(tmp, nn);
01192             patches.resize(0);
01193             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01194             delete [] tmp;
01195             tmp = new int[nopatches.size()];
01196             nn = nopatches.size();
01197             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01198             rand.reorder(tmp, nn);
01199             nopatches.resize(0);
01200             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01201             delete [] tmp;
01202 #endif
01203 
01204                 // only use zero if it eliminates overloading or has patches
01205                 int useZero = 0;
01206                 int npens = xBlocks*yBlocks;
01207                 if ( npens % ncpus == 0 ) useZero = 1;
01208                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01209                 npens += xBlocks*zBlocks;
01210                 if ( npens % ncpus == 0 ) useZero = 1;
01211                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01212                 npens += yBlocks*zBlocks;
01213                 if ( npens % ncpus == 0 ) useZero = 1;
01214                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01215 
01216                 // add nopatches then patches in reversed order
01217                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01218                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01219                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01220                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01221   
01222                 int npes = pmeprocs.size();
01223                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01224                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01225 #if !USE_RANDOM_TOPO
01226                 zprocs.sort();
01227 #endif
01228                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01229                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01230 #if !USE_RANDOM_TOPO
01231                 yprocs.sort();
01232 #endif
01233       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01234       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01235 #if !USE_RANDOM_TOPO
01236       xprocs.sort();
01237 #endif
01238 
01239 #if USE_TOPO_SFC
01240   CmiLock(tmgr_lock);
01241   //{
01242   TopoManager tmgr;
01243   int xdim = tmgr.getDimNX();
01244   int ydim = tmgr.getDimNY();
01245   int zdim = tmgr.getDimNZ();
01246   int xdim1 = find_level_grid(xdim);
01247   int ydim1 = find_level_grid(ydim);
01248   int zdim1 = find_level_grid(zdim);
01249   if(CkMyPe() == 0)
01250       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01251 
01252   vector<Coord> result;
01253   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01254   sort_sfc(xprocs, tmgr, result);
01255   sort_sfc(yprocs, tmgr, result);
01256   sort_sfc(zprocs, tmgr, result);
01257   //}
01258   CmiUnlock(tmgr_lock);
01259 #endif
01260 
01261 
01262                 if(CkMyPe() == 0){  
01263               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01264           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01265 #if USE_TOPO_SFC
01266               int x,y,z,t;
01267               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01268               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01269 #else
01270               iout << " " << zprocs[i];
01271 #endif
01272           }
01273           if ( i < zprocs.size() ) iout << " ...";
01274               iout << "\n" << endi;
01275                 }
01276 
01277     if (CkMyRank() == 0) {
01278       for (pe=0, x = 0; x < xBlocks; ++x)
01279         for (y = 0; y < yBlocks; ++y, ++pe ) {
01280           pencilPMEProcessors[zprocs[pe]] = 1;
01281         }
01282     }
01283      
01284                 if(CkMyPe() == 0){  
01285               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01286           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01287 #if USE_TOPO_SFC
01288               int x,y,z,t;
01289               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01290               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01291 #else
01292               iout << " " << yprocs[i];
01293 #endif
01294           }
01295           if ( i < yprocs.size() ) iout << " ...";
01296               iout << "\n" << endi;
01297                 }
01298 
01299     if (CkMyRank() == 0) {
01300       for (pe=0, z = 0; z < zBlocks; ++z )
01301         for (x = 0; x < xBlocks; ++x, ++pe ) {
01302           pencilPMEProcessors[yprocs[pe]] = 1;
01303         }
01304     }
01305     
01306                 if(CkMyPe() == 0){  
01307                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01308                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01309 #if USE_TOPO_SFC
01310                 int x,y,z,t;
01311                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01312                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01313 #else
01314                 iout << " " << xprocs[i];
01315 #endif
01316             }
01317                 if ( i < xprocs.size() ) iout << " ...";
01318                 iout << "\n" << endi;
01319                 }
01320 
01321     if (CkMyRank() == 0) {
01322       for (pe=0, y = 0; y < yBlocks; ++y )      
01323         for (z = 0; z < zBlocks; ++z, ++pe ) {
01324           pencilPMEProcessors[xprocs[pe]] = 1;
01325         }
01326     }
01327         
01328 
01329         // creating the pencil arrays
01330         if ( CkMyPe() == 0 ){
01331 #if !USE_RANDOM_TOPO
01332         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01333         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01334         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01335         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01336 #endif
01337 #if 1
01338         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01339         CProxy_PmePencilMap ym;
01340         if ( simParams->PMEPencilsYLayout )
01341           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01342         else
01343           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01344         CProxy_PmePencilMap xm;
01345         if ( simParams->PMEPencilsXLayout )
01346           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01347         else
01348           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01349         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01350         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01351         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01352         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01353         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01354         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01355         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01356         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01357         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01358         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01359 #else
01360         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01361         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01362         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01363 
01364                 for (pe=0, x = 0; x < xBlocks; ++x)
01365                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01366                                 zPencil(x,y,0).insert(zprocs[pe]);
01367                         }
01368         zPencil.doneInserting();
01369 
01370                 for (pe=0, x = 0; x < xBlocks; ++x)
01371                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01372                                 yPencil(x,0,z).insert(yprocs[pe]);
01373                         }
01374         yPencil.doneInserting();
01375 
01376 
01377                 for (pe=0, y = 0; y < yBlocks; ++y )    
01378                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01379                                 xPencil(0,y,z).insert(xprocs[pe]);
01380                         }
01381                 xPencil.doneInserting();     
01382 #endif
01383 
01384                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01385                 PmePencilInitMsgData msgdata;
01386                 msgdata.grid = myGrid;
01387                 msgdata.xBlocks = xBlocks;
01388                 msgdata.yBlocks = yBlocks;
01389                 msgdata.zBlocks = zBlocks;
01390                 msgdata.xPencil = xPencil;
01391                 msgdata.yPencil = yPencil;
01392                 msgdata.zPencil = zPencil;
01393                 msgdata.pmeProxy = pmeProxyDir;
01394         msgdata.pmeNodeProxy = pmeNodeProxy;
01395         msgdata.xm = xm;
01396         msgdata.ym = ym;
01397         msgdata.zm = zm;
01398                 xPencil.init(new PmePencilInitMsg(msgdata));
01399                 yPencil.init(new PmePencilInitMsg(msgdata));
01400                 zPencil.init(new PmePencilInitMsg(msgdata));
01401         }
01402 
01403     return;  // continue in initialize_pencils() at next startup stage
01404   }
01405 
01406 
01407   int pe;
01408   int nx = 0;
01409   for ( pe = 0; pe < numGridPes; ++pe ) {
01410     localInfo[pe].x_start = nx;
01411     nx += myGrid.block1;
01412     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01413     localInfo[pe].nx = nx - localInfo[pe].x_start;
01414   }
01415   int ny = 0;
01416   for ( pe = 0; pe < numTransPes; ++pe ) {
01417     localInfo[pe].y_start_after_transpose = ny;
01418     ny += myGrid.block2;
01419     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01420     localInfo[pe].ny_after_transpose =
01421                         ny - localInfo[pe].y_start_after_transpose;
01422   }
01423 
01424   {  // decide how many pes this node exchanges charges with
01425 
01426   PatchMap *patchMap = PatchMap::Object();
01427   Lattice lattice = simParams->lattice;
01428   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01429   BigReal cutoff = simParams->cutoff;
01430   BigReal patchdim = simParams->patchDimension;
01431   int numPatches = patchMap->numPatches();
01432   int numNodes = CkNumPes();
01433   int *source_flags = new int[numNodes];
01434   int node;
01435   for ( node=0; node<numNodes; ++node ) {
01436     source_flags[node] = 0;
01437     recipPeDest[node] = 0;
01438   }
01439 
01440   // // make sure that we don't get ahead of ourselves on this node
01441   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01442   //   source_flags[CkMyPe()] = 1;
01443   //   recipPeDest[myRecipPe] = 1;
01444   // }
01445 
01446   for ( int pid=0; pid < numPatches; ++pid ) {
01447     int pnode = patchMap->node(pid);
01448 #ifdef NAMD_CUDA
01449     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01450 #endif
01451     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01452     BigReal minx = patchMap->min_a(pid);
01453     BigReal maxx = patchMap->max_a(pid);
01454     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01455     // min1 (max1) is smallest (largest) grid line for this patch
01456     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01457     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01458     for ( int i=min1; i<=max1; ++i ) {
01459       int ix = i;
01460       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01461       while ( ix < 0 ) ix += myGrid.K1;
01462       // set source_flags[pnode] if this patch sends to our node
01463       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01464            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01465         source_flags[pnode] = 1;
01466       }
01467       // set dest_flags[] for node that our patch sends to
01468 #ifdef NAMD_CUDA
01469       if ( offload ) {
01470         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01471           recipPeDest[ix / myGrid.block1] = 1;
01472         }
01473       } else
01474 #endif
01475       if ( pnode == CkMyPe() ) {
01476         recipPeDest[ix / myGrid.block1] = 1;
01477       }
01478     }
01479   }
01480 
01481   int numSourcesSamePhysicalNode = 0;
01482   numSources = 0;
01483   numDestRecipPes = 0;
01484   for ( node=0; node<numNodes; ++node ) {
01485     if ( source_flags[node] ) ++numSources;
01486     if ( recipPeDest[node] ) ++numDestRecipPes;
01487     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01488   }
01489 
01490 #if 0
01491   if ( numSources ) {
01492     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01493             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01494     iout << iINFO << "PME " << CkMyPe() << " sources:";
01495     for ( node=0; node<numNodes; ++node ) {
01496       if ( source_flags[node] ) iout << " " << node;
01497     }
01498     iout << "\n" << endi;
01499   }
01500 #endif
01501 
01502   delete [] source_flags;
01503 
01504   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01505   //           CkMyPe(), numSources, numDestRecipPes);
01506 
01507   }  // decide how many pes this node exchanges charges with (end)
01508 
01509   ungrid_count = numDestRecipPes;
01510 
01511   sendTransBarrier_received = 0;
01512 
01513   if ( myGridPe < 0 && myTransPe < 0 ) return;
01514   // the following only for nodes doing reciprocal sum
01515 
01516   if ( myTransPe >= 0 ) {
01517     recipEvirPe = findRecipEvirPe();
01518     pmeProxy[recipEvirPe].addRecipEvirClient();
01519   }
01520 
01521   if ( myTransPe >= 0 ) {
01522       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01523       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01524       #ifdef OPENATOM_VERSION
01525       if ( simParams->openatomOn ) { 
01526         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01527         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01528       } else {
01529         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01530       }
01531       #else  // OPENATOM_VERSION
01532       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01533       #endif // OPENATOM_VERSION
01534   }
01535 
01536   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01537   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01538   if ( local_size < local_size_2 ) local_size = local_size_2;
01539   qgrid = new float[local_size*numGrids];
01540   if ( numGridPes > 1 || numTransPes > 1 ) {
01541     kgrid = new float[local_size*numGrids];
01542   } else {
01543     kgrid = qgrid;
01544   }
01545   qgrid_size = local_size;
01546 
01547   if ( myGridPe >= 0 ) {
01548   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01549   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01550   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01551   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01552   }
01553 
01554   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01555 #ifdef NAMD_FFTW
01556   CmiLock(fftw_plan_lock);
01557 #ifdef NAMD_FFTW_3
01558   work = new fftwf_complex[n[0]];
01559   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01560   if ( myGridPe >= 0 ) {
01561     forward_plan_yz=new fftwf_plan[numGrids];
01562     backward_plan_yz=new fftwf_plan[numGrids];
01563   }
01564   if ( myTransPe >= 0 ) {
01565     forward_plan_x=new fftwf_plan[numGrids];
01566     backward_plan_x=new fftwf_plan[numGrids];
01567   }
01568   /* need one plan per grid */
01569   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01570   if ( myGridPe >= 0 ) {
01571     for( int g=0; g<numGrids; g++)
01572       {
01573         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01574                                                      localInfo[myGridPe].nx,
01575                                                      qgrid + qgrid_size * g,
01576                                                      NULL,
01577                                                      1,
01578                                                      myGrid.dim2 * myGrid.dim3,
01579                                                      (fftwf_complex *) 
01580                                                      (qgrid + qgrid_size * g),
01581                                                      NULL,
01582                                                      1,
01583                                                      myGrid.dim2 * (myGrid.dim3/2),
01584                                                      fftwFlags);
01585       }
01586   }
01587   int zdim = myGrid.dim3;
01588   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01589   if ( ! CkMyPe() ) iout << " 2..." << endi;
01590   if ( myTransPe >= 0 ) {
01591     for( int g=0; g<numGrids; g++)
01592       {
01593 
01594         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01595                                                 (fftwf_complex *)
01596                                                 (kgrid+qgrid_size*g),
01597                                                 NULL,
01598                                                 xStride,
01599                                                 1,
01600                                                 (fftwf_complex *)
01601                                                 (kgrid+qgrid_size*g),
01602                                                 NULL,
01603                                                 xStride,
01604                                                 1,
01605                                                 FFTW_FORWARD,fftwFlags);
01606         
01607       }
01608   }
01609   if ( ! CkMyPe() ) iout << " 3..." << endi;
01610   if ( myTransPe >= 0 ) {
01611     for( int g=0; g<numGrids; g++)
01612       {
01613         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01614                                                  (fftwf_complex *)
01615                                                  (kgrid+qgrid_size*g),
01616                                                  NULL,
01617                                                  xStride,
01618                                                  1,
01619                                                  (fftwf_complex *)
01620                                                  (kgrid+qgrid_size*g),
01621                                                  NULL,
01622                                                  xStride,
01623                                                  1,
01624                                                  FFTW_BACKWARD, fftwFlags);
01625 
01626       }
01627   }
01628   if ( ! CkMyPe() ) iout << " 4..." << endi;
01629   if ( myGridPe >= 0 ) {
01630     for( int g=0; g<numGrids; g++)
01631       {
01632         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01633                                                       localInfo[myGridPe].nx,
01634                                                       (fftwf_complex *)
01635                                                       (qgrid + qgrid_size * g),
01636                                                       NULL,
01637                                                       1,
01638                                                       myGrid.dim2*(myGrid.dim3/2),
01639                                                       qgrid + qgrid_size * g,
01640                                                       NULL,
01641                                                       1,
01642                                                       myGrid.dim2 * myGrid.dim3,
01643                                                       fftwFlags);
01644       }
01645   }
01646   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01647 
01648 #else
01649   work = new fftw_complex[n[0]];
01650 
01651   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01652   if ( myGridPe >= 0 ) {
01653   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01654         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01655         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01656   }
01657   if ( ! CkMyPe() ) iout << " 2..." << endi;
01658   if ( myTransPe >= 0 ) {
01659       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01660         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01661         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01662         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01663   }
01664   if ( ! CkMyPe() ) iout << " 3..." << endi;
01665   if ( myTransPe >= 0 ) {
01666   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01669         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01670   }
01671   if ( ! CkMyPe() ) iout << " 4..." << endi;
01672   if ( myGridPe >= 0 ) {
01673   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01674         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01675         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01676   }
01677   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01678 #endif
01679   CmiUnlock(fftw_plan_lock);
01680 #else
01681   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01682 #endif
01683 
01684   if ( myGridPe >= 0 && numSources == 0 )
01685                 NAMD_bug("PME grid elements exist without sources.");
01686   grid_count = numSources;
01687   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01688   trans_count = numGridPes;
01689 }
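
The calls above batch many 1-D transforms into a single strided FFTW plan. As a minimal standalone illustration of the same fftwf_plan_many_dft call pattern (sizes here are invented, and this is not code from ComputePme.C), an in-place batch of length-N lines stored with a stride looks like this:

  #include <fftw3.h>

  int main() {
    const int N = 64;        // transform length (plays the role of n[0] above)
    const int howmany = 16;  // number of interleaved lines (the xStride role)
    fftwf_complex *data =
        (fftwf_complex *) fftwf_malloc(sizeof(fftwf_complex) * N * howmany);
    // Line j occupies data[j], data[j+howmany], data[j+2*howmany], ...,
    // so the element stride equals howmany and consecutive lines start
    // one element apart (dist == 1), exactly as in the plans above.
    fftwf_plan p = fftwf_plan_many_dft(1, &N, howmany,
                                       data, NULL, howmany, 1,
                                       data, NULL, howmany, 1,
                                       FFTW_FORWARD, FFTW_ESTIMATE);
    fftwf_execute(p);        // transforms all lines in place
    fftwf_destroy_plan(p);
    fftwf_free(data);
    return 0;
  }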

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2720 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02720                                         {
02721 
02722   noWorkCount = 0;
02723   doWorkCount = 0;
02724   ungridForcesCount = 0;
02725 
02726   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02727 
02728   SimParameters *simParams = Node::Object()->simParameters;
02729 
02730   strayChargeErrors = 0;
02731 
02732 #ifdef NAMD_CUDA
02733  PatchMap *patchMap = PatchMap::Object();
02734  int pe = master_pe = CkNodeFirst(CkMyNode());
02735  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02736     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02737     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02738     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02739     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02740     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02741         && pe != deviceCUDA->getMasterPe() ) {
02742       master_pe = pe;
02743     }
02744  }
02745  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02746    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02747  }
02748 
02749  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02750  bool cudaFirst = 1;
02751  if ( offload ) {
02752   CmiLock(cuda_lock);
02753   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02754  }
02755 
02756  if ( cudaFirst ) {
02757   nodePmeMgr->master_pe = master_pe;
02758   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02759  }
02760 #endif
02761 
02762   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02763   fsize = myGrid.K1 * myGrid.dim2;
02764   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02765 #ifdef NAMD_CUDA
02766  if ( ! offload )
02767 #endif
02768  {
02769   q_arr = new float*[fsize*numGrids];
02770   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02771   q_list = new float*[fsize*numGrids];
02772   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02773   q_count = 0;
02774  }
02775 
02776 #ifdef NAMD_CUDA
02777  if ( cudaFirst || ! offload ) {
02778 #endif
02779   f_arr = new char[fsize*numGrids];
02780   // memset to non-zero value has race condition on BlueGene/Q
02781   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02782   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02783 
02784   for ( int g=0; g<numGrids; ++g ) {
02785     char *f = f_arr + g*fsize;
02786     if ( usePencils ) {
02787       int K1 = myGrid.K1;
02788       int K2 = myGrid.K2;
02789       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02790       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02791       int dim2 = myGrid.dim2;
02792       for (int ap=0; ap<numPencilsActive; ++ap) {
02793         int ib = activePencils[ap].i;
02794         int jb = activePencils[ap].j;
02795         int ibegin = ib*block1;
02796         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02797         int jbegin = jb*block2;
02798         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02799         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02800         for ( int i=ibegin; i<iend; ++i ) {
02801           for ( int j=jbegin; j<jend; ++j ) {
02802             f[i*dim2+j] = 0;
02803           }
02804         }
02805       }
02806     } else {
02807       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02808       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02809       for (int pe=0; pe<numGridPes; pe++) {
02810         if ( ! recipPeDest[pe] ) continue;
02811         int start = pe * bsize;
02812         int len = bsize;
02813         if ( start >= qsize ) { start = 0; len = 0; }
02814         if ( start + len > qsize ) { len = qsize - start; }
02815         int zdim = myGrid.dim3;
02816         int fstart = start / zdim;
02817         int flen = len / zdim;
02818         memset(f + fstart, 0, flen*sizeof(char));
02819         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02820       }
02821     }
02822   }
02823 #ifdef NAMD_CUDA
02824  }
02825  if ( offload ) {
02826  cudaSetDevice(deviceCUDA->getDeviceID());
02827  if ( cudaFirst ) {
02828 
02829   int f_alloc_count = 0;
02830   for ( int n=fsize, i=0; i<n; ++i ) {
02831     if ( f_arr[i] == 0 ) {
02832       ++f_alloc_count;
02833     }
02834   }
02835   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02836 
02837   q_arr = new float*[fsize*numGrids];
02838   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02839 
02840   float **q_arr_dev_host = new float*[fsize];
02841   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02842 
02843   float **v_arr_dev_host = new float*[fsize];
02844   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02845 
02846   int q_stride = myGrid.K3+myGrid.order-1;
02847   q_data_size = f_alloc_count * q_stride * sizeof(float);
02848   ffz_size = (fsize + q_stride) * sizeof(int);
02849 
02850   // tack ffz onto end of q_data to allow merged transfer
02851   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02852   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02853   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02854   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02855   cudaMalloc((void**) &v_data_dev, q_data_size);
02856   cuda_errcheck("malloc grid data for pme");
02857   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02858   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02859   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02860   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02861   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02862 
02863   f_alloc_count = 0;
02864   for ( int n=fsize, i=0; i<n; ++i ) {
02865     if ( f_arr[i] == 0 ) {
02866       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02867       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02868       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02869       ++f_alloc_count;
02870     } else {
02871       q_arr[i] = 0;
02872       q_arr_dev_host[i] = 0;
02873       v_arr_dev_host[i] = 0;
02874     }
02875   }
02876 
02877   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02878   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02879   delete [] q_arr_dev_host;
02880   delete [] v_arr_dev_host;
02881   delete [] f_arr;
02882   f_arr = new char[fsize + q_stride];
02883   fz_arr = f_arr + fsize;
02884   memset(f_arr, 0, fsize + q_stride);
02885   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02886 
02887   cuda_errcheck("initialize grid data for pme");
02888 
02889   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02890   cuda_errcheck("initialize bspline coefficients for pme");
02891 
02892 #define XCOPY(X) masterPmeMgr->X = X;
02893   XCOPY(bspline_coeffs_dev)
02894   XCOPY(bspline_dcoeffs_dev)
02895   XCOPY(q_arr)
02896   XCOPY(q_arr_dev)
02897   XCOPY(v_arr_dev)
02898   XCOPY(q_data_size)
02899   XCOPY(q_data_host)
02900   XCOPY(q_data_dev)
02901   XCOPY(v_data_dev)
02902   XCOPY(ffz_size)
02903   XCOPY(ffz_host)
02904   XCOPY(ffz_dev)
02905   XCOPY(f_arr)
02906   XCOPY(fz_arr)
02907 #undef XCOPY
02908   //CkPrintf("pe %d init first\n", CkMyPe());
02909  } else { // cudaFirst
02910   //CkPrintf("pe %d init later\n", CkMyPe());
02911 #define XCOPY(X) X = masterPmeMgr->X;
02912   XCOPY(bspline_coeffs_dev)
02913   XCOPY(bspline_dcoeffs_dev)
02914   XCOPY(q_arr)
02915   XCOPY(q_arr_dev)
02916   XCOPY(v_arr_dev)
02917   XCOPY(q_data_size)
02918   XCOPY(q_data_host)
02919   XCOPY(q_data_dev)
02920   XCOPY(v_data_dev)
02921   XCOPY(ffz_size)
02922   XCOPY(ffz_host)
02923   XCOPY(ffz_dev)
02924   XCOPY(f_arr)
02925   XCOPY(fz_arr)
02926 #undef XCOPY
02927  } // cudaFirst
02928   CmiUnlock(cuda_lock);
02929  } else // offload
02930 #endif // NAMD_CUDA
02931  {
02932   fz_arr = new char[myGrid.K3+myGrid.order-1];
02933  }
02934 
02935 #if 0 && USE_PERSISTENT
02936   recvGrid_handle = NULL;
02937 #endif
02938 }
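
A reading of the bookkeeping above, written out as a small self-contained sketch (the class and member names are invented for illustration, not taken from ComputePme.C): each (i,j) column of the charge grid is addressed by the flat index i*dim2 + j into f_arr/q_arr, and an allocated column carries K3 + order - 1 floats so that charges spread past the end in z can later be folded back onto the start.

  #include <vector>

  struct ColumnGrid {
    int K1, K2, K3, order, dim2;       // dim2 == K2, per the check above
    std::vector<float*> q_arr;         // one pointer per (i,j) column, or null
    std::vector<char>   f_arr;         // per-column flag
    ColumnGrid(int k1, int k2, int k3, int ord)
      : K1(k1), K2(k2), K3(k3), order(ord), dim2(k2),
        q_arr(k1 * k2, nullptr), f_arr(k1 * k2, 2) {}
    int flat(int i, int j) const { return i * dim2 + j; }
    int columnLength() const { return K3 + order - 1; }  // z plus wrap region
  };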

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1693 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01693                                                    {
01694   delete msg;
01695   if ( ! usePencils ) return;
01696 
01697   SimParameters *simParams = Node::Object()->simParameters;
01698 
01699   PatchMap *patchMap = PatchMap::Object();
01700   Lattice lattice = simParams->lattice;
01701   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01702   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01703   BigReal cutoff = simParams->cutoff;
01704   BigReal patchdim = simParams->patchDimension;
01705   int numPatches = patchMap->numPatches();
01706 
01707   pencilActive = new char[xBlocks*yBlocks];
01708   for ( int i=0; i<xBlocks; ++i ) {
01709     for ( int j=0; j<yBlocks; ++j ) {
01710       pencilActive[i*yBlocks+j] = 0;
01711     }
01712   }
01713 
01714   for ( int pid=0; pid < numPatches; ++pid ) {
01715     int pnode = patchMap->node(pid);
01716 #ifdef NAMD_CUDA
01717     if ( offload ) {
01718       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01719     } else
01720 #endif
01721     if ( pnode != CkMyPe() ) continue;
01722 
01723     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01724     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01725 
01726     BigReal minx = patchMap->min_a(pid);
01727     BigReal maxx = patchMap->max_a(pid);
01728     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01729     // min1 (max1) is smallest (largest) grid line for this patch
01730     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01731     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01732 
01733     BigReal miny = patchMap->min_b(pid);
01734     BigReal maxy = patchMap->max_b(pid);
01735     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01736     // min2 (max2) is smallest (largest) grid line for this patch
01737     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01738     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01739 
01740     for ( int i=min1; i<=max1; ++i ) {
01741       int ix = i;
01742       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01743       while ( ix < 0 ) ix += myGrid.K1;
01744       for ( int j=min2; j<=max2; ++j ) {
01745         int jy = j;
01746         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01747         while ( jy < 0 ) jy += myGrid.K2;
01748         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01749       }
01750     }
01751   }
01752 
01753   numPencilsActive = 0;
01754   for ( int i=0; i<xBlocks; ++i ) {
01755     for ( int j=0; j<yBlocks; ++j ) {
01756       if ( pencilActive[i*yBlocks+j] ) {
01757         ++numPencilsActive;
01758 #ifdef NAMD_CUDA
01759         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01760 #endif
01761         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01762       }
01763     }
01764   }
01765   activePencils = new ijpair[numPencilsActive];
01766   numPencilsActive = 0;
01767   for ( int i=0; i<xBlocks; ++i ) {
01768     for ( int j=0; j<yBlocks; ++j ) {
01769       if ( pencilActive[i*yBlocks+j] ) {
01770         activePencils[numPencilsActive++] = ijpair(i,j);
01771       }
01772     }
01773   }
01774   if ( simParams->PMESendOrder ) {
01775     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01776   } else {
01777     Random rand(CkMyPe());
01778     rand.reorder(activePencils,numPencilsActive);
01779   }
01780   //if ( numPencilsActive ) {
01781   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01782   //}
01783 
01784   ungrid_count = numPencilsActive;
01785 }
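
The inner loops above fold each patch's (possibly negative or out-of-range) grid-line interval back into the periodic box and mark every pencil it touches. A compact sketch of the same mapping, using modular arithmetic in place of the while loops (function and parameter names are illustrative only):

  #include <vector>

  void markPencils(int K1, int K2, int block1, int block2, int yBlocks,
                   int min1, int max1, int min2, int max2,
                   std::vector<char> &pencilActive) {
    for (int i = min1; i <= max1; ++i) {
      int ix = ((i % K1) + K1) % K1;        // wrap into [0, K1)
      for (int j = min2; j <= max2; ++j) {
        int jy = ((j % K2) + K2) % K2;      // wrap into [0, K2)
        pencilActive[(ix / block1) * yBlocks + (jy / block2)] = 1;
      }
    }
  }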

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3525 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03525                                         {
03526 #ifdef NAMD_CUDA
03527   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03528   CUDA_POLL(cuda_check_pme_charges,this);
03529 #else
03530   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03531 #endif
03532 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2656 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02656                                     {
02657 #ifdef NAMD_CUDA
02658   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02659   CUDA_POLL(cuda_check_pme_forces,this);
02660 #else
02661   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02662 #endif
02663 }
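
Both poll routines re-arm a Charm++ callback (via the CUDA_POLL macro, whose definition is not shown here) until the corresponding asynchronous CUDA work has finished. A hedged sketch of the underlying completion test, using only the standard CUDA runtime API:

  #include <cuda_runtime.h>

  // Returns true once the recorded event has completed; returning false
  // means the caller should schedule another poll.  Error handling is
  // reduced to a bare minimum for this sketch.
  bool workDone(cudaEvent_t ev) {
    cudaError_t err = cudaEventQuery(ev);
    if (err == cudaSuccess)       return true;
    if (err == cudaErrorNotReady) return false;
    return true;   // treat any other status as "stop polling" here
  }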

void ComputePmeMgr::procTrans ( PmeTransMsg *  ) 

Definition at line 2048 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02048                                               {
02049   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02050   if ( trans_count == numGridPes ) {
02051     lattice = msg->lattice;
02052     grid_sequence = msg->sequence;
02053   }
02054 
02055  if ( msg->nx ) {
02056   int zdim = myGrid.dim3;
02057   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02058   int first_pe = nodeInfo.pe_start;
02059   int last_pe = first_pe+nodeInfo.npe-1;
02060   int y_skip = localInfo[myTransPe].y_start_after_transpose
02061              - localInfo[first_pe].y_start_after_transpose;
02062   int ny_msg = localInfo[last_pe].y_start_after_transpose
02063              + localInfo[last_pe].ny_after_transpose
02064              - localInfo[first_pe].y_start_after_transpose;
02065   int ny = localInfo[myTransPe].ny_after_transpose;
02066   int x_start = msg->x_start;
02067   int nx = msg->nx;
02068   for ( int g=0; g<numGrids; ++g ) {
02069     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02070         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02071         nx*ny*zdim*sizeof(float));
02072   }
02073  }
02074 
02075   --trans_count;
02076 
02077   if ( trans_count == 0 ) {
02078     pmeProxyDir[CkMyPe()].gridCalc2();
02079   }
02080 }
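
procTrans() follows a simple countdown pattern: trans_count is preloaded with numGridPes, every incoming contribution copies its slab and decrements the counter, and the final contribution launches the next phase. A minimal standalone sketch of that pattern (not NAMD code):

  #include <functional>

  struct TransGather {
    int remaining;                    // preloaded with the number of sources
    std::function<void()> nextPhase;  // e.g. a proxy call to gridCalc2()
    void contribute() {
      if ( --remaining == 0 ) nextPhase();
    }
  };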

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  ) 

Definition at line 2302 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02302                                                   {
02303   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02304 
02305 #if CMK_BLUEGENEL
02306   CmiNetworkProgressAfter (0);
02307 #endif
02308 
02309   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02310   int first_pe = nodeInfo.pe_start;
02311   int g;
02312 
02313  if ( msg->ny ) {
02314   int zdim = myGrid.dim3;
02315   int last_pe = first_pe+nodeInfo.npe-1;
02316   int x_skip = localInfo[myGridPe].x_start
02317              - localInfo[first_pe].x_start;
02318   int nx_msg = localInfo[last_pe].x_start
02319              + localInfo[last_pe].nx
02320              - localInfo[first_pe].x_start;
02321   int nx = localInfo[myGridPe].nx;
02322   int y_start = msg->y_start;
02323   int ny = msg->ny;
02324   int slicelen = myGrid.K2 * zdim;
02325   int cpylen = ny * zdim;
02326   for ( g=0; g<numGrids; ++g ) {
02327     float *q = qgrid + qgrid_size * g + y_start * zdim;
02328     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02329     for ( int x = 0; x < nx; ++x ) {
02330       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02331       q += slicelen;
02332       qmsg += cpylen;
02333     }
02334   }
02335  }
02336 
02337   --untrans_count;
02338 
02339   if ( untrans_count == 0 ) {
02340     pmeProxyDir[CkMyPe()].gridCalc3();
02341   }
02342 }
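
The copy loop above walks the packed message row by row: each of the nx x-rows contributes ny contiguous z-columns, which land at a stride of K2*dim3 inside the local slab. A small sketch of that strided unpack (layout assumptions as described, names invented):

  #include <cstring>

  void unpackRows(float *slab, const float *packed,
                  int nx, int ny, int zdim, int K2) {
    int slicelen = K2 * zdim;   // one full x-row of the local slab
    int cpylen   = ny * zdim;   // the piece this message carries per row
    for (int x = 0; x < nx; ++x) {
      std::memcpy(slab + x * slicelen, packed + x * cpylen,
                  cpylen * sizeof(float));
    }
  }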

void ComputePmeMgr::recvAck ( PmeAckMsg *  ) 

Definition at line 2444 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02444                                           {
02445   if ( msg ) delete msg;
02446 #ifdef NAMD_CUDA
02447   if ( offload ) {
02448     CmiLock(cuda_lock);
02449     if ( ungrid_count == 0 ) {
02450       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02451     }
02452     int uc = --ungrid_count;
02453     CmiUnlock(cuda_lock);
02454 
02455     if ( uc == 0 ) {
02456       pmeProxyDir[master_pe].ungridCalc();
02457     }
02458     return;
02459   }
02460 #endif
02461   --ungrid_count;
02462 
02463   if ( ungrid_count == 0 ) {
02464     pmeProxyDir[CkMyPe()].ungridCalc();
02465   }
02466 }

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 800 of file ComputePme.C.

00801                                                                        {
00802   xPencil = x;  yPencil = y;  zPencil = z;
00803   
00804     if(CmiMyRank()==0)
00805     {
00806       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00807       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00808       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00809     }
00810 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3534 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03534                                         {
03535   chargeGridReady(*saved_lattice,saved_sequence);
03536 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  ) 

Definition at line 1827 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01827                                             {
01828   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01829   if ( grid_count == 0 ) {
01830     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01831   }
01832   if ( grid_count == numSources ) {
01833     lattice = msg->lattice;
01834     grid_sequence = msg->sequence;
01835   }
01836 
01837   int zdim = myGrid.dim3;
01838   int zlistlen = msg->zlistlen;
01839   int *zlist = msg->zlist;
01840   float *qmsg = msg->qgrid;
01841   for ( int g=0; g<numGrids; ++g ) {
01842     char *f = msg->fgrid + fgrid_len * g;
01843     float *q = qgrid + qgrid_size * g;
01844     for ( int i=0; i<fgrid_len; ++i ) {
01845       if ( f[i] ) {
01846         for ( int k=0; k<zlistlen; ++k ) {
01847           q[zlist[k]] += *(qmsg++);
01848         }
01849       }
01850       q += zdim;
01851     }
01852   }
01853 
01854   gridmsg_reuse[numSources-grid_count] = msg;
01855   --grid_count;
01856 
01857   if ( grid_count == 0 ) {
01858     pmeProxyDir[CkMyPe()].gridCalc1();
01859     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01860   }
01861 }
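
recvGrid() accumulates a sparse message: only the columns flagged in fgrid carry data, and within each such column only the z indices listed in zlist were sent, so a single moving pointer walks the payload. The same accumulation as a standalone sketch (names are illustrative):

  void accumulateColumns(float *q, const char *f, const int *zlist,
                         int ncols, int zlistlen, int zdim,
                         const float *qmsg) {
    for (int i = 0; i < ncols; ++i) {
      if ( f[i] ) {
        for (int k = 0; k < zlistlen; ++k) q[zlist[k]] += *(qmsg++);
      }
      q += zdim;   // advance to the next column whether or not it had data
    }
  }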

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  ) 

Definition at line 3023 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03023                                                  {
03024   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03025   for ( int g=0; g<numGrids; ++g ) {
03026     evir[g] += msg->evir[g];
03027   }
03028   delete msg;
03029   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03030   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03031 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  ) 

Definition at line 2030 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02030                                                           {
02031   procTrans(msg->msg);
02032   CmiLock(msg->lock);
02033   int count = --(*msg->count);
02034   CmiUnlock(msg->lock);
02035   if ( count == 0 ) {
02036     CmiDestroyLock(msg->lock);
02037     delete msg->count;
02038     delete msg->msg;
02039   }
02040   delete msg;
02041 }
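
recvSharedTrans() and recvSharedUntrans() share one payload among several ranks of a node; a lock-protected counter decides which rank is last and therefore responsible for freeing it. A hedged sketch of that pattern, with std::mutex standing in for the CmiNodeLock used in the listing:

  #include <mutex>

  template <class Payload>
  void releaseShared(Payload *payload, int *count, std::mutex *lock) {
    lock->lock();
    int remaining = --(*count);
    lock->unlock();
    if ( remaining == 0 ) {   // last user cleans up the shared state
      delete payload;
      delete count;
      delete lock;            // the listing destroys a CmiNodeLock instead
    }
  }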

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  ) 

Definition at line 2284 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02284                                                               {
02285   procUntrans(msg->msg);
02286   CmiLock(msg->lock);
02287   int count = --(*msg->count);
02288   CmiUnlock(msg->lock);
02289   if ( count == 0 ) {
02290     CmiDestroyLock(msg->lock);
02291     delete msg->count;
02292     delete msg->msg;
02293   }
02294   delete msg;
02295 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  ) 

Definition at line 2043 of file ComputePme.C.

References procTrans().

02043                                               {
02044   procTrans(msg);
02045   delete msg;
02046 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  ) 

Definition at line 2429 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02429                                               {
02430   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02431 #ifdef NAMD_CUDA
02432   if ( ! offload )  // would need lock
02433 #endif
02434   if ( ungrid_count == 0 ) {
02435     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02436   }
02437 
02438   if ( usePencils ) copyPencils(msg);
02439   else copyResults(msg);
02440   delete msg;
02441   recvAck(0);
02442 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  ) 

Definition at line 2297 of file ComputePme.C.

References procUntrans().

02297                                                   {
02298   procUntrans(msg);
02299   delete msg;
02300 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3511 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03511                                         {
03512   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03513     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03514     int cs = mgr->pmeComputes.size();
03515     if ( cs ) {
03516       mgr->ungridForcesCount = cs;
03517       mgr->recipEvirCount = mgr->recipEvirClients;
03518       masterPmeMgr->chargeGridSubmittedCount++;
03519     }
03520   }
03521   pmeProxy[master_pe].recvChargeGridReady();
03522 }

void ComputePmeMgr::sendData ( Lattice & ,
int  sequence 
)

Definition at line 3948 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03948                                                            {
03949 
03950   sendDataHelper_lattice = &lattice;
03951   sendDataHelper_sequence = sequence;
03952   sendDataHelper_sourcepe = CkMyPe();
03953   sendDataHelper_errors = strayChargeErrors;
03954   strayChargeErrors = 0;
03955 
03956 #ifdef NAMD_CUDA
03957   if ( offload ) {
03958     for ( int i=0; i < numGridPes; ++i ) {
03959       int pe = gridPeOrder[i];  // different order
03960       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03961 #if CMK_MULTICORE
03962       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03963       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03964 #else
03965       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03966 #endif
03967     }
03968   } else
03969 #endif
03970   {
03971     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03972   }
03973  
03974 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3935 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03935                                            {
03936   nodePmeMgr->sendDataHelper(iter);
03937 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3826 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03826                                                                                                               {
03827 
03828   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03829 
03830   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03831 
03832   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03833   for (int j=first; j<=last; j++) {
03834     int pe = gridPeOrder[j];  // different order
03835     if ( ! recipPeDest[pe] && ! errors ) continue;
03836     int start = pe * bsize;
03837     int len = bsize;
03838     if ( start >= qsize ) { start = 0; len = 0; }
03839     if ( start + len > qsize ) { len = qsize - start; }
03840     int zdim = myGrid.dim3;
03841     int fstart = start / zdim;
03842     int flen = len / zdim;
03843     int fcount = 0;
03844     int i;
03845 
03846     int g;
03847     for ( g=0; g<numGrids; ++g ) {
03848       char *f = f_arr + fstart + g*fsize;
03849 #ifdef NAMD_CUDA
03850      if ( offload ) {
03851       int errcount = 0;
03852       for ( i=0; i<flen; ++i ) {
03853         f[i] = ffz_host[fstart+i];
03854         fcount += f[i];
03855         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03856       }
03857       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03858      } else
03859 #endif
03860       for ( i=0; i<flen; ++i ) {
03861         fcount += f[i];
03862       }
03863       if ( ! recipPeDest[pe] ) {
03864         int errfound = 0;
03865         for ( i=0; i<flen; ++i ) {
03866           if ( f[i] == 3 ) {
03867             errfound = 1;
03868             break;
03869           }
03870         }
03871         if ( errfound ) {
03872           iout << iERROR << "Stray PME grid charges detected: "
03873                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03874           int iz = -1;
03875           for ( i=0; i<flen; ++i ) {
03876             if ( f[i] == 3 ) {
03877               f[i] = 2;
03878               int jz = (i+fstart)/myGrid.K2;
03879               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03880             }
03881           }
03882           iout << "\n" << endi;
03883         }
03884       }
03885     }
03886 
03887 #ifdef NETWORK_PROGRESS
03888     CmiNetworkProgress();
03889 #endif
03890 
03891     if ( ! recipPeDest[pe] ) continue;
03892 
03893     int zlistlen = 0;
03894     for ( i=0; i<myGrid.K3; ++i ) {
03895       if ( fz_arr[i] ) ++zlistlen;
03896     }
03897 
03898     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03899                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03900 
03901     msg->sourceNode = sourcepe;
03902     msg->lattice = lattice;
03903     msg->start = fstart;
03904     msg->len = flen;
03905     msg->zlistlen = zlistlen;
03906     int *zlist = msg->zlist;
03907     zlistlen = 0;
03908     for ( i=0; i<myGrid.K3; ++i ) {
03909       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03910     }
03911     float *qmsg = msg->qgrid;
03912     for ( g=0; g<numGrids; ++g ) {
03913       char *f = f_arr + fstart + g*fsize;
03914       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03915       float **q = q_arr + fstart + g*fsize;
03916       for ( i=0; i<flen; ++i ) {
03917         if ( f[i] ) {
03918           for (int h=0; h<myGrid.order-1; ++h) {
03919             q[i][h] += q[i][myGrid.K3+h];
03920           }
03921           for ( int k=0; k<zlistlen; ++k ) {
03922             *(qmsg++) = q[i][zlist[k]];
03923           }
03924         }
03925       }
03926     }
03927 
03928     msg->sequence = compute_sequence;
03929     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03930     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03931   }
03932 
03933 }
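
For every flagged column, the send loop above does two things before copying data into the message: it folds the order-1 wrap region at the end of the column back onto its start (periodic images of charges spread past K3), and it then emits only the z entries recorded in zlist. As an isolated sketch (column layout assumed as above, names invented):

  void packColumn(float *q, int K3, int order,
                  const int *zlist, int zlistlen, float *&qmsg) {
    for (int h = 0; h < order - 1; ++h) q[h] += q[K3 + h];  // periodic fold
    for (int k = 0; k < zlistlen; ++k)  *(qmsg++) = q[zlist[k]];
  }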

void ComputePmeMgr::sendPencils ( Lattice & ,
int  sequence 
)

Definition at line 3721 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03721                                                               {
03722 
03723   sendDataHelper_lattice = &lattice;
03724   sendDataHelper_sequence = sequence;
03725   sendDataHelper_sourcepe = CkMyPe();
03726 
03727 #ifdef NAMD_CUDA
03728   if ( offload ) {
03729     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03730 #if CMK_MULTICORE
03731       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03732       int ib = activePencils[ap].i;
03733       int jb = activePencils[ap].j;
03734       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03735       pmeProxy[destproc].sendPencilsHelper(ap);
03736 #else
03737       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03738 #endif
03739     }
03740   } else
03741 #endif
03742   {
03743     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03744   }
03745 
03746   if ( strayChargeErrors ) {
03747    strayChargeErrors = 0;
03748    iout << iERROR << "Stray PME grid charges detected: "
03749         << CkMyPe() << " sending to (x,y)";
03750    int K1 = myGrid.K1;
03751    int K2 = myGrid.K2;
03752    int dim2 = myGrid.dim2;
03753    int block1 = myGrid.block1;
03754    int block2 = myGrid.block2;
03755    for (int ib=0; ib<xBlocks; ++ib) {
03756     for (int jb=0; jb<yBlocks; ++jb) {
03757      int ibegin = ib*block1;
03758      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03759      int jbegin = jb*block2;
03760      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03761      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03762 
03763      for ( int g=0; g<numGrids; ++g ) {
03764        char *f = f_arr + g*fsize;
03765        if ( ! pencilActive[ib*yBlocks+jb] ) {
03766            for ( int i=ibegin; i<iend; ++i ) {
03767             for ( int j=jbegin; j<jend; ++j ) {
03768              if ( f[i*dim2+j] == 3 ) {
03769                f[i*dim2+j] = 2;
03770                iout << " (" << i << "," << j << ")";
03771              }
03772             }
03773            }
03774        }
03775      }
03776     }
03777    }
03778    iout << "\n" << endi;
03779   }
03780  
03781 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3708 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03708                                               {
03709   nodePmeMgr->sendPencilsHelper(iter);
03710 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe 
)

Definition at line 3566 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03566                                                                                                      {
03567 
03568   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03569 
03570 #if 0 && USE_PERSISTENT
03571     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03572 #endif
03573   int K1 = myGrid.K1;
03574   int K2 = myGrid.K2;
03575   int dim2 = myGrid.dim2;
03576   int dim3 = myGrid.dim3;
03577   int block1 = myGrid.block1;
03578   int block2 = myGrid.block2;
03579 
03580   // int savedMessages = 0;
03581   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03582 
03583   for (int ap=first; ap<=last; ++ap) {
03584     int ib = activePencils[ap].i;
03585     int jb = activePencils[ap].j;
03586     int ibegin = ib*block1;
03587     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03588     int jbegin = jb*block2;
03589     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03590     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03591 
03592     int fcount = 0;
03593     for ( int g=0; g<numGrids; ++g ) {
03594       char *f = f_arr + g*fsize;
03595 #ifdef NAMD_CUDA
03596      if ( offload ) {
03597       int errcount = 0;
03598       for ( int i=ibegin; i<iend; ++i ) {
03599        for ( int j=jbegin; j<jend; ++j ) {
03600         int k = i*dim2+j;
03601         f[k] = ffz_host[k];
03602         fcount += f[k];
03603         if ( ffz_host[k] & ~1 ) ++errcount;
03604        }
03605       }
03606       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03607      } else
03608 #endif
03609       for ( int i=ibegin; i<iend; ++i ) {
03610        for ( int j=jbegin; j<jend; ++j ) {
03611         fcount += f[i*dim2+j];
03612        }
03613       }
03614     }
03615 
03616 #ifdef NETWORK_PROGRESS
03617     CmiNetworkProgress();
03618 #endif
03619 
03620     if ( ! pencilActive[ib*yBlocks+jb] )
03621       NAMD_bug("PME activePencils list inconsistent");
03622 
03623     int zlistlen = 0;
03624     for ( int i=0; i<myGrid.K3; ++i ) {
03625       if ( fz_arr[i] ) ++zlistlen;
03626     }
03627 
03628     int hd = ( fcount? 1 : 0 );  // has data?
03629     // if ( ! hd ) ++savedMessages;
03630 
03631     
03632     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03633         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03634     msg->sourceNode = sourcepe;
03635     msg->hasData = hd;
03636     msg->lattice = lattice;
03637    if ( hd ) {
03638 #if 0
03639     msg->start = fstart;
03640     msg->len = flen;
03641 #else
03642     msg->start = -1;   // obsolete?
03643     msg->len = -1;   // obsolete?
03644 #endif
03645     msg->zlistlen = zlistlen;
03646     int *zlist = msg->zlist;
03647     zlistlen = 0;
03648     for ( int i=0; i<myGrid.K3; ++i ) {
03649       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03650     }
03651     char *fmsg = msg->fgrid;
03652     float *qmsg = msg->qgrid;
03653     for ( int g=0; g<numGrids; ++g ) {
03654       char *f = f_arr + g*fsize;
03655       float **q = q_arr + g*fsize;
03656       for ( int i=ibegin; i<iend; ++i ) {
03657        for ( int j=jbegin; j<jend; ++j ) {
03658         *(fmsg++) = f[i*dim2+j];
03659         if( f[i*dim2+j] ) {
03660           for (int h=0; h<myGrid.order-1; ++h) {
03661             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03662           }
03663           for ( int k=0; k<zlistlen; ++k ) {
03664             *(qmsg++) = q[i*dim2+j][zlist[k]];
03665           }
03666         }
03667        }
03668       }
03669     }
03670    }
03671 
03672     msg->sequence = compute_sequence;
03673     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03674     CmiEnableUrgentSend(1);
03675 #if USE_NODE_PAR_RECEIVE
03676     msg->destElem=CkArrayIndex3D(ib,jb,0);
03677     CProxy_PmePencilMap lzm = npMgr->zm;
03678     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03679     int destnode = CmiNodeOf(destproc);
03680     
03681 #if  0 
03682     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03683 #endif
03684     pmeNodeProxy[destnode].recvZGrid(msg);
03685 #if 0 
03686     CmiUsePersistentHandle(NULL, 0);
03687 #endif
03688 #else
03689 #if 0 
03690     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03691 #endif
03692     zPencil(ib,jb,0).recvGrid(msg);
03693 #if 0 
03694     CmiUsePersistentHandle(NULL, 0);
03695 #endif
03696 #endif
03697     CmiEnableUrgentSend(0);
03698   }
03699 
03700 
03701   // if ( savedMessages ) {
03702   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03703   // }
03704 
03705 }

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1939 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01939                                   {
01940 
01941   untrans_count = numTransPes;
01942 
01943 #if     CMK_SMP && USE_CKLOOP
01944   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01945   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01946     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01947   } else
01948 #endif
01949   {
01950     sendTransSubset(0, numTransNodes-1);
01951   }
01952 
01953 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1924 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01924                                          {
01925   sendTransBarrier_received += 1;
01926   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01927   if ( sendTransBarrier_received < numGridPes ) return;
01928   sendTransBarrier_received = 0;
01929   for ( int i=0; i<numGridPes; ++i ) {
01930     pmeProxyDir[gridPeMap[i]].sendTrans();
01931   }
01932 }

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1955 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01955                                                        {
01956   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01957 
01958   // send data for transpose
01959   int zdim = myGrid.dim3;
01960   int nx = localInfo[myGridPe].nx;
01961   int x_start = localInfo[myGridPe].x_start;
01962   int slicelen = myGrid.K2 * zdim;
01963 
01964   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01965 
01966 #if CMK_BLUEGENEL
01967   CmiNetworkProgressAfter (0);
01968 #endif
01969 
01970   for (int j=first; j<=last; j++) {
01971     int node = transNodeOrder[j];  // different order on each node
01972     int pe = transNodeInfo[node].pe_start;
01973     int npe = transNodeInfo[node].npe;
01974     int totlen = 0;
01975     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01976       LocalPmeInfo &li = localInfo[pe];
01977       int cpylen = li.ny_after_transpose * zdim;
01978       totlen += cpylen;
01979     }
01980     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01981                                 PRIORITY_SIZE) PmeTransMsg;
01982     newmsg->sourceNode = myGridPe;
01983     newmsg->lattice = lattice;
01984     newmsg->x_start = x_start;
01985     newmsg->nx = nx;
01986     for ( int g=0; g<numGrids; ++g ) {
01987       float *qmsg = newmsg->qgrid + nx * totlen * g;
01988       pe = transNodeInfo[node].pe_start;
01989       for (int i=0; i<npe; ++i, ++pe) {
01990         LocalPmeInfo &li = localInfo[pe];
01991         int cpylen = li.ny_after_transpose * zdim;
01992         if ( node == myTransNode ) {
01993           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01994           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01995         }
01996         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01997         for ( int x = 0; x < nx; ++x ) {
01998           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01999           q += slicelen;
02000           qmsg += cpylen;
02001         }
02002       }
02003     }
02004     newmsg->sequence = grid_sequence;
02005     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02006     if ( node == myTransNode ) newmsg->nx = 0;
02007     if ( npe > 1 ) {
02008       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02009       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02010     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02011   }
02012 }

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2369 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02369                                    {
02370 
02371 #if     CMK_SMP && USE_CKLOOP
02372   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02373   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02374     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02375   } else
02376 #endif
02377   {
02378     sendUngridSubset(0, numSources-1);
02379   }
02380 
02381   grid_count = numSources;
02382   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02383 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2385 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02385                                                         {
02386 
02387 #ifdef NAMD_CUDA
02388   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02389 #else
02390   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02391 #endif
02392 
02393   for ( int j=first; j<=last; ++j ) {
02394     // int msglen = qgrid_len;
02395     PmeGridMsg *newmsg = gridmsg_reuse[j];
02396     int pe = newmsg->sourceNode;
02397     int zdim = myGrid.dim3;
02398     int flen = newmsg->len;
02399     int fstart = newmsg->start;
02400     int zlistlen = newmsg->zlistlen;
02401     int *zlist = newmsg->zlist;
02402     float *qmsg = newmsg->qgrid;
02403     for ( int g=0; g<numGrids; ++g ) {
02404       char *f = newmsg->fgrid + fgrid_len * g;
02405       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02406       for ( int i=0; i<flen; ++i ) {
02407         if ( f[i] ) {
02408           for ( int k=0; k<zlistlen; ++k ) {
02409             *(qmsg++) = q[zlist[k]];
02410           }
02411         }
02412         q += zdim;
02413       }
02414     }
02415     newmsg->sourceNode = myGridPe;
02416 
02417     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02418     CmiEnableUrgentSend(1);
02419 #ifdef NAMD_CUDA
02420     if ( offload ) {
02421       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02422     } else
02423 #endif
02424     pmeProxyDir[pe].recvUngrid(newmsg);
02425     CmiEnableUrgentSend(0);
02426   }
02427 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2183 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02183                                     {
02184 
02185   trans_count = numGridPes;
02186 
02187   { // send energy and virial
02188     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02189     for ( int g=0; g<numGrids; ++g ) {
02190       newmsg->evir[g] = recip_evir2[g];
02191     }
02192     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02193     CmiEnableUrgentSend(1);
02194     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02195     CmiEnableUrgentSend(0);
02196   }
02197 
02198 #if     CMK_SMP && USE_CKLOOP
02199   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02200   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02201     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02202   } else
02203 #endif
02204   {
02205     sendUntransSubset(0, numGridNodes-1);
02206   }
02207 
02208 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2210 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02210                                                          {
02211 
02212   int zdim = myGrid.dim3;
02213   int y_start = localInfo[myTransPe].y_start_after_transpose;
02214   int ny = localInfo[myTransPe].ny_after_transpose;
02215   int slicelen = myGrid.K2 * zdim;
02216 
02217   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02218 
02219 #if CMK_BLUEGENEL
02220   CmiNetworkProgressAfter (0);
02221 #endif
02222 
02223   // send data for reverse transpose
02224   for (int j=first; j<=last; j++) {
02225     int node = gridNodeOrder[j];  // different order on each node
02226     int pe = gridNodeInfo[node].pe_start;
02227     int npe = gridNodeInfo[node].npe;
02228     int totlen = 0;
02229     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02230       LocalPmeInfo &li = localInfo[pe];
02231       int cpylen = li.nx * zdim;
02232       totlen += cpylen;
02233     }
02234     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02235     newmsg->sourceNode = myTransPe;
02236     newmsg->y_start = y_start;
02237     newmsg->ny = ny;
02238     for ( int g=0; g<numGrids; ++g ) {
02239       float *qmsg = newmsg->qgrid + ny * totlen * g;
02240       pe = gridNodeInfo[node].pe_start;
02241       for (int i=0; i<npe; ++i, ++pe) {
02242         LocalPmeInfo &li = localInfo[pe];
02243         if ( node == myGridNode ) {
02244           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02245           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02246           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02247           int cpylen = ny * zdim;
02248           for ( int x = 0; x < li.nx; ++x ) {
02249             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02250             q += cpylen;
02251             qmsg += slicelen;
02252           }
02253         } else {
02254           CmiMemcpy((void*)qmsg,
02255                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02256                 li.nx*ny*zdim*sizeof(float));
02257           qmsg += li.nx*ny*zdim;
02258         }
02259       }
02260     }
02261     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02262     if ( node == myGridNode ) newmsg->ny = 0;
02263     if ( npe > 1 ) {
02264       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02265       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02266     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02267   }
02268 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4213 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchOn, NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04213                                      {
04214 
04215     SimParameters *simParams = Node::Object()->simParameters;
04216 
04217     for ( int g=0; g<numGrids; ++g ) {
04218       float scale = 1.;
04219       if (alchOn) {
04220         BigReal elecLambdaUp, elecLambdaDown;
04221         if( simParams->alchFepWhamOn ) {
04222           if( simParams->alchFepElecOn ) {
04223             elecLambdaUp = simParams->alchElecLambda;
04224             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04225           }
04226           else {
04227             elecLambdaUp = 0.0;
04228             elecLambdaDown = 1.0;
04229           }
04230         }
04231         else {
04232           // alchLambda set on each step in ComputePme::ungridForces()
04233           if ( alchLambda < 0 || alchLambda > 1 ) {
04234             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04235           }
04236           elecLambdaUp = simParams->getElecLambda(alchLambda);
04237           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04238         }
04239         if ( g == 0 ) scale = elecLambdaUp;
04240         else if ( g == 1 ) scale = elecLambdaDown;
04241         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04242         if (alchDecouple) {
04243           if ( g == 2 ) scale = 1-elecLambdaUp;
04244           else if ( g == 3 ) scale = 1-elecLambdaDown;
04245           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04246         }
04247       } else if ( lesOn ) {
04248         scale = 1.0 / lesFactor;
04249       } else if ( pairOn ) {
04250         scale = ( g == 0 ? 1. : -1. );
04251       }
04252       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04253       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04254       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04255       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04256       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04257       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04258       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04259       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04260       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04261       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04262 
04263       float scale2 = 0.;
04264 
04265       // why is this declared/defined again here?
04266       SimParameters *simParams = Node::Object()->simParameters;
04267 
04268       if (alchFepOn) {
04269         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04270         if(simParams->alchFepWhamOn) {
04271           if(simParams->alchFepElecOn) {
04272             elecLambda2Up = simParams->alchElecLambda;
04273             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04274           }
04275           else {
04276             elecLambda2Up = 0.0;
04277             elecLambda2Down =  1.0;
04278           }
04279         }
04280         else {
04281           elecLambda2Up = simParams->getElecLambda(alchLambda2);
04282           elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
04283         }
04284         
04285         if ( g == 0 ) scale2 = elecLambda2Up;
04286         else if ( g == 1 ) scale2 = elecLambda2Down;
04287         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04288         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04289         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04290         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04291       }
04292       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04293         if( g==0 )      scale2 = scale + 1.0;
04294         else if( g==1 ) scale2 = scale - 1.0;
04295         else if( g==2 ) scale2 = scale - 1.0;
04296         else if( g==3 ) scale2 = scale + 1.0;
04297       }
04298       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04299       
04300       if (alchThermIntOn) {
04301         
04302         // no decoupling:
04303         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04304         // (interactions between all atoms [partition 0 OR partition 1], 
04305         // minus all [within partition 0])
04306         // U = elecLambdaUp * (U[0] - U[2])
04307         // dU/dl = U[0] - U[2];
04308         
04309         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04310         // (interactions between all atoms [partition 0 OR partition 2], 
04311         // minus all [within partition 0])
04312         // U = elecLambdaDown * (U[1] - U[2])
04313         // dU/dl = U[1] - U[2];
04314 
04315         // alchDecouple:
04316         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04317         // (interactions between all atoms [partition 0 OR partition 1]
04318         // minus all [within partition 1] minus all [within partition 0]
04319         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04320         // dU/dl = U[0] - U[2] - U[4];
04321 
04322         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04323         // (interactions between all atoms [partition 0 OR partition 2]
04324         // minus all [within partition 2] minus all [within partition 0]
04325         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04326         // dU/dl = U[1] - U[3] - U[4];
04327         
04328         
04329         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04330         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04331         if (!alchDecouple) {
04332           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04333           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04334         }
04335         else {  // alchDecouple
04336           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04337           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04338           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04339           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04340         }
04341       }
04342     }
04343 
04344     alchLambda = -1.;  // illegal value to catch if not updated
04345 
04346     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04347     reduction->submit();
04348 
04349   for ( int i=0; i<heldComputes.size(); ++i ) {
04350     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04351   }
04352   heldComputes.resize(0);
04353 }
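
As a reading aid, the thermodynamic-integration bookkeeping described in the comment block above (source lines 04302-04326) can be restated compactly. Writing U_g for the energy accumulated on grid g and lambda_up / lambda_down for the up and down electrostatic coupling parameters, the relations given in those comments are:

Without alchDecouple:
\[ U = \lambda_{\mathrm{up}}\,(U_0 - U_2) + \lambda_{\mathrm{down}}\,(U_1 - U_2), \qquad
   \frac{\partial U}{\partial \lambda_{\mathrm{up}}} = U_0 - U_2, \qquad
   \frac{\partial U}{\partial \lambda_{\mathrm{down}}} = U_1 - U_2 \]

With alchDecouple:
\[ U = \lambda_{\mathrm{up}}\,(U_0 - U_4) + (1-\lambda_{\mathrm{up}})\,U_2
     + \lambda_{\mathrm{down}}\,(U_1 - U_4) + (1-\lambda_{\mathrm{down}})\,U_3 \]
\[ \frac{\partial U}{\partial \lambda_{\mathrm{up}}} = U_0 - U_2 - U_4, \qquad
   \frac{\partial U}{\partial \lambda_{\mathrm{down}}} = U_1 - U_3 - U_4 \]

This matches the accumulation into REDUCTION_ELECT_ENERGY_PME_TI_1 and _TI_2 in the listing: TI_1 collects U_0 minus U_2 (and minus U_4 when decoupling), TI_2 collects U_1 minus U_2 (or minus U_3 and U_4 when decoupling).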

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2519 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02519                                    {
02520   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02521 
02522   ungridForcesCount = pmeComputes.size();
02523 
02524 #ifdef NAMD_CUDA
02525  if ( offload ) {
02526   //CmiLock(cuda_lock);
02527   cudaSetDevice(deviceCUDA->getDeviceID());
02528 
02529   if ( this == masterPmeMgr ) {
02530     double before = CmiWallTimer();
02531     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02532     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02533     // try to make the unspecified launch failures go away
02534     cudaEventSynchronize(nodePmeMgr->end_potential_memcpy);
02535     cuda_errcheck("in ComputePmeMgr::ungridCalc after potential memcpy");
02536     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02537 
02538     const int myrank = CkMyRank();
02539     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02540       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02541         nodePmeMgr->mgrObjects[i]->ungridCalc();
02542       }
02543     }
02544     if ( ! pmeComputes.size() ) return;
02545   }
02546 
02547   if ( ! end_forces ) {
02548     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02549     end_forces = new cudaEvent_t[n];
02550     for ( int i=0; i<n; ++i ) {
02551       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02552     }
02553   }
02554 
02555   const int pcsz = pmeComputes.size();
02556   if ( ! afn_host ) {
02557     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02558     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02559     cuda_errcheck("malloc params for pme");
02560   }
02561   int totn = 0;
02562   for ( int i=0; i<pcsz; ++i ) {
02563     int n = pmeComputes[i]->numGridAtoms[0];
02564     totn += n;
02565   }
02566   if ( totn > f_data_mgr_alloc ) {
02567     if ( f_data_mgr_alloc ) {
02568       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02569       cudaFree(f_data_mgr_dev);
02570       cudaFreeHost(f_data_mgr_host);
02571     }
02572     f_data_mgr_alloc = 1.2 * (totn + 100);
02573     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02574     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02575     cuda_errcheck("malloc forces for pme");
02576   }
02577   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02578   float *f_dev = f_data_mgr_dev;
02579   float *f_host = f_data_mgr_host;
02580   for ( int i=0; i<pcsz; ++i ) {
02581     int n = pmeComputes[i]->numGridAtoms[0];
02582     pmeComputes[i]->f_data_dev = f_dev;
02583     pmeComputes[i]->f_data_host = f_host;
02584     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02585     afn_host[3*i+1] = f_dev;
02586     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02587     f_dev += 3*n;
02588     f_host += 3*n;
02589   }
02590   //CmiLock(cuda_lock);
02591   double before = CmiWallTimer();
02592   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02593   cuda_errcheck("in ComputePmeMgr::ungridCalc after force pointer memcpy");
02594   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02595   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02596   cuda_errcheck("in ComputePmeMgr::ungridCalc after wait for potential memcpy");
02597   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02598 
02599   for ( int i=0; i<pcsz; ++i ) {
02600     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02601     if ( i%EVENT_STRIDE == 0 ) {
02602       int dimy = pcsz - i;
02603       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02604       int maxn = 0;
02605       int subtotn = 0;
02606       for ( int j=0; j<dimy; ++j ) {
02607         int n = pmeComputes[i+j]->numGridAtoms[0];
02608         subtotn += n;
02609         if ( n > maxn ) maxn = n;
02610       }
02611       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02612       before = CmiWallTimer();
02613       cuda_pme_forces(
02614         bspline_coeffs_dev,
02615         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02616         pmeComputes[i]->a_data_dev,
02617         pmeComputes[i]->f_data_dev,
02618         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02619         streams[stream]);
02620       cuda_errcheck("in ComputePmeMgr::ungridCalc after force kernel submit");
02621       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02622       before = CmiWallTimer();
02623       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02624         cudaMemcpyDeviceToHost, streams[stream]);
02625       cuda_errcheck("in ComputePmeMgr::ungridCalc after force memcpy submit");
02626       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02627       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02628       cuda_errcheck("in ComputePmeMgr::ungridCalc after end_forces event");
02629       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02630     }
02631     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02632   }
02633   //CmiUnlock(cuda_lock);
02634  } else
02635 #endif // NAMD_CUDA
02636  {
02637   for ( int i=0; i<pmeComputes.size(); ++i ) {
02638     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02639     // pmeComputes[i]->ungridForces();
02640   }
02641  }
02642   // submitReductions();  // must follow all ungridForces()
02643 
02644 #ifdef NAMD_CUDA
02645  if ( offload ) {
02646   forces_time = CmiWallTimer();
02647   forces_count = ungridForcesCount;
02648   forces_done_count = 0;
02649   pmeProxy[this_pe].pollForcesReady();
02650  }
02651 #endif
02652 
02653   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02654 }
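
For readers following the CUDA control flow: ungridCalc() groups the per-compute force work into batches of EVENT_STRIDE, records one cudaEvent_t (end_forces[i/EVENT_STRIDE]) per batch, and leaves completion detection to the polling path (pollForcesReady() / cuda_check_pme_forces(), not shown here). The following is a minimal standalone sketch of that batching-plus-polling pattern, not NAMD code: the EVENT_STRIDE value and the launch_batch() / batches_done() names are hypothetical, and the real code additionally interleaves the force kernel and device-to-host copy on streams[stream] as listed above.

    // Sketch only: batch asynchronous GPU work into groups of EVENT_STRIDE,
    // record one event per group, and poll for completion without blocking.
    #include <cuda_runtime.h>
    #include <cstdio>

    #define EVENT_STRIDE 10            // assumption: plays the same role as in ComputePme.C

    static cudaEvent_t *end_batches = 0; // one event per batch, allocated once

    void launch_batches(int nwork, cudaStream_t stream) {
      int nbatches = (nwork - 1) / EVENT_STRIDE + 1;
      if ( ! end_batches ) {
        end_batches = new cudaEvent_t[nbatches];
        for ( int b = 0; b < nbatches; ++b ) {
          // timing disabled: the events are only ever queried for completion
          cudaEventCreateWithFlags(&end_batches[b], cudaEventDisableTiming);
        }
      }
      for ( int i = 0; i < nwork; i += EVENT_STRIDE ) {
        int dimy = nwork - i;
        if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
        // launch_batch(i, dimy, stream);  // hypothetical kernel launch + async D2H copy
        cudaEventRecord(end_batches[i/EVENT_STRIDE], stream);
      }
    }

    // Non-blocking check; a caller would reschedule itself (as pollForcesReady()
    // does through the Charm++ scheduler) until this returns true.
    bool batches_done(int nwork) {
      int nbatches = (nwork - 1) / EVENT_STRIDE + 1;
      for ( int b = 0; b < nbatches; ++b ) {
        cudaError_t err = cudaEventQuery(end_batches[b]);
        if ( err == cudaErrorNotReady ) return false;
        if ( err != cudaSuccess ) fprintf(stderr, "CUDA error while polling batch %d\n", b);
      }
      return true;
    }

Creating the events with cudaEventDisableTiming keeps them cheap, which is why the listing above does the same for end_forces.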


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 357 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 358 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 419 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 418 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 444 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 430 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 432 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 423 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 442 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 441 of file ComputePme.C.

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 426 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 427 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 421 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 420 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 414 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().

int ComputePmeMgr::forces_count

Definition at line 428 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 429 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 431 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe

Definition at line 434 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 415 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 447 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 448 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 373 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 370 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 371 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 372 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 435 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file:
ComputePme.C