ComputePmeMgr Class Reference

Inheritance diagram for ComputePmeMgr:

ComputePmeUtil

Public Member Functions

 ComputePmeMgr ()
 ~ComputePmeMgr ()
void initialize (CkQdMsg *)
void initialize_pencils (CkQdMsg *)
void activate_pencils (CkQdMsg *)
void recvArrays (CProxy_PmeXPencil, CProxy_PmeYPencil, CProxy_PmeZPencil)
void initialize_computes ()
void sendData (Lattice &, int sequence)
void sendDataPart (int first, int last, Lattice &, int sequence, int sourcepe, int errors)
void sendPencils (Lattice &, int sequence)
void sendPencilsPart (int first, int last, Lattice &, int sequence, int sourcepe)
void recvGrid (PmeGridMsg *)
void gridCalc1 (void)
void sendTransBarrier (void)
void sendTransSubset (int first, int last)
void sendTrans (void)
void fwdSharedTrans (PmeTransMsg *)
void recvSharedTrans (PmeSharedTransMsg *)
void sendDataHelper (int)
void sendPencilsHelper (int)
void recvTrans (PmeTransMsg *)
void procTrans (PmeTransMsg *)
void gridCalc2 (void)
void gridCalc2R (void)
void fwdSharedUntrans (PmeUntransMsg *)
void recvSharedUntrans (PmeSharedUntransMsg *)
void sendUntrans (void)
void sendUntransSubset (int first, int last)
void recvUntrans (PmeUntransMsg *)
void procUntrans (PmeUntransMsg *)
void gridCalc3 (void)
void sendUngrid (void)
void sendUngridSubset (int first, int last)
void recvUngrid (PmeGridMsg *)
void recvAck (PmeAckMsg *)
void copyResults (PmeGridMsg *)
void copyPencils (PmeGridMsg *)
void ungridCalc (void)
void recvRecipEvir (PmeEvirMsg *)
void addRecipEvirClient (void)
void submitReductions ()
void chargeGridSubmitted (Lattice &lattice, int sequence)
void cuda_submit_charges (Lattice &lattice, int sequence)
void sendChargeGridReady ()
void pollChargeGridReady ()
void pollForcesReady ()
void recvChargeGridReady ()
void chargeGridReady (Lattice &lattice, int sequence)

Public Attributes

Lattice * sendDataHelper_lattice
int sendDataHelper_sequence
int sendDataHelper_sourcepe
int sendDataHelper_errors
CmiNodeLock pmemgr_lock
float * a_data_host
float * a_data_dev
float * f_data_host
float * f_data_dev
int cuda_atoms_count
int cuda_atoms_alloc
cudaEvent_t end_charges
cudaEvent_t * end_forces
int forces_count
int forces_done_count
double charges_time
double forces_time
int check_charges_count
int check_forces_count
int master_pe
int this_pe
int chargeGridSubmittedCount
Lattice * saved_lattice
int saved_sequence
ResizeArray< ComputePme * > pmeComputes

Static Public Attributes

static CmiNodeLock fftw_plan_lock
static CmiNodeLock cuda_lock
static std::deque< cuda_submit_charges_args > cuda_submit_charges_deque
static bool cuda_busy

Friends

class ComputePme
class NodePmeMgr

Classes

struct  cuda_submit_charges_args

Detailed Description

Definition at line 355 of file ComputePme.C.
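
ComputePmeMgr is the per-PE Charm++ group that manages the reciprocal-space (PME) part of the force calculation. It gathers charge grids from the local ComputePme objects, drives the forward and backward FFTs using either a slab decomposition over grid/trans PEs or a pencil decomposition over the PmeXPencil/PmeYPencil/PmeZPencil arrays delivered through recvArrays(), evaluates the reciprocal-space energy and virial, and returns the interpolated forces to the patches. The member functions form a message-driven pipeline; as the cross-references below indicate, recvGrid() triggers gridCalc1(), procTrans() triggers gridCalc2() and gridCalc2R(), procUntrans() triggers gridCalc3(), and recvUngrid() dispatches to copyResults() or copyPencils() before ungridCalc(). In NAMD_CUDA builds, charge spreading and force interpolation can be offloaded to the GPU through cuda_submit_charges() and the chargeGrid*/pollForcesReady() methods.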


Constructor & Destructor Documentation

ComputePmeMgr::ComputePmeMgr (  ) 

Definition at line 710 of file ComputePme.C.

References chargeGridSubmittedCount, check_charges_count, check_forces_count, cuda_atoms_alloc, cuda_atoms_count, cuda_errcheck(), CUDA_EVENT_ID_PME_CHARGES, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_FORCES, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, cuda_lock, CUDA_STREAM_CREATE, end_charges, end_forces, fftw_plan_lock, NUM_STREAMS, pmemgr_lock, stream, and this_pe.

00710                              : pmeProxy(thisgroup), 
00711                                  pmeProxyDir(thisgroup) {
00712 
00713   CkpvAccess(BOCclass_group).computePmeMgr = thisgroup;
00714   pmeNodeProxy = CkpvAccess(BOCclass_group).nodePmeMgr;
00715   nodePmeMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
00716 
00717   pmeNodeProxy.ckLocalBranch()->initialize();
00718 
00719   if ( CmiMyRank() == 0 ) {
00720     fftw_plan_lock = CmiCreateLock();
00721   }
00722   pmemgr_lock = CmiCreateLock();
00723 
00724   myKSpace = 0;
00725   kgrid = 0;
00726   work = 0;
00727   grid_count = 0;
00728   trans_count = 0;
00729   untrans_count = 0;
00730   ungrid_count = 0;
00731   gridmsg_reuse= new PmeGridMsg*[CkNumPes()];
00732   useBarrier = 0;
00733   sendTransBarrier_received = 0;
00734   usePencils = 0;
00735 
00736 #ifdef NAMD_CUDA
00737  // offload has not been set so this happens on every run
00738   if ( CmiMyRank() == 0 ) {
00739     cuda_lock = CmiCreateLock();
00740   }
00741 
00742 #if CUDA_VERSION >= 5050
00743   int leastPriority, greatestPriority;
00744   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00745   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00746   //if ( CkMyNode() == 0 ) {
00747   //  CkPrintf("Pe %d PME CUDA stream priority range %d %d\n", CkMyPe(), leastPriority, greatestPriority);
00748   //}
00749 #define CUDA_STREAM_CREATE(X) cudaStreamCreateWithPriority(X,cudaStreamDefault,greatestPriority)
00750 #else
00751 #define CUDA_STREAM_CREATE(X) cudaStreamCreate(X)
00752 #endif
00753 
00754   stream = 0;
00755   for ( int i=0; i<NUM_STREAMS; ++i ) {
00756 #if 1
00757     CUDA_STREAM_CREATE(&streams[i]);
00758     cuda_errcheck("cudaStreamCreate");
00759 #else
00760   streams[i] = 0;  // XXXX Testing!!!
00761 #endif
00762   }
00763 
00764   this_pe = CkMyPe();
00765  
00766   cudaEventCreateWithFlags(&end_charges,cudaEventDisableTiming);
00767   end_forces = 0;
00768   check_charges_count = 0;
00769   check_forces_count = 0;
00770   chargeGridSubmittedCount = 0;
00771 
00772   cuda_atoms_count = 0;
00773   cuda_atoms_alloc = 0;
00774 
00775   f_data_mgr_alloc = 0;
00776   f_data_mgr_host = 0;
00777   f_data_mgr_dev = 0;
00778   afn_host = 0;
00779   afn_dev = 0;
00780 
00781 #define CUDA_EVENT_ID_PME_CHARGES 80
00782 #define CUDA_EVENT_ID_PME_FORCES 81
00783 #define CUDA_EVENT_ID_PME_TICK 82
00784 #define CUDA_EVENT_ID_PME_COPY 83
00785 #define CUDA_EVENT_ID_PME_KERNEL 84
00786   if ( 0 == CkMyPe() ) {
00787     traceRegisterUserEvent("CUDA PME charges", CUDA_EVENT_ID_PME_CHARGES);
00788     traceRegisterUserEvent("CUDA PME forces", CUDA_EVENT_ID_PME_FORCES);
00789     traceRegisterUserEvent("CUDA PME tick", CUDA_EVENT_ID_PME_TICK);
00790     traceRegisterUserEvent("CUDA PME memcpy", CUDA_EVENT_ID_PME_COPY);
00791     traceRegisterUserEvent("CUDA PME kernel", CUDA_EVENT_ID_PME_KERNEL);
00792   }
00793 #endif
00794   recipEvirCount = 0;
00795   recipEvirClients = 0;
00796   recipEvirPe = -999;
00797 }

ComputePmeMgr::~ComputePmeMgr (  ) 

Definition at line 1794 of file ComputePme.C.

References fftw_plan_lock, and pmemgr_lock.

01794                               {
01795 
01796   if ( CmiMyRank() == 0 ) {
01797     CmiDestroyLock(fftw_plan_lock);
01798   }
01799   CmiDestroyLock(pmemgr_lock);
01800 
01801   delete myKSpace;
01802   delete [] localInfo;
01803   delete [] gridNodeInfo;
01804   delete [] transNodeInfo;
01805   delete [] gridPeMap;
01806   delete [] transPeMap;
01807   delete [] recipPeDest;
01808   delete [] gridPeOrder;
01809   delete [] gridNodeOrder;
01810   delete [] transNodeOrder;
01811   delete [] qgrid;
01812   if ( kgrid != qgrid ) delete [] kgrid;
01813   delete [] work;
01814   delete [] gridmsg_reuse;
01815 
01816  if ( ! offload ) {
01817   for (int i=0; i<q_count; ++i) {
01818     delete [] q_list[i];
01819   }
01820   delete [] q_list;
01821   delete [] fz_arr;
01822  }
01823   delete [] f_arr;
01824   delete [] q_arr;
01825 }


Member Function Documentation

void ComputePmeMgr::activate_pencils ( CkQdMsg *   ) 

Definition at line 1788 of file ComputePme.C.

01788                                                  {
01789   if ( ! usePencils ) return;
01790   if ( CkMyPe() == 0 ) zPencil.dummyRecvGrid(CkMyPe(),1);
01791 }

void ComputePmeMgr::addRecipEvirClient ( void   ) 

Definition at line 3005 of file ComputePme.C.

03005                                        {
03006   ++recipEvirClients;
03007 }

void ComputePmeMgr::chargeGridReady ( Lattice &  lattice,
int  sequence 
)

Definition at line 3522 of file ComputePme.C.

References j, PmeGrid::K3, NAMD_bug(), PmeGrid::order, pmeComputes, sendData(), sendPencils(), and ResizeArray< Elem >::size().

Referenced by recvChargeGridReady().

03522                                                                   {
03523 
03524 #ifdef NAMD_CUDA
03525  if ( offload ) {
03526   int errcount = 0;
03527   int q_stride = myGrid.K3+myGrid.order-1;
03528   for (int n=fsize+q_stride, j=fsize; j<n; ++j) {
03529     f_arr[j] = ffz_host[j];
03530     if ( ffz_host[j] & ~1 ) ++errcount;
03531   }
03532   if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::chargeGridReady");
03533  }
03534 #endif
03535   recipEvirCount = recipEvirClients;
03536   ungridForcesCount = pmeComputes.size();
03537 
03538   for (int j=0; j<myGrid.order-1; ++j) {
03539     fz_arr[j] |= fz_arr[myGrid.K3+j];
03540   }
03541 
03542   if ( usePencils ) {
03543     sendPencils(lattice,sequence);
03544   } else {
03545     sendData(lattice,sequence);
03546   }
03547 }

void ComputePmeMgr::chargeGridSubmitted ( Lattice &  lattice,
int  sequence 
)

Definition at line 3458 of file ComputePme.C.

References chargeGridSubmittedCount, computeMgr, CUDA_EVENT_ID_PME_COPY, deviceCUDA, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, end_charges, DeviceCUDA::getMasterPe(), master_pe, Node::Object(), saved_lattice, saved_sequence, Node::simParameters, and simParams.

Referenced by cuda_submit_charges().

03458                                                                       {
03459   saved_lattice = &lattice;
03460   saved_sequence = sequence;
03461 
03462   // cudaDeviceSynchronize();  //  XXXX TESTING
03463   //int q_stride = myGrid.K3+myGrid.order-1;
03464   //for (int n=fsize+q_stride, j=0; j<n; ++j) {
03465   //  if ( ffz_host[j] != 0 && ffz_host[j] != 1 ) {
03466   //    CkPrintf("pre-memcpy flag %d/%d == %d on pe %d in ComputePmeMgr::chargeGridReady\n", j, n, ffz_host[j], CkMyPe());
03467   //  }
03468   //}
03469   //CmiLock(cuda_lock);
03470 
03471  if ( --(masterPmeMgr->chargeGridSubmittedCount) == 0 ) {
03472   double before = CmiWallTimer();
03473   cudaEventRecord(nodePmeMgr->end_all_pme_kernels, 0);  // when all streams complete
03474   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_all_pme_kernels, 0);
03475   cudaMemcpyAsync(q_data_host, q_data_dev, q_data_size+ffz_size,
03476                         cudaMemcpyDeviceToHost, streams[stream]);
03477   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
03478   cudaEventRecord(masterPmeMgr->end_charges, streams[stream]);
03479   cudaMemsetAsync(q_data_dev, 0, q_data_size + ffz_size, streams[stream]);  // for next time
03480   cudaEventRecord(nodePmeMgr->end_charge_memset, streams[stream]);
03481   //CmiUnlock(cuda_lock);
03482   // cudaDeviceSynchronize();  //  XXXX TESTING
03483   // cuda_errcheck("after memcpy grid to host");
03484 
03485   SimParameters *simParams = Node::Object()->simParameters;
03486   if ( ! simParams->useCUDA2 ) {
03487     CProxy_ComputeMgr cm(CkpvAccess(BOCclass_group).computeMgr);
03488     cm[deviceCUDA->getMasterPe()].recvYieldDevice(-1);
03489   }
03490 
03491   pmeProxy[master_pe].pollChargeGridReady();
03492  }
03493 }

void ComputePmeMgr::copyPencils ( PmeGridMsg * msg )

Definition at line 3768 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, PmeGrid::dim3, j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::sourceNode, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03768                                                {
03769 
03770   int K1 = myGrid.K1;
03771   int K2 = myGrid.K2;
03772   int dim2 = myGrid.dim2;
03773   int dim3 = myGrid.dim3;
03774   int block1 = myGrid.block1;
03775   int block2 = myGrid.block2;
03776 
03777   // msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
03778   int ib = msg->sourceNode / yBlocks;
03779   int jb = msg->sourceNode % yBlocks;
03780 
03781   int ibegin = ib*block1;
03782   int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03783   int jbegin = jb*block2;
03784   int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03785 
03786   int zlistlen = msg->zlistlen;
03787   int *zlist = msg->zlist;
03788   float *qmsg = msg->qgrid;
03789   int g;
03790   for ( g=0; g<numGrids; ++g ) {
03791     char *f = f_arr + g*fsize;
03792     float **q = q_arr + g*fsize;
03793     for ( int i=ibegin; i<iend; ++i ) {
03794      for ( int j=jbegin; j<jend; ++j ) {
03795       if( f[i*dim2+j] ) {
03796         f[i*dim2+j] = 0;
03797         for ( int k=0; k<zlistlen; ++k ) {
03798           q[i*dim2+j][zlist[k]] = *(qmsg++);
03799         }
03800         for (int h=0; h<myGrid.order-1; ++h) {
03801           q[i*dim2+j][myGrid.K3+h] = q[i*dim2+j][h];
03802         }
03803       }
03804      }
03805     }
03806   }
03807 }

void ComputePmeMgr::copyResults ( PmeGridMsg * msg )

Definition at line 3960 of file ComputePme.C.

References PmeGrid::dim3, PmeGridMsg::fgrid, PmeGrid::K3, PmeGridMsg::len, PmeGrid::order, PmeGridMsg::qgrid, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by recvUngrid().

03960                                                {
03961 
03962   int zdim = myGrid.dim3;
03963   int flen = msg->len;
03964   int fstart = msg->start;
03965   int zlistlen = msg->zlistlen;
03966   int *zlist = msg->zlist;
03967   float *qmsg = msg->qgrid;
03968   int g;
03969   for ( g=0; g<numGrids; ++g ) {
03970     char *f = msg->fgrid + g*flen;
03971     float **q = q_arr + fstart + g*fsize;
03972     for ( int i=0; i<flen; ++i ) {
03973       if ( f[i] ) {
03974         f[i] = 0;
03975         for ( int k=0; k<zlistlen; ++k ) {
03976           q[i][zlist[k]] = *(qmsg++);
03977         }
03978         for (int h=0; h<myGrid.order-1; ++h) {
03979           q[i][myGrid.K3+h] = q[i][h];
03980         }
03981       }
03982     }
03983   }
03984 }

void ComputePmeMgr::cuda_submit_charges ( Lattice &  lattice,
int  sequence 
)

Definition at line 3405 of file ComputePme.C.

References a_data_dev, chargeGridSubmitted(), charges_time, cuda_atoms_count, CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, NodePmeMgr::end_charge_memset, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, and PmeGrid::order.

03405                                                                       {
03406 
03407     int n = cuda_atoms_count;
03408     //CkPrintf("pe %d cuda_atoms_count %d\n", CkMyPe(), cuda_atoms_count);
03409     cuda_atoms_count = 0;
03410 
03411     const double before = CmiWallTimer();
03412     cudaMemcpyAsync(a_data_dev, a_data_host, 7*n*sizeof(float),
03413                           cudaMemcpyHostToDevice, streams[stream]);
03414     const double after = CmiWallTimer();
03415 
03416     cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_charge_memset, 0);
03417 
03418     cuda_pme_charges(
03419       bspline_coeffs_dev,
03420       q_arr_dev, ffz_dev, ffz_dev + fsize,
03421       a_data_dev, n,
03422       myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
03423       streams[stream]);
03424     const double after2 = CmiWallTimer();
03425 
03426     chargeGridSubmitted(lattice,sequence);  // must be inside lock
03427 
03428     masterPmeMgr->charges_time = before;
03429     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,after);
03430     traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,after,after2);
03431 }
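
The comment "must be inside lock" above refers to the static cuda_lock: every ComputePmeMgr in the process shares one CUDA device, so charge submissions are serialized through cuda_lock, the cuda_busy flag, and the cuda_submit_charges_deque of cuda_submit_charges_args entries listed under Static Public Attributes and Classes. A minimal sketch of that calling pattern is shown below; the local variable thisMgr and the field names mgr, lattice, and sequence are illustrative assumptions, not the verbatim NAMD call site.

    // Hedged sketch only -- reconstructed from the static members above.
    CmiLock(ComputePmeMgr::cuda_lock);
    if ( ComputePmeMgr::cuda_busy ) {
      // the device is already processing another PE's charges: queue the request
      ComputePmeMgr::cuda_submit_charges_args args;
      args.mgr = thisMgr;        // assumed fields: submitting manager,
      args.lattice = &lattice;   // lattice pointer and timestep sequence number
      args.sequence = sequence;
      ComputePmeMgr::cuda_submit_charges_deque.push_back(args);
    } else {
      ComputePmeMgr::cuda_busy = true;
      thisMgr->cuda_submit_charges(lattice, sequence);  // called with lock held
    }
    CmiUnlock(ComputePmeMgr::cuda_lock);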

void ComputePmeMgr::fwdSharedTrans ( PmeTransMsg * msg )

Definition at line 2014 of file ComputePme.C.

References NodePmeInfo::npe, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::sequence, and SET_PRIORITY.

Referenced by NodePmeMgr::recvTrans(), and sendTransSubset().

02014                                                    {
02015   // CkPrintf("fwdSharedTrans on Pe(%d)\n",CkMyPe());
02016   int pe = transNodeInfo[myTransNode].pe_start;
02017   int npe = transNodeInfo[myTransNode].npe;
02018   CmiNodeLock lock = CmiCreateLock();
02019   int *count = new int; *count = npe;
02020   for (int i=0; i<npe; ++i, ++pe) {
02021     PmeSharedTransMsg *shmsg = new (PRIORITY_SIZE) PmeSharedTransMsg;
02022     SET_PRIORITY(shmsg,msg->sequence,PME_TRANS_PRIORITY)
02023     shmsg->msg = msg;
02024     shmsg->count = count;
02025     shmsg->lock = lock;
02026     pmeProxy[transPeMap[pe]].recvSharedTrans(shmsg);
02027   }
02028 }
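
fwdSharedTrans() shares a single PmeTransMsg among the npe PEs of a node: each PE receives a PmeSharedTransMsg carrying a pointer to the original message, a heap-allocated counter initialized to npe, and a freshly created node lock. The matching receive path, recvSharedTrans(), is expected to do its share of the work and let the last receiver free the shared resources. A minimal sketch of that last-one-frees pattern, assuming recvSharedTrans() forwards to procTrans() (the actual implementation in ComputePme.C may differ in detail):

    void ComputePmeMgr::recvSharedTrans(PmeSharedTransMsg *shmsg) {
      procTrans(shmsg->msg);              // this PE's share of the transpose work
      CmiLock(shmsg->lock);
      int remaining = --(*shmsg->count);  // shared counter started at npe
      CmiUnlock(shmsg->lock);
      if ( remaining == 0 ) {             // last PE on the node cleans up
        CmiDestroyLock(shmsg->lock);
        delete shmsg->count;
        delete shmsg->msg;
      }
      delete shmsg;
    }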

void ComputePmeMgr::fwdSharedUntrans ( PmeUntransMsg * msg )

Definition at line 2270 of file ComputePme.C.

References NodePmeInfo::npe, and NodePmeInfo::pe_start.

Referenced by NodePmeMgr::recvUntrans(), and sendUntransSubset().

02270                                                        {
02271   int pe = gridNodeInfo[myGridNode].pe_start;
02272   int npe = gridNodeInfo[myGridNode].npe;
02273   CmiNodeLock lock = CmiCreateLock();
02274   int *count = new int; *count = npe;
02275   for (int i=0; i<npe; ++i, ++pe) {
02276     PmeSharedUntransMsg *shmsg = new PmeSharedUntransMsg;
02277     shmsg->msg = msg;
02278     shmsg->count = count;
02279     shmsg->lock = lock;
02280     pmeProxy[gridPeMap[pe]].recvSharedUntrans(shmsg);
02281   }
02282 }

void ComputePmeMgr::gridCalc1 ( void   ) 

Definition at line 1906 of file ComputePme.C.

References PmeGrid::dim2, PmeGrid::dim3, and ComputePmeUtil::numGrids.

Referenced by recvGrid().

01906                                   {
01907   // CkPrintf("gridCalc1 on Pe(%d)\n",CkMyPe());
01908 
01909 #ifdef NAMD_FFTW
01910   for ( int g=0; g<numGrids; ++g ) {
01911 #ifdef NAMD_FFTW_3
01912     fftwf_execute(forward_plan_yz[g]);
01913 #else
01914     rfftwnd_real_to_complex(forward_plan_yz, localInfo[myGridPe].nx,
01915         qgrid + qgrid_size * g, 1, myGrid.dim2 * myGrid.dim3, 0, 0, 0);
01916 #endif
01917 
01918   }
01919 #endif
01920 
01921   if ( ! useBarrier ) pmeProxyDir[CkMyPe()].sendTrans();
01922 }

void ComputePmeMgr::gridCalc2 ( void   ) 

Definition at line 2082 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2R(), ComputePmeUtil::numGrids, LocalPmeInfo::ny_after_transpose, and simParams.

Referenced by procTrans().

02082                                   {
02083   // CkPrintf("gridCalc2 on Pe(%d)\n",CkMyPe());
02084 
02085 #if CMK_BLUEGENEL
02086   CmiNetworkProgressAfter (0);
02087 #endif
02088 
02089   int zdim = myGrid.dim3;
02090   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02091   int ny = localInfo[myTransPe].ny_after_transpose;
02092 
02093   for ( int g=0; g<numGrids; ++g ) {
02094     // finish forward FFT (x dimension)
02095 #ifdef NAMD_FFTW
02096 #ifdef NAMD_FFTW_3
02097     fftwf_execute(forward_plan_x[g]);
02098 #else
02099     fftw(forward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02100         ny * zdim / 2, 1, work, 1, 0);
02101 #endif
02102 #endif
02103   }
02104 
02105 #ifdef OPENATOM_VERSION
02106     if ( ! simParams -> openatomOn ) { 
02107 #endif // OPENATOM_VERSION
02108       gridCalc2R();
02109 #ifdef OPENATOM_VERSION
02110     } else {
02111       gridCalc2Moa();
02112     }
02113 #endif // OPENATOM_VERSION
02114 }

void ComputePmeMgr::gridCalc2R ( void   ) 

Definition at line 2142 of file ComputePme.C.

References CKLOOP_CTRL_PME_KSPACE, PmeKSpace::compute_energy(), PmeGrid::dim3, ComputeNonbondedUtil::ewaldcof, LocalPmeInfo::ny_after_transpose, and Node::Object().

Referenced by gridCalc2().

02142                                    {
02143 
02144   int useCkLoop = 0;
02145 #if CMK_SMP && USE_CKLOOP
02146   if ( Node::Object()->simParameters->useCkLoop >= CKLOOP_CTRL_PME_KSPACE
02147        && CkNumPes() >= 2 * numTransPes ) {
02148     useCkLoop = 1;
02149   }
02150 #endif
02151 
02152   int zdim = myGrid.dim3;
02153   // int y_start = localInfo[myTransPe].y_start_after_transpose;
02154   int ny = localInfo[myTransPe].ny_after_transpose;
02155 
02156   for ( int g=0; g<numGrids; ++g ) {
02157     // reciprocal space portion of PME
02158     BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
02159     recip_evir2[g][0] = myKSpace->compute_energy(kgrid+qgrid_size*g,
02160                         lattice, ewaldcof, &(recip_evir2[g][1]), useCkLoop);
02161     // CkPrintf("Ewald reciprocal energy = %f\n", recip_evir2[g][0]);
02162 
02163     // start backward FFT (x dimension)
02164 
02165 #ifdef NAMD_FFTW
02166 #ifdef NAMD_FFTW_3
02167     fftwf_execute(backward_plan_x[g]);
02168 #else
02169     fftw(backward_plan_x, ny * zdim / 2, (fftw_complex *)(kgrid+qgrid_size*g),
02170         ny * zdim / 2, 1, work, 1, 0);
02171 #endif
02172 #endif
02173   }
02174   
02175   pmeProxyDir[CkMyPe()].sendUntrans();
02176 }
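
gridCalc2R() evaluates the reciprocal-space portion of the PME sum through PmeKSpace::compute_energy(), storing the energy and virial in recip_evir2[g] before starting the backward FFT along x. For reference, the quantity accumulated here is the standard smooth-PME reciprocal energy (Essmann et al. convention; prefactor conventions vary between references), with beta the Ewald coefficient passed in as ewaldcof, V the unit-cell volume, and S(m) the charge structure factor interpolated onto the grid:

    E_rec = \frac{1}{2\pi V} \sum_{\mathbf{m} \neq 0}
            \frac{\exp(-\pi^2 |\mathbf{m}|^2 / \beta^2)}{|\mathbf{m}|^2} \, |S(\mathbf{m})|^2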

void ComputePmeMgr::gridCalc3 ( void   ) 

Definition at line 2344 of file ComputePme.C.

References PmeGrid::dim2, and PmeGrid::dim3.

Referenced by procUntrans().

02344                                   {
02345   // CkPrintf("gridCalc3 on Pe(%d)\n",CkMyPe());
02346 
02347   // finish backward FFT
02348 #ifdef NAMD_FFTW
02349   for ( int g=0; g<numGrids; ++g ) {
02350 #ifdef NAMD_FFTW_3
02351     fftwf_execute(backward_plan_yz[g]);
02352 #else
02353     rfftwnd_complex_to_real(backward_plan_yz, localInfo[myGridPe].nx,
02354         (fftw_complex *) (qgrid + qgrid_size * g),
02355         1, myGrid.dim2 * myGrid.dim3 / 2, 0, 0, 0);
02356 #endif
02357   }
02358 
02359 #endif
02360 
02361   pmeProxyDir[CkMyPe()].sendUngrid();
02362 }

void ComputePmeMgr::initialize ( CkQdMsg *   ) 

Definition at line 862 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, cuda_errcheck(), deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, ResizeArray< Elem >::end(), endi(), fftw_plan_lock, findRecipEvirPe(), generatePmePeList2(), DeviceCUDA::getDeviceID(), PmePencilInitMsgData::grid, if(), iINFO(), iout, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PatchMap::max_a(), PatchMap::min_a(), NAMD_bug(), NAMD_die(), PatchMap::node(), NodePmeInfo::npe, ComputePmeUtil::numGrids, PatchMap::numNodesWithPatches(), PatchMap::numPatches(), numPatches, PatchMap::numPatchesOnNode(), LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, PatchMap::Object(), Node::Object(), DeviceCUDA::one_device_per_node(), PmeGrid::order, NodePmeInfo::pe_start, WorkDistrib::peDiffuseOrdering, pencilPMEProcessors, PmePencilInitMsgData::pmeNodeProxy, PmePencilInitMsgData::pmeProxy, NodePmeInfo::real_node, Random::reorder(), ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), WorkDistrib::sortPmePes(), Vector::unit(), LocalPmeInfo::x_start, PmePencilInitMsgData::xBlocks, PmePencilInitMsgData::xm, PmePencilInitMsgData::xPencil, LocalPmeInfo::y_start_after_transpose, PmePencilInitMsgData::yBlocks, PmePencilInitMsgData::ym, PmePencilInitMsgData::yPencil, PmePencilInitMsgData::zBlocks, PmePencilInitMsgData::zm, and PmePencilInitMsgData::zPencil.

00862                                            {
00863   delete msg;
00864 
00865   localInfo = new LocalPmeInfo[CkNumPes()];
00866   gridNodeInfo = new NodePmeInfo[CkNumNodes()];
00867   transNodeInfo = new NodePmeInfo[CkNumNodes()];
00868   gridPeMap = new int[CkNumPes()];
00869   transPeMap = new int[CkNumPes()];
00870   recipPeDest = new int[CkNumPes()];
00871   gridPeOrder = new int[CkNumPes()];
00872   gridNodeOrder = new int[CkNumNodes()];
00873   transNodeOrder = new int[CkNumNodes()];
00874 
00875   if (CkMyRank() == 0) {
00876     pencilPMEProcessors = new char [CkNumPes()];
00877     memset (pencilPMEProcessors, 0, sizeof(char) * CkNumPes());
00878   }
00879 
00880   SimParameters *simParams = Node::Object()->simParameters;
00881   PatchMap *patchMap = PatchMap::Object();
00882 
00883   offload = simParams->PMEOffload;
00884 #ifdef NAMD_CUDA
00885   if ( offload && ! deviceCUDA->one_device_per_node() ) {
00886     NAMD_die("PME offload requires exactly one CUDA device per process.  Use \"PMEOffload no\".");
00887   }
00888   if ( offload ) {
00889     int dev;
00890     cudaGetDevice(&dev);
00891     cuda_errcheck("in cudaGetDevice");
00892     if ( dev != deviceCUDA->getDeviceID() ) NAMD_bug("ComputePmeMgr::initialize dev != deviceCUDA->getDeviceID()");
00893     cudaDeviceProp deviceProp;
00894     cudaGetDeviceProperties(&deviceProp, dev);
00895     cuda_errcheck("in cudaGetDeviceProperties");
00896     if ( deviceProp.major < 2 )
00897       NAMD_die("PME offload requires CUDA device of compute capability 2.0 or higher.  Use \"PMEOffload no\".");
00898   }
00899 #endif
00900 
00901   alchLambda = -1.;  // illegal value to catch if not updated
00902   alchLambda2 = -1.;
00903   useBarrier = simParams->PMEBarrier;
00904 
00905   if ( numGrids != 1 || simParams->PMEPencils == 0 ) usePencils = 0;
00906   else if ( simParams->PMEPencils > 0 ) usePencils = 1;
00907   else {
00908     int nrps = simParams->PMEProcessors;
00909     if ( nrps <= 0 ) nrps = CkNumPes();
00910     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00911     int dimx = simParams->PMEGridSizeX;
00912     int dimy = simParams->PMEGridSizeY;
00913     int maxslabs = 1 + (dimx - 1) / simParams->PMEMinSlices;
00914     if ( maxslabs > nrps ) maxslabs = nrps;
00915     int maxpencils = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00916                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00917     if ( maxpencils > nrps ) maxpencils = nrps;
00918     if ( maxpencils > 3 * maxslabs ) usePencils = 1;
00919     else usePencils = 0;
00920   }
00921 
00922   if ( usePencils ) {
00923     int nrps = simParams->PMEProcessors;
00924     if ( nrps <= 0 ) nrps = CkNumPes();
00925     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00926     if ( simParams->PMEPencils > 1 &&
00927          simParams->PMEPencils * simParams->PMEPencils <= nrps ) {
00928       xBlocks = yBlocks = zBlocks = simParams->PMEPencils;
00929     } else {
00930       int nb2 = ( simParams->PMEGridSizeX * (int64) simParams->PMEGridSizeY
00931                 * simParams->PMEGridSizeZ ) / simParams->PMEMinPoints;
00932       if ( nb2 > nrps ) nb2 = nrps;
00933       if ( nb2 < 1 ) nb2 = 1;
00934       int nb = (int) sqrt((float)nb2);
00935       if ( nb < 1 ) nb = 1;
00936       xBlocks = zBlocks = nb;
00937       yBlocks = nb2 / nb;
00938     }
00939 
00940     if ( simParams->PMEPencilsX > 0 ) xBlocks = simParams->PMEPencilsX;
00941     if ( simParams->PMEPencilsY > 0 ) yBlocks = simParams->PMEPencilsY;
00942     if ( simParams->PMEPencilsZ > 0 ) zBlocks = simParams->PMEPencilsZ;
00943 
00944     int dimx = simParams->PMEGridSizeX;
00945     int bx = 1 + ( dimx - 1 ) / xBlocks;
00946     xBlocks = 1 + ( dimx - 1 ) / bx;
00947 
00948     int dimy = simParams->PMEGridSizeY;
00949     int by = 1 + ( dimy - 1 ) / yBlocks;
00950     yBlocks = 1 + ( dimy - 1 ) / by;
00951 
00952     int dimz = simParams->PMEGridSizeZ / 2 + 1;  // complex
00953     int bz = 1 + ( dimz - 1 ) / zBlocks;
00954     zBlocks = 1 + ( dimz - 1 ) / bz;
00955 
00956     if ( xBlocks * yBlocks > CkNumPes() ) {
00957       NAMD_die("PME pencils xBlocks * yBlocks > numPes");
00958     }
00959     if ( xBlocks * zBlocks > CkNumPes() ) {
00960       NAMD_die("PME pencils xBlocks * zBlocks > numPes");
00961     }
00962     if ( yBlocks * zBlocks > CkNumPes() ) {
00963       NAMD_die("PME pencils yBlocks * zBlocks > numPes");
00964     }
00965 
00966     if ( ! CkMyPe() ) {
00967       iout << iINFO << "PME using " << xBlocks << " x " <<
00968         yBlocks << " x " << zBlocks <<
00969         " pencil grid for FFT and reciprocal sum.\n" << endi;
00970     }
00971   } else { // usePencils
00972 
00973   {  // decide how many pes to use for reciprocal sum
00974 
00975     // rules based on work available
00976     int minslices = simParams->PMEMinSlices;
00977     int dimx = simParams->PMEGridSizeX;
00978     int nrpx = ( dimx + minslices - 1 ) / minslices;
00979     int dimy = simParams->PMEGridSizeY;
00980     int nrpy = ( dimy + minslices - 1 ) / minslices;
00981 
00982     // rules based on processors available
00983     int nrpp = CkNumPes();
00984     // if ( nrpp > 32 ) nrpp = 32;  // cap to limit messages
00985     if ( nrpp < nrpx ) nrpx = nrpp;
00986     if ( nrpp < nrpy ) nrpy = nrpp;
00987 
00988     // user override
00989     int nrps = simParams->PMEProcessors;
00990     if ( nrps > CkNumPes() ) nrps = CkNumPes();
00991     if ( nrps > 0 ) nrpx = nrps;
00992     if ( nrps > 0 ) nrpy = nrps;
00993 
00994     // make sure there aren't any totally empty processors
00995     int bx = ( dimx + nrpx - 1 ) / nrpx;
00996     nrpx = ( dimx + bx - 1 ) / bx;
00997     int by = ( dimy + nrpy - 1 ) / nrpy;
00998     nrpy = ( dimy + by - 1 ) / by;
00999     if ( bx != ( dimx + nrpx - 1 ) / nrpx )
01000       NAMD_bug("Error in selecting number of PME processors.");
01001     if ( by != ( dimy + nrpy - 1 ) / nrpy )
01002       NAMD_bug("Error in selecting number of PME processors.");
01003 
01004     numGridPes = nrpx;
01005     numTransPes = nrpy;
01006   }
01007   if ( ! CkMyPe() ) {
01008     iout << iINFO << "PME using " << numGridPes << " and " << numTransPes <<
01009       " processors for FFT and reciprocal sum.\n" << endi;
01010   }
01011 
01012   int sum_npes = numTransPes + numGridPes;
01013   int max_npes = (numTransPes > numGridPes)?numTransPes:numGridPes;
01014 
01015 #if 0 // USE_TOPOMAP
01016   /* This code is being disabled permanently for slab PME on Blue Gene machines */
01017   PatchMap * pmap = PatchMap::Object();
01018   
01019   int patch_pes = pmap->numNodesWithPatches();
01020   TopoManager tmgr;
01021   if(tmgr.hasMultipleProcsPerNode())
01022     patch_pes *= 2;
01023 
01024   bool done = false;
01025   if(CkNumPes() > 2*sum_npes + patch_pes) {    
01026     done = generateBGLORBPmePeList(transPeMap, numTransPes);
01027     done &= generateBGLORBPmePeList(gridPeMap, numGridPes, transPeMap, numTransPes);    
01028   }
01029   else 
01030     if(CkNumPes() > 2 *max_npes + patch_pes) {
01031       done = generateBGLORBPmePeList(transPeMap, max_npes);
01032       gridPeMap = transPeMap;
01033     }
01034 
01035   if (!done)
01036 #endif
01037     {
01038       //generatePmePeList(transPeMap, max_npes);
01039       //gridPeMap = transPeMap;
01040       generatePmePeList2(gridPeMap, numGridPes, transPeMap, numTransPes);
01041     }
01042   
01043   if ( ! CkMyPe() ) {
01044     iout << iINFO << "PME GRID LOCATIONS:";
01045     int i;
01046     for ( i=0; i<numGridPes && i<10; ++i ) {
01047       iout << " " << gridPeMap[i];
01048     }
01049     if ( i < numGridPes ) iout << " ...";
01050     iout << "\n" << endi;
01051     iout << iINFO << "PME TRANS LOCATIONS:";
01052     for ( i=0; i<numTransPes && i<10; ++i ) {
01053       iout << " " << transPeMap[i];
01054     }
01055     if ( i < numTransPes ) iout << " ...";
01056     iout << "\n" << endi;
01057   }
01058 
01059   // sort based on nodes and physical nodes
01060   std::sort(gridPeMap,gridPeMap+numGridPes,WorkDistrib::pe_sortop_compact());
01061 
01062   myGridPe = -1;
01063   myGridNode = -1;
01064   int i = 0;
01065   int node = -1;
01066   int real_node = -1;
01067   for ( i=0; i<numGridPes; ++i ) {
01068     if ( gridPeMap[i] == CkMyPe() ) myGridPe = i;
01069     if (CkMyRank() == 0) pencilPMEProcessors[gridPeMap[i]] |= 1;
01070     int real_node_i = CkNodeOf(gridPeMap[i]);
01071     if ( real_node_i == real_node ) {
01072       gridNodeInfo[node].npe += 1;
01073     } else {
01074       real_node = real_node_i;
01075       ++node;
01076       gridNodeInfo[node].real_node = real_node;
01077       gridNodeInfo[node].pe_start = i;
01078       gridNodeInfo[node].npe = 1;
01079     }
01080     if ( CkMyNode() == real_node_i ) myGridNode = node;
01081   }
01082   numGridNodes = node + 1;
01083   myTransPe = -1;
01084   myTransNode = -1;
01085   node = -1;
01086   real_node = -1;
01087   for ( i=0; i<numTransPes; ++i ) {
01088     if ( transPeMap[i] == CkMyPe() ) myTransPe = i;
01089     if (CkMyRank() == 0) pencilPMEProcessors[transPeMap[i]] |= 2;
01090     int real_node_i = CkNodeOf(transPeMap[i]);
01091     if ( real_node_i == real_node ) {
01092       transNodeInfo[node].npe += 1;
01093     } else {
01094       real_node = real_node_i;
01095       ++node;
01096       transNodeInfo[node].real_node = real_node;
01097       transNodeInfo[node].pe_start = i;
01098       transNodeInfo[node].npe = 1;
01099     }
01100     if ( CkMyNode() == real_node_i ) myTransNode = node;
01101   }
01102   numTransNodes = node + 1;
01103 
01104   if ( ! CkMyPe() ) {
01105     iout << iINFO << "PME USING " << numGridNodes << " GRID NODES AND "
01106          << numTransNodes << " TRANS NODES\n" << endi;
01107   }
01108 
01109   { // generate random orderings for grid and trans messages
01110     int i;
01111     for ( i = 0; i < numGridPes; ++i ) {
01112       gridPeOrder[i] = i;
01113     }
01114     Random rand(CkMyPe());
01115     if ( myGridPe < 0 ) {
01116       rand.reorder(gridPeOrder,numGridPes);
01117     } else {  // self last
01118       gridPeOrder[myGridPe] = numGridPes-1;
01119       gridPeOrder[numGridPes-1] = myGridPe;
01120       rand.reorder(gridPeOrder,numGridPes-1);
01121     } 
01122     for ( i = 0; i < numGridNodes; ++i ) {
01123       gridNodeOrder[i] = i;
01124     }
01125     if ( myGridNode < 0 ) {
01126       rand.reorder(gridNodeOrder,numGridNodes);
01127     } else {  // self last
01128       gridNodeOrder[myGridNode] = numGridNodes-1;
01129       gridNodeOrder[numGridNodes-1] = myGridNode;
01130       rand.reorder(gridNodeOrder,numGridNodes-1);
01131     }
01132     for ( i = 0; i < numTransNodes; ++i ) {
01133       transNodeOrder[i] = i;
01134     }
01135     if ( myTransNode < 0 ) {
01136       rand.reorder(transNodeOrder,numTransNodes);
01137     } else {  // self last
01138       transNodeOrder[myTransNode] = numTransNodes-1;
01139       transNodeOrder[numTransNodes-1] = myTransNode;
01140       rand.reorder(transNodeOrder,numTransNodes-1);
01141     }
01142   }
01143   
01144   } // ! usePencils
01145 
01146   myGrid.K1 = simParams->PMEGridSizeX;
01147   myGrid.K2 = simParams->PMEGridSizeY;
01148   myGrid.K3 = simParams->PMEGridSizeZ;
01149   myGrid.order = simParams->PMEInterpOrder;
01150   myGrid.dim2 = myGrid.K2;
01151   myGrid.dim3 = 2 * (myGrid.K3/2 + 1);
01152 
01153   if ( ! usePencils ) {
01154     myGrid.block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
01155     myGrid.block2 = ( myGrid.K2 + numTransPes - 1 ) / numTransPes;
01156     myGrid.block3 = myGrid.dim3 / 2;  // complex
01157   }
01158 
01159   if ( usePencils ) {
01160     myGrid.block1 = ( myGrid.K1 + xBlocks - 1 ) / xBlocks;
01161     myGrid.block2 = ( myGrid.K2 + yBlocks - 1 ) / yBlocks;
01162     myGrid.block3 = ( myGrid.K3/2 + 1 + zBlocks - 1 ) / zBlocks;  // complex
01163 
01164 
01165       int pe = 0;
01166       int x,y,z;
01167 
01168                 SortableResizeArray<int> zprocs(xBlocks*yBlocks);
01169                 SortableResizeArray<int> yprocs(xBlocks*zBlocks);
01170                 SortableResizeArray<int> xprocs(yBlocks*zBlocks);
01171       
01172                 // decide which pes to use by bit reversal and patch use
01173                 int i;
01174                 int ncpus = CkNumPes();
01175                 SortableResizeArray<int> patches, nopatches, pmeprocs;
01176                 PatchMap *pmap = PatchMap::Object();
01177                 for ( int icpu=0; icpu<ncpus; ++icpu ) {
01178                         int ri = WorkDistrib::peDiffuseOrdering[icpu];
01179                         if ( ri ) { // keep 0 for special case
01180                                 // pretend pe 1 has patches to avoid placing extra PME load on node
01181                                 if ( ri == 1 || pmap->numPatchesOnNode(ri) ) patches.add(ri);
01182                                 else nopatches.add(ri);
01183                         }
01184                 }
01185 
01186 #if USE_RANDOM_TOPO
01187             Random rand(CkMyPe());
01188             int *tmp = new int[patches.size()];
01189             int nn = patches.size();
01190             for (i=0;i<nn;i++)  tmp[i] = patches[i];
01191             rand.reorder(tmp, nn);
01192             patches.resize(0);
01193             for (i=0;i<nn;i++)  patches.add(tmp[i]);
01194             delete [] tmp;
01195             tmp = new int[nopatches.size()];
01196             nn = nopatches.size();
01197             for (i=0;i<nn;i++)  tmp[i] = nopatches[i];
01198             rand.reorder(tmp, nn);
01199             nopatches.resize(0);
01200             for (i=0;i<nn;i++)  nopatches.add(tmp[i]);
01201             delete [] tmp;
01202 #endif
01203 
01204                 // only use zero if it eliminates overloading or has patches
01205                 int useZero = 0;
01206                 int npens = xBlocks*yBlocks;
01207                 if ( npens % ncpus == 0 ) useZero = 1;
01208                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01209                 npens += xBlocks*zBlocks;
01210                 if ( npens % ncpus == 0 ) useZero = 1;
01211                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01212                 npens += yBlocks*zBlocks;
01213                 if ( npens % ncpus == 0 ) useZero = 1;
01214                 if ( npens == nopatches.size() + 1 ) useZero = 1;
01215 
01216                 // add nopatches then patches in reversed order
01217                 for ( i=nopatches.size()-1; i>=0; --i ) pmeprocs.add(nopatches[i]);
01218                 if ( useZero && ! pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01219                 for ( i=patches.size()-1; i>=0; --i ) pmeprocs.add(patches[i]);
01220                 if ( pmap->numPatchesOnNode(0) ) pmeprocs.add(0);
01221   
01222                 int npes = pmeprocs.size();
01223                 for ( i=0; i<xBlocks*yBlocks; ++i, ++pe ) zprocs[i] = pmeprocs[pe%npes];
01224                 if ( i>1 && zprocs[0] == zprocs[i-1] ) zprocs[0] = 0;
01225 #if !USE_RANDOM_TOPO
01226                 zprocs.sort();
01227 #endif
01228                 for ( i=0; i<xBlocks*zBlocks; ++i, ++pe ) yprocs[i] = pmeprocs[pe%npes];
01229                 if ( i>1 && yprocs[0] == yprocs[i-1] ) yprocs[0] = 0;
01230 #if !USE_RANDOM_TOPO
01231                 yprocs.sort();
01232 #endif
01233       for ( i=0; i<yBlocks*zBlocks; ++i, ++pe ) xprocs[i] = pmeprocs[pe%npes];
01234       if ( i>1 && xprocs[0] == xprocs[i-1] ) xprocs[0] = 0;
01235 #if !USE_RANDOM_TOPO
01236       xprocs.sort();
01237 #endif
01238 
01239 #if USE_TOPO_SFC
01240   CmiLock(tmgr_lock);
01241   //{
01242   TopoManager tmgr;
01243   int xdim = tmgr.getDimNX();
01244   int ydim = tmgr.getDimNY();
01245   int zdim = tmgr.getDimNZ();
01246   int xdim1 = find_level_grid(xdim);
01247   int ydim1 = find_level_grid(ydim);
01248   int zdim1 = find_level_grid(zdim);
01249   if(CkMyPe() == 0)
01250       printf("xdim: %d %d %d, %d %d %d\n", xdim, ydim, zdim, xdim1, ydim1, zdim1);
01251 
01252   vector<Coord> result;
01253   SFC_grid(xdim, ydim, zdim, xdim1, ydim1, zdim1, result);
01254   sort_sfc(xprocs, tmgr, result);
01255   sort_sfc(yprocs, tmgr, result);
01256   sort_sfc(zprocs, tmgr, result);
01257   //}
01258   CmiUnlock(tmgr_lock);
01259 #endif
01260 
01261 
01262                 if(CkMyPe() == 0){  
01263               iout << iINFO << "PME Z PENCIL LOCATIONS:";
01264           for ( i=0; i<zprocs.size() && i<10; ++i ) {
01265 #if USE_TOPO_SFC
01266               int x,y,z,t;
01267               tmgr.rankToCoordinates(zprocs[i], x,y, z, t);
01268               iout << " " << zprocs[i] << "(" << x << " " << y << " " << z << ")";
01269 #else
01270               iout << " " << zprocs[i];
01271 #endif
01272           }
01273           if ( i < zprocs.size() ) iout << " ...";
01274               iout << "\n" << endi;
01275                 }
01276 
01277     if (CkMyRank() == 0) {
01278       for (pe=0, x = 0; x < xBlocks; ++x)
01279         for (y = 0; y < yBlocks; ++y, ++pe ) {
01280           pencilPMEProcessors[zprocs[pe]] = 1;
01281         }
01282     }
01283      
01284                 if(CkMyPe() == 0){  
01285               iout << iINFO << "PME Y PENCIL LOCATIONS:";
01286           for ( i=0; i<yprocs.size() && i<10; ++i ) {
01287 #if USE_TOPO_SFC
01288               int x,y,z,t;
01289               tmgr.rankToCoordinates(yprocs[i], x,y, z, t);
01290               iout << " " << yprocs[i] << "(" << x << " " << y << " " << z << ")";
01291 #else
01292               iout << " " << yprocs[i];
01293 #endif
01294           }
01295           if ( i < yprocs.size() ) iout << " ...";
01296               iout << "\n" << endi;
01297                 }
01298 
01299     if (CkMyRank() == 0) {
01300       for (pe=0, z = 0; z < zBlocks; ++z )
01301         for (x = 0; x < xBlocks; ++x, ++pe ) {
01302           pencilPMEProcessors[yprocs[pe]] = 1;
01303         }
01304     }
01305     
01306                 if(CkMyPe() == 0){  
01307                 iout << iINFO << "PME X PENCIL LOCATIONS:";
01308                     for ( i=0; i<xprocs.size() && i<10; ++i ) {
01309 #if USE_TOPO_SFC
01310                 int x,y,z,t;
01311                 tmgr.rankToCoordinates(xprocs[i], x,y, z, t);
01312                 iout << " " << xprocs[i] << "(" << x << "  " << y << " " << z << ")";
01313 #else
01314                 iout << " " << xprocs[i];
01315 #endif
01316             }
01317                 if ( i < xprocs.size() ) iout << " ...";
01318                 iout << "\n" << endi;
01319                 }
01320 
01321     if (CkMyRank() == 0) {
01322       for (pe=0, y = 0; y < yBlocks; ++y )      
01323         for (z = 0; z < zBlocks; ++z, ++pe ) {
01324           pencilPMEProcessors[xprocs[pe]] = 1;
01325         }
01326     }
01327         
01328 
01329         // creating the pencil arrays
01330         if ( CkMyPe() == 0 ){
01331 #if !USE_RANDOM_TOPO
01332         // std::sort(zprocs.begin(),zprocs.end(),WorkDistrib::pe_sortop_compact());
01333         WorkDistrib::sortPmePes(zprocs.begin(),xBlocks,yBlocks);
01334         std::sort(yprocs.begin(),yprocs.end(),WorkDistrib::pe_sortop_compact());
01335         std::sort(xprocs.begin(),xprocs.end(),WorkDistrib::pe_sortop_compact());
01336 #endif
01337 #if 1
01338         CProxy_PmePencilMap zm = CProxy_PmePencilMap::ckNew(0,1,yBlocks,xBlocks*yBlocks,zprocs.begin());
01339         CProxy_PmePencilMap ym;
01340         if ( simParams->PMEPencilsYLayout )
01341           ym = CProxy_PmePencilMap::ckNew(0,2,zBlocks,zBlocks*xBlocks,yprocs.begin()); // new
01342         else
01343           ym = CProxy_PmePencilMap::ckNew(2,0,xBlocks,zBlocks*xBlocks,yprocs.begin()); // old
01344         CProxy_PmePencilMap xm;
01345         if ( simParams->PMEPencilsXLayout )
01346           xm = CProxy_PmePencilMap::ckNew(2,1,yBlocks,yBlocks*zBlocks,xprocs.begin()); // new
01347         else
01348           xm = CProxy_PmePencilMap::ckNew(1,2,zBlocks,yBlocks*zBlocks,xprocs.begin()); // old
01349         pmeNodeProxy.recvPencilMapProxies(xm,ym,zm);
01350         CkArrayOptions zo(xBlocks,yBlocks,1);  zo.setMap(zm);
01351         CkArrayOptions yo(xBlocks,1,zBlocks);  yo.setMap(ym);
01352         CkArrayOptions xo(1,yBlocks,zBlocks);  xo.setMap(xm);
01353         zo.setAnytimeMigration(false);  zo.setStaticInsertion(true);
01354         yo.setAnytimeMigration(false);  yo.setStaticInsertion(true);
01355         xo.setAnytimeMigration(false);  xo.setStaticInsertion(true);
01356         zPencil = CProxy_PmeZPencil::ckNew(zo);  // (xBlocks,yBlocks,1);
01357         yPencil = CProxy_PmeYPencil::ckNew(yo);  // (xBlocks,1,zBlocks);
01358         xPencil = CProxy_PmeXPencil::ckNew(xo);  // (1,yBlocks,zBlocks);
01359 #else
01360         zPencil = CProxy_PmeZPencil::ckNew();  // (xBlocks,yBlocks,1);
01361         yPencil = CProxy_PmeYPencil::ckNew();  // (xBlocks,1,zBlocks);
01362         xPencil = CProxy_PmeXPencil::ckNew();  // (1,yBlocks,zBlocks);
01363 
01364                 for (pe=0, x = 0; x < xBlocks; ++x)
01365                         for (y = 0; y < yBlocks; ++y, ++pe ) {
01366                                 zPencil(x,y,0).insert(zprocs[pe]);
01367                         }
01368         zPencil.doneInserting();
01369 
01370                 for (pe=0, x = 0; x < xBlocks; ++x)
01371                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01372                                 yPencil(x,0,z).insert(yprocs[pe]);
01373                         }
01374         yPencil.doneInserting();
01375 
01376 
01377                 for (pe=0, y = 0; y < yBlocks; ++y )    
01378                         for (z = 0; z < zBlocks; ++z, ++pe ) {
01379                                 xPencil(0,y,z).insert(xprocs[pe]);
01380                         }
01381                 xPencil.doneInserting();     
01382 #endif
01383 
01384                 pmeProxy.recvArrays(xPencil,yPencil,zPencil);
01385                 PmePencilInitMsgData msgdata;
01386                 msgdata.grid = myGrid;
01387                 msgdata.xBlocks = xBlocks;
01388                 msgdata.yBlocks = yBlocks;
01389                 msgdata.zBlocks = zBlocks;
01390                 msgdata.xPencil = xPencil;
01391                 msgdata.yPencil = yPencil;
01392                 msgdata.zPencil = zPencil;
01393                 msgdata.pmeProxy = pmeProxyDir;
01394         msgdata.pmeNodeProxy = pmeNodeProxy;
01395         msgdata.xm = xm;
01396         msgdata.ym = ym;
01397         msgdata.zm = zm;
01398                 xPencil.init(new PmePencilInitMsg(msgdata));
01399                 yPencil.init(new PmePencilInitMsg(msgdata));
01400                 zPencil.init(new PmePencilInitMsg(msgdata));
01401         }
01402 
01403     return;  // continue in initialize_pencils() at next startup stage
01404   }
01405 
01406 
01407   int pe;
01408   int nx = 0;
01409   for ( pe = 0; pe < numGridPes; ++pe ) {
01410     localInfo[pe].x_start = nx;
01411     nx += myGrid.block1;
01412     if ( nx > myGrid.K1 ) nx = myGrid.K1;
01413     localInfo[pe].nx = nx - localInfo[pe].x_start;
01414   }
01415   int ny = 0;
01416   for ( pe = 0; pe < numTransPes; ++pe ) {
01417     localInfo[pe].y_start_after_transpose = ny;
01418     ny += myGrid.block2;
01419     if ( ny > myGrid.K2 ) ny = myGrid.K2;
01420     localInfo[pe].ny_after_transpose =
01421                         ny - localInfo[pe].y_start_after_transpose;
01422   }
01423 
01424   {  // decide how many pes this node exchanges charges with
01425 
01426   PatchMap *patchMap = PatchMap::Object();
01427   Lattice lattice = simParams->lattice;
01428   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01429   BigReal cutoff = simParams->cutoff;
01430   BigReal patchdim = simParams->patchDimension;
01431   int numPatches = patchMap->numPatches();
01432   int numNodes = CkNumPes();
01433   int *source_flags = new int[numNodes];
01434   int node;
01435   for ( node=0; node<numNodes; ++node ) {
01436     source_flags[node] = 0;
01437     recipPeDest[node] = 0;
01438   }
01439 
01440   // // make sure that we don't get ahead of ourselves on this node
01441   // if ( CkMyPe() < numPatches && myRecipPe >= 0 ) {
01442   //   source_flags[CkMyPe()] = 1;
01443   //   recipPeDest[myRecipPe] = 1;
01444   // }
01445 
01446   for ( int pid=0; pid < numPatches; ++pid ) {
01447     int pnode = patchMap->node(pid);
01448 #ifdef NAMD_CUDA
01449     if ( offload ) pnode = CkNodeFirst(CkNodeOf(pnode));
01450 #endif
01451     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01452     BigReal minx = patchMap->min_a(pid);
01453     BigReal maxx = patchMap->max_a(pid);
01454     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01455     // min1 (max1) is smallest (largest) grid line for this patch
01456     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01457     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01458     for ( int i=min1; i<=max1; ++i ) {
01459       int ix = i;
01460       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01461       while ( ix < 0 ) ix += myGrid.K1;
01462       // set source_flags[pnode] if this patch sends to our node
01463       if ( myGridPe >= 0 && ix >= localInfo[myGridPe].x_start &&
01464            ix < localInfo[myGridPe].x_start + localInfo[myGridPe].nx ) {
01465         source_flags[pnode] = 1;
01466       }
01467       // set dest_flags[] for node that our patch sends to
01468 #ifdef NAMD_CUDA
01469       if ( offload ) {
01470         if ( pnode == CkNodeFirst(CkMyNode()) ) {
01471           recipPeDest[ix / myGrid.block1] = 1;
01472         }
01473       } else
01474 #endif
01475       if ( pnode == CkMyPe() ) {
01476         recipPeDest[ix / myGrid.block1] = 1;
01477       }
01478     }
01479   }
01480 
01481   int numSourcesSamePhysicalNode = 0;
01482   numSources = 0;
01483   numDestRecipPes = 0;
01484   for ( node=0; node<numNodes; ++node ) {
01485     if ( source_flags[node] ) ++numSources;
01486     if ( recipPeDest[node] ) ++numDestRecipPes;
01487     if ( source_flags[node] && CmiPeOnSamePhysicalNode(node,CkMyPe()) ) ++numSourcesSamePhysicalNode;
01488   }
01489 
01490 #if 0
01491   if ( numSources ) {
01492     CkPrintf("pe %5d pme %5d of %5d on same physical node\n",
01493             CkMyPe(), numSourcesSamePhysicalNode, numSources);
01494     iout << iINFO << "PME " << CkMyPe() << " sources:";
01495     for ( node=0; node<numNodes; ++node ) {
01496       if ( source_flags[node] ) iout << " " << node;
01497     }
01498     iout << "\n" << endi;
01499   }
01500 #endif
01501 
01502   delete [] source_flags;
01503 
01504   // CkPrintf("PME on node %d has %d sources and %d destinations\n",
01505   //           CkMyPe(), numSources, numDestRecipPes);
01506 
01507   }  // decide how many pes this node exchanges charges with (end)
01508 
01509   ungrid_count = numDestRecipPes;
01510 
01511   sendTransBarrier_received = 0;
01512 
01513   if ( myGridPe < 0 && myTransPe < 0 ) return;
01514   // the following only for nodes doing reciprocal sum
01515 
01516   if ( myTransPe >= 0 ) {
01517     recipEvirPe = findRecipEvirPe();
01518     pmeProxy[recipEvirPe].addRecipEvirClient();
01519   }
01520 
01521   if ( myTransPe >= 0 ) {
01522       int k2_start = localInfo[myTransPe].y_start_after_transpose;
01523       int k2_end = k2_start + localInfo[myTransPe].ny_after_transpose;
01524       #ifdef OPENATOM_VERSION
01525       if ( simParams->openatomOn ) { 
01526         CProxy_ComputeMoaMgr moaProxy(CkpvAccess(BOCclass_group).computeMoaMgr);
01527         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2, moaProxy);
01528       } else {
01529         myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01530       }
01531       #else  // OPENATOM_VERSION
01532       myKSpace = new PmeKSpace(myGrid, k2_start, k2_end, 0, myGrid.dim3/2);
01533       #endif // OPENATOM_VERSION
01534   }
01535 
01536   int local_size = myGrid.block1 * myGrid.K2 * myGrid.dim3;
01537   int local_size_2 = myGrid.block2 * myGrid.K1 * myGrid.dim3;
01538   if ( local_size < local_size_2 ) local_size = local_size_2;
01539   qgrid = new float[local_size*numGrids];
01540   if ( numGridPes > 1 || numTransPes > 1 ) {
01541     kgrid = new float[local_size*numGrids];
01542   } else {
01543     kgrid = qgrid;
01544   }
01545   qgrid_size = local_size;
01546 
01547   if ( myGridPe >= 0 ) {
01548   qgrid_start = localInfo[myGridPe].x_start * myGrid.K2 * myGrid.dim3;
01549   qgrid_len = localInfo[myGridPe].nx * myGrid.K2 * myGrid.dim3;
01550   fgrid_start = localInfo[myGridPe].x_start * myGrid.K2;
01551   fgrid_len = localInfo[myGridPe].nx * myGrid.K2;
01552   }
01553 
01554   int n[3]; n[0] = myGrid.K1; n[1] = myGrid.K2; n[2] = myGrid.K3;
01555 #ifdef NAMD_FFTW
01556   CmiLock(fftw_plan_lock);
01557 #ifdef NAMD_FFTW_3
01558   work = new fftwf_complex[n[0]];
01559   int fftwFlags = simParams->FFTWPatient ? FFTW_PATIENT  : simParams->FFTWEstimate ? FFTW_ESTIMATE  : FFTW_MEASURE ;
01560   if ( myGridPe >= 0 ) {
01561     forward_plan_yz=new fftwf_plan[numGrids];
01562     backward_plan_yz=new fftwf_plan[numGrids];
01563   }
01564   if ( myTransPe >= 0 ) {
01565     forward_plan_x=new fftwf_plan[numGrids];
01566     backward_plan_x=new fftwf_plan[numGrids];
01567   }
01568   /* need one plan per grid */
01569   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01570   if ( myGridPe >= 0 ) {
01571     for( int g=0; g<numGrids; g++)
01572       {
01573         forward_plan_yz[g] = fftwf_plan_many_dft_r2c(2, n+1, 
01574                                                      localInfo[myGridPe].nx,
01575                                                      qgrid + qgrid_size * g,
01576                                                      NULL,
01577                                                      1,
01578                                                      myGrid.dim2 * myGrid.dim3,
01579                                                      (fftwf_complex *) 
01580                                                      (qgrid + qgrid_size * g),
01581                                                      NULL,
01582                                                      1,
01583                                                      myGrid.dim2 * (myGrid.dim3/2),
01584                                                      fftwFlags);
01585       }
01586   }
01587   int zdim = myGrid.dim3;
01588   int xStride=localInfo[myTransPe].ny_after_transpose *( myGrid.dim3 / 2);
01589   if ( ! CkMyPe() ) iout << " 2..." << endi;
01590   if ( myTransPe >= 0 ) {
01591     for( int g=0; g<numGrids; g++)
01592       {
01593 
01594         forward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01595                                                 (fftwf_complex *)
01596                                                 (kgrid+qgrid_size*g),
01597                                                 NULL,
01598                                                 xStride,
01599                                                 1,
01600                                                 (fftwf_complex *)
01601                                                 (kgrid+qgrid_size*g),
01602                                                 NULL,
01603                                                 xStride,
01604                                                 1,
01605                                                 FFTW_FORWARD,fftwFlags);
01606         
01607       }
01608   }
01609   if ( ! CkMyPe() ) iout << " 3..." << endi;
01610   if ( myTransPe >= 0 ) {
01611     for( int g=0; g<numGrids; g++)
01612       {
01613         backward_plan_x[g] = fftwf_plan_many_dft(1, n, xStride,
01614                                                  (fftwf_complex *)
01615                                                  (kgrid+qgrid_size*g),
01616                                                  NULL,
01617                                                  xStride,
01618                                                  1,
01619                                                  (fftwf_complex *)
01620                                                  (kgrid+qgrid_size*g),
01621                                                  NULL,
01622                                                  xStride,
01623                                                  1,
01624                                                  FFTW_BACKWARD, fftwFlags);
01625 
01626       }
01627   }
01628   if ( ! CkMyPe() ) iout << " 4..." << endi;
01629   if ( myGridPe >= 0 ) {
01630     for( int g=0; g<numGrids; g++)
01631       {
01632         backward_plan_yz[g] = fftwf_plan_many_dft_c2r(2, n+1, 
01633                                                       localInfo[myGridPe].nx,
01634                                                       (fftwf_complex *)
01635                                                       (qgrid + qgrid_size * g),
01636                                                       NULL,
01637                                                       1,
01638                                                       myGrid.dim2*(myGrid.dim3/2),
01639                                                       qgrid + qgrid_size * g,
01640                                                       NULL,
01641                                                       1,
01642                                                       myGrid.dim2 * myGrid.dim3,
01643                                                       fftwFlags);
01644       }
01645   }
01646   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01647 
01648 #else
01649   work = new fftw_complex[n[0]];
01650 
01651   if ( ! CkMyPe() ) iout << iINFO << "Optimizing 4 FFT steps.  1..." << endi;
01652   if ( myGridPe >= 0 ) {
01653   forward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_REAL_TO_COMPLEX,
01654         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01655         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01656   }
01657   if ( ! CkMyPe() ) iout << " 2..." << endi;
01658   if ( myTransPe >= 0 ) {
01659       forward_plan_x = fftw_create_plan_specific(n[0], FFTW_FORWARD,
01660         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01661         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01662         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01663   }
01664   if ( ! CkMyPe() ) iout << " 3..." << endi;
01665   if ( myTransPe >= 0 ) {
01666   backward_plan_x = fftw_create_plan_specific(n[0], FFTW_BACKWARD,
01667         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01668         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) kgrid,
01669         localInfo[myTransPe].ny_after_transpose * myGrid.dim3 / 2, work, 1);
01670   }
01671   if ( ! CkMyPe() ) iout << " 4..." << endi;
01672   if ( myGridPe >= 0 ) {
01673   backward_plan_yz = rfftwnd_create_plan_specific(2, n+1, FFTW_COMPLEX_TO_REAL,
01674         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
01675         | FFTW_IN_PLACE | FFTW_USE_WISDOM, qgrid, 1, 0, 0);
01676   }
01677   if ( ! CkMyPe() ) iout << "   Done.\n" << endi;
01678 #endif
01679   CmiUnlock(fftw_plan_lock);
01680 #else
01681   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
01682 #endif
01683 
01684   if ( myGridPe >= 0 && numSources == 0 )
01685                 NAMD_bug("PME grid elements exist without sources.");
01686   grid_count = numSources;
01687   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
01688   trans_count = numGridPes;
01689 }
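
The x-direction plans above use FFTW's advanced interface to batch many 1-D complex transforms over a strided, in-place layout (stride xStride, distance 1). A minimal standalone sketch of the same fftwf_plan_many_dft call pattern follows; the sizes are purely illustrative, not NAMD's grid dimensions.

    // Sketch only: batched, strided, in-place 1-D complex FFT, the same
    // advanced-interface pattern as the forward/backward x plans above.
    // Sizes are illustrative, not NAMD's grid dimensions.
    #include <fftw3.h>
    #include <complex>
    #include <vector>

    int main() {
      const int n[1] = { 32 };   // transform length (plays the role of K1)
      const int howmany = 8;     // number of interleaved transforms (xStride)
      std::vector<std::complex<float> > data(n[0] * howmany);

      fftwf_complex *buf = reinterpret_cast<fftwf_complex *>(&data[0]);
      // howmany transforms, stride 'howmany' within each transform,
      // consecutive transforms offset by 1 -- i.e. interleaved, in place.
      fftwf_plan p = fftwf_plan_many_dft(1, n, howmany,
                                         buf, NULL, howmany, 1,
                                         buf, NULL, howmany, 1,
                                         FFTW_FORWARD, FFTW_ESTIMATE);
      fftwf_execute(p);
      fftwf_destroy_plan(p);
      return 0;
    }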

void ComputePmeMgr::initialize_computes (  ) 

Definition at line 2706 of file ComputePme.C.

References chargeGridSubmittedCount, cuda_errcheck(), cuda_init_bspline_coeffs(), cuda_lock, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, NodePmeMgr::end_all_pme_kernels, NodePmeMgr::end_charge_memset, NodePmeMgr::end_potential_memcpy, f, DeviceCUDA::getDeviceID(), DeviceCUDA::getMasterPe(), j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, NodePmeMgr::master_pe, master_pe, NodePmeMgr::masterPmeMgr, NodePmeMgr::mgrObjects, NAMD_bug(), PatchMap::numPatchesOnNode(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), PmeGrid::order, REDUCTIONS_BASIC, Node::simParameters, simParams, ReductionMgr::willSubmit(), and XCOPY.

02706                                         {
02707 
02708   noWorkCount = 0;
02709   doWorkCount = 0;
02710   ungridForcesCount = 0;
02711 
02712   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
02713 
02714   SimParameters *simParams = Node::Object()->simParameters;
02715 
02716   strayChargeErrors = 0;
02717 
02718 #ifdef NAMD_CUDA
02719  PatchMap *patchMap = PatchMap::Object();
02720  int pe = master_pe = CkNodeFirst(CkMyNode());
02721  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
02722     if ( ! patchMap->numPatchesOnNode(master_pe) ) master_pe = pe;
02723     if ( ! patchMap->numPatchesOnNode(pe) ) continue;
02724     if ( master_pe < 1 && pe != deviceCUDA->getMasterPe() ) master_pe = pe;
02725     if ( master_pe == deviceCUDA->getMasterPe() ) master_pe = pe;
02726     if ( WorkDistrib::pe_sortop_diffuse()(pe,master_pe)
02727         && pe != deviceCUDA->getMasterPe() ) {
02728       master_pe = pe;
02729     }
02730  }
02731  if ( ! patchMap->numPatchesOnNode(master_pe) ) {
02732    NAMD_bug("ComputePmeMgr::initialize_computes() master_pe has no patches.");
02733  }
02734 
02735  masterPmeMgr = nodePmeMgr->mgrObjects[master_pe - CkNodeFirst(CkMyNode())];
02736  bool cudaFirst = 1;
02737  if ( offload ) {
02738   CmiLock(cuda_lock);
02739   cudaFirst = ! masterPmeMgr->chargeGridSubmittedCount++;
02740  }
02741 
02742  if ( cudaFirst ) {
02743   nodePmeMgr->master_pe = master_pe;
02744   nodePmeMgr->masterPmeMgr = masterPmeMgr;
02745  }
02746 #endif
02747 
02748   qsize = myGrid.K1 * myGrid.dim2 * myGrid.dim3;
02749   fsize = myGrid.K1 * myGrid.dim2;
02750   if ( myGrid.K2 != myGrid.dim2 ) NAMD_bug("PME myGrid.K2 != myGrid.dim2");
02751 #ifdef NAMD_CUDA
02752  if ( ! offload )
02753 #endif
02754  {
02755   q_arr = new float*[fsize*numGrids];
02756   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02757   q_list = new float*[fsize*numGrids];
02758   memset( (void*) q_list, 0, fsize*numGrids * sizeof(float*) );
02759   q_count = 0;
02760  }
02761 
02762 #ifdef NAMD_CUDA
02763  if ( cudaFirst || ! offload ) {
02764 #endif
02765   f_arr = new char[fsize*numGrids];
02766   // memset to non-zero value has race condition on BlueGene/Q
02767   // memset( (void*) f_arr, 2, fsize*numGrids * sizeof(char) );
02768   for ( int n=fsize*numGrids, i=0; i<n; ++i ) f_arr[i] = 2;
02769 
02770   for ( int g=0; g<numGrids; ++g ) {
02771     char *f = f_arr + g*fsize;
02772     if ( usePencils ) {
02773       int K1 = myGrid.K1;
02774       int K2 = myGrid.K2;
02775       int block1 = ( K1 + xBlocks - 1 ) / xBlocks;
02776       int block2 = ( K2 + yBlocks - 1 ) / yBlocks;
02777       int dim2 = myGrid.dim2;
02778       for (int ap=0; ap<numPencilsActive; ++ap) {
02779         int ib = activePencils[ap].i;
02780         int jb = activePencils[ap].j;
02781         int ibegin = ib*block1;
02782         int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
02783         int jbegin = jb*block2;
02784         int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
02785         int flen = numGrids * (iend - ibegin) * (jend - jbegin);
02786         for ( int i=ibegin; i<iend; ++i ) {
02787           for ( int j=jbegin; j<jend; ++j ) {
02788             f[i*dim2+j] = 0;
02789           }
02790         }
02791       }
02792     } else {
02793       int block1 = ( myGrid.K1 + numGridPes - 1 ) / numGridPes;
02794       bsize = block1 * myGrid.dim2 * myGrid.dim3;
02795       for (int pe=0; pe<numGridPes; pe++) {
02796         if ( ! recipPeDest[pe] ) continue;
02797         int start = pe * bsize;
02798         int len = bsize;
02799         if ( start >= qsize ) { start = 0; len = 0; }
02800         if ( start + len > qsize ) { len = qsize - start; }
02801         int zdim = myGrid.dim3;
02802         int fstart = start / zdim;
02803         int flen = len / zdim;
02804         memset(f + fstart, 0, flen*sizeof(char));
02805         // CkPrintf("pe %d enabled slabs %d to %d\n", CkMyPe(), fstart/myGrid.dim2, (fstart+flen)/myGrid.dim2-1);
02806       }
02807     }
02808   }
02809 #ifdef NAMD_CUDA
02810  }
02811  if ( offload ) {
02812  cudaSetDevice(deviceCUDA->getDeviceID());
02813  if ( cudaFirst ) {
02814 
02815   int f_alloc_count = 0;
02816   for ( int n=fsize, i=0; i<n; ++i ) {
02817     if ( f_arr[i] == 0 ) {
02818       ++f_alloc_count;
02819     }
02820   }
02821   // CkPrintf("pe %d f_alloc_count == %d (%d slabs)\n", CkMyPe(), f_alloc_count, f_alloc_count/myGrid.dim2);
02822 
02823   q_arr = new float*[fsize*numGrids];
02824   memset( (void*) q_arr, 0, fsize*numGrids * sizeof(float*) );
02825 
02826   float **q_arr_dev_host = new float*[fsize];
02827   cudaMalloc((void**) &q_arr_dev, fsize * sizeof(float*));
02828 
02829   float **v_arr_dev_host = new float*[fsize];
02830   cudaMalloc((void**) &v_arr_dev, fsize * sizeof(float*));
02831 
02832   int q_stride = myGrid.K3+myGrid.order-1;
02833   q_data_size = f_alloc_count * q_stride * sizeof(float);
02834   ffz_size = (fsize + q_stride) * sizeof(int);
02835 
02836   // tack ffz onto end of q_data to allow merged transfer
02837   cudaMallocHost((void**) &q_data_host, q_data_size+ffz_size);
02838   ffz_host = (int*)(((char*)q_data_host) + q_data_size);
02839   cudaMalloc((void**) &q_data_dev, q_data_size+ffz_size);
02840   ffz_dev = (int*)(((char*)q_data_dev) + q_data_size);
02841   cudaMalloc((void**) &v_data_dev, q_data_size);
02842   cuda_errcheck("malloc grid data for pme");
02843   cudaMemset(q_data_dev, 0, q_data_size + ffz_size);  // for first time
02844   cudaEventCreateWithFlags(&(nodePmeMgr->end_charge_memset),cudaEventDisableTiming);
02845   cudaEventRecord(nodePmeMgr->end_charge_memset, 0);
02846   cudaEventCreateWithFlags(&(nodePmeMgr->end_all_pme_kernels),cudaEventDisableTiming);
02847   cudaEventCreateWithFlags(&(nodePmeMgr->end_potential_memcpy),cudaEventDisableTiming);
02848 
02849   f_alloc_count = 0;
02850   for ( int n=fsize, i=0; i<n; ++i ) {
02851     if ( f_arr[i] == 0 ) {
02852       q_arr[i] = q_data_host + f_alloc_count * q_stride;
02853       q_arr_dev_host[i] = q_data_dev + f_alloc_count * q_stride;
02854       v_arr_dev_host[i] = v_data_dev + f_alloc_count * q_stride;
02855       ++f_alloc_count;
02856     } else {
02857       q_arr[i] = 0;
02858       q_arr_dev_host[i] = 0;
02859       v_arr_dev_host[i] = 0;
02860     }
02861   }
02862 
02863   cudaMemcpy(q_arr_dev, q_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02864   cudaMemcpy(v_arr_dev, v_arr_dev_host, fsize * sizeof(float*), cudaMemcpyHostToDevice);
02865   delete [] q_arr_dev_host;
02866   delete [] v_arr_dev_host;
02867   delete [] f_arr;
02868   f_arr = new char[fsize + q_stride];
02869   fz_arr = f_arr + fsize;
02870   memset(f_arr, 0, fsize + q_stride);
02871   memset(ffz_host, 0, (fsize + q_stride)*sizeof(int));
02872 
02873   cuda_errcheck("initialize grid data for pme");
02874 
02875   cuda_init_bspline_coeffs(&bspline_coeffs_dev, &bspline_dcoeffs_dev, myGrid.order);
02876   cuda_errcheck("initialize bspline coefficients for pme");
02877 
02878 #define XCOPY(X) masterPmeMgr->X = X;
02879   XCOPY(bspline_coeffs_dev)
02880   XCOPY(bspline_dcoeffs_dev)
02881   XCOPY(q_arr)
02882   XCOPY(q_arr_dev)
02883   XCOPY(v_arr_dev)
02884   XCOPY(q_data_size)
02885   XCOPY(q_data_host)
02886   XCOPY(q_data_dev)
02887   XCOPY(v_data_dev)
02888   XCOPY(ffz_size)
02889   XCOPY(ffz_host)
02890   XCOPY(ffz_dev)
02891   XCOPY(f_arr)
02892   XCOPY(fz_arr)
02893 #undef XCOPY
02894   //CkPrintf("pe %d init first\n", CkMyPe());
02895  } else { // cudaFirst
02896   //CkPrintf("pe %d init later\n", CkMyPe());
02897 #define XCOPY(X) X = masterPmeMgr->X;
02898   XCOPY(bspline_coeffs_dev)
02899   XCOPY(bspline_dcoeffs_dev)
02900   XCOPY(q_arr)
02901   XCOPY(q_arr_dev)
02902   XCOPY(v_arr_dev)
02903   XCOPY(q_data_size)
02904   XCOPY(q_data_host)
02905   XCOPY(q_data_dev)
02906   XCOPY(v_data_dev)
02907   XCOPY(ffz_size)
02908   XCOPY(ffz_host)
02909   XCOPY(ffz_dev)
02910   XCOPY(f_arr)
02911   XCOPY(fz_arr)
02912 #undef XCOPY
02913  } // cudaFirst
02914   CmiUnlock(cuda_lock);
02915  } else // offload
02916 #endif // NAMD_CUDA
02917  {
02918   fz_arr = new char[myGrid.K3+myGrid.order-1];
02919  }
02920 
02921 #if 0 && USE_PERSISTENT
02922   recvGrid_handle = NULL;
02923 #endif
02924 }
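
One detail worth isolating from the offload branch above is the merged host-to-device transfer: the ffz flag array is allocated at the tail of the pinned q_data buffer, so charges and flags move in a single copy. A minimal sketch of that layout, with illustrative sizes and no NAMD types:

    // Sketch only: one pinned host allocation holds the charge data with the
    // ffz flag array tacked onto its tail, so a single cudaMemcpy moves both.
    #include <cuda_runtime.h>
    #include <cstddef>

    int main() {
      const size_t q_bytes   = 1024 * sizeof(float);
      const size_t ffz_bytes =  256 * sizeof(int);

      char *host = 0, *dev = 0;
      cudaMallocHost((void **)&host, q_bytes + ffz_bytes);  // pinned host buffer
      cudaMalloc    ((void **)&dev,  q_bytes + ffz_bytes);

      float *q_host   = (float *)host;               // charges at the front
      int   *ffz_host = (int *)(host + q_bytes);     // flags at the tail
      (void)q_host; (void)ffz_host;

      // One merged transfer covers charges and flags together.
      cudaMemcpy(dev, host, q_bytes + ffz_bytes, cudaMemcpyHostToDevice);

      cudaFree(dev);
      cudaFreeHost(host);
      return 0;
    }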

void ComputePmeMgr::initialize_pencils ( CkQdMsg *   ) 

Definition at line 1693 of file ComputePme.C.

References Lattice::a(), Lattice::a_r(), Lattice::b(), Lattice::b_r(), PmeGrid::block1, PmeGrid::block2, deviceCUDA, DeviceCUDA::getMasterPe(), j, PmeGrid::K1, PmeGrid::K2, PatchMap::max_a(), PatchMap::max_b(), PatchMap::min_a(), PatchMap::min_b(), PatchMap::node(), PatchMap::numPatches(), numPatches, PatchMap::Object(), Node::Object(), PmeGrid::order, Node::simParameters, simParams, and Vector::unit().

01693                                                    {
01694   delete msg;
01695   if ( ! usePencils ) return;
01696 
01697   SimParameters *simParams = Node::Object()->simParameters;
01698 
01699   PatchMap *patchMap = PatchMap::Object();
01700   Lattice lattice = simParams->lattice;
01701   BigReal sysdima = lattice.a_r().unit() * lattice.a();
01702   BigReal sysdimb = lattice.b_r().unit() * lattice.b();
01703   BigReal cutoff = simParams->cutoff;
01704   BigReal patchdim = simParams->patchDimension;
01705   int numPatches = patchMap->numPatches();
01706 
01707   pencilActive = new char[xBlocks*yBlocks];
01708   for ( int i=0; i<xBlocks; ++i ) {
01709     for ( int j=0; j<yBlocks; ++j ) {
01710       pencilActive[i*yBlocks+j] = 0;
01711     }
01712   }
01713 
01714   for ( int pid=0; pid < numPatches; ++pid ) {
01715     int pnode = patchMap->node(pid);
01716 #ifdef NAMD_CUDA
01717     if ( offload ) {
01718       if ( CkNodeOf(pnode) != CkMyNode() ) continue;
01719     } else
01720 #endif
01721     if ( pnode != CkMyPe() ) continue;
01722 
01723     int shift1 = (myGrid.K1 + myGrid.order - 1)/2;
01724     int shift2 = (myGrid.K2 + myGrid.order - 1)/2;
01725 
01726     BigReal minx = patchMap->min_a(pid);
01727     BigReal maxx = patchMap->max_a(pid);
01728     BigReal margina = 0.5 * ( patchdim - cutoff ) / sysdima;
01729     // min1 (max1) is smallest (largest) grid line for this patch
01730     int min1 = ((int) floor(myGrid.K1 * (minx - margina))) + shift1 - myGrid.order + 1;
01731     int max1 = ((int) floor(myGrid.K1 * (maxx + margina))) + shift1;
01732 
01733     BigReal miny = patchMap->min_b(pid);
01734     BigReal maxy = patchMap->max_b(pid);
01735     BigReal marginb = 0.5 * ( patchdim - cutoff ) / sysdimb;
01736     // min2 (max2) is smallest (largest) grid line for this patch
01737     int min2 = ((int) floor(myGrid.K2 * (miny - marginb))) + shift2 - myGrid.order + 1;
01738     int max2 = ((int) floor(myGrid.K2 * (maxy + marginb))) + shift2;
01739 
01740     for ( int i=min1; i<=max1; ++i ) {
01741       int ix = i;
01742       while ( ix >= myGrid.K1 ) ix -= myGrid.K1;
01743       while ( ix < 0 ) ix += myGrid.K1;
01744       for ( int j=min2; j<=max2; ++j ) {
01745         int jy = j;
01746         while ( jy >= myGrid.K2 ) jy -= myGrid.K2;
01747         while ( jy < 0 ) jy += myGrid.K2;
01748         pencilActive[(ix / myGrid.block1)*yBlocks + (jy / myGrid.block2)] = 1;
01749       }
01750     }
01751   }
01752 
01753   numPencilsActive = 0;
01754   for ( int i=0; i<xBlocks; ++i ) {
01755     for ( int j=0; j<yBlocks; ++j ) {
01756       if ( pencilActive[i*yBlocks+j] ) {
01757         ++numPencilsActive;
01758 #ifdef NAMD_CUDA
01759         if ( CkMyPe() == deviceCUDA->getMasterPe() || ! offload )
01760 #endif
01761         zPencil(i,j,0).dummyRecvGrid(CkMyPe(),0);
01762       }
01763     }
01764   }
01765   activePencils = new ijpair[numPencilsActive];
01766   numPencilsActive = 0;
01767   for ( int i=0; i<xBlocks; ++i ) {
01768     for ( int j=0; j<yBlocks; ++j ) {
01769       if ( pencilActive[i*yBlocks+j] ) {
01770         activePencils[numPencilsActive++] = ijpair(i,j);
01771       }
01772     }
01773   }
01774   if ( simParams->PMESendOrder ) {
01775     std::sort(activePencils,activePencils+numPencilsActive,ijpair_sortop_bit_reversed());
01776   } else {
01777     Random rand(CkMyPe());
01778     rand.reorder(activePencils,numPencilsActive);
01779   }
01780   //if ( numPencilsActive ) {
01781   //  CkPrintf("node %d sending to %d pencils\n", CkMyPe(), numPencilsActive);
01782   //}
01783 
01784   ungrid_count = numPencilsActive;
01785 }
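
The pencil-activation loop above folds every grid line touched by a local patch back into the primary cell and maps it to an (x,y) pencil block. A small standalone sketch of that wrap-and-map arithmetic, with illustrative K and block sizes:

    // Sketch only: fold a (possibly negative or oversized) grid line index
    // into the primary cell and map it to its pencil block, as done above.
    #include <cstdio>

    static int wrap(int i, int K) {      // fold i into [0, K)
      while (i >= K) i -= K;
      while (i < 0)  i += K;
      return i;
    }

    int main() {
      const int K1 = 64, block1 = 16;    // illustrative grid and block sizes
      const int lines[4] = { -3, 20, 63, 66 };
      for (int t = 0; t < 4; ++t) {
        int ix = wrap(lines[t], K1);
        printf("grid line %d -> wrapped %d -> x block %d\n",
               lines[t], ix, ix / block1);
      }
      return 0;
    }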

void ComputePmeMgr::pollChargeGridReady (  ) 

Definition at line 3509 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_charges(), CUDA_POLL, and NAMD_bug().

03509                                         {
03510 #ifdef NAMD_CUDA
03511   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
03512   CUDA_POLL(cuda_check_pme_charges,this);
03513 #else
03514   NAMD_bug("ComputePmeMgr::pollChargeGridReady() called in non-CUDA build.");
03515 #endif
03516 }

void ComputePmeMgr::pollForcesReady (  ) 

Definition at line 2642 of file ComputePme.C.

References CcdCallBacksReset(), cuda_check_pme_forces(), CUDA_POLL, and NAMD_bug().

02642                                     {
02643 #ifdef NAMD_CUDA
02644   CcdCallBacksReset(0,CmiWallTimer());  // fix Charm++
02645   CUDA_POLL(cuda_check_pme_forces,this);
02646 #else
02647   NAMD_bug("ComputePmeMgr::pollForcesReady() called in non-CUDA build.");
02648 #endif
02649 }
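
Both poll routines rely on non-blocking completion checks: the scheduled callback tests a CUDA event with cudaEventQuery() and, if the work is not finished, reschedules itself through the Charm++ CcdCallBacks machinery rather than blocking the PE. A minimal sketch of the non-blocking check only; the spin loop stands in for the rescheduled callback and is not how NAMD does it:

    // Sketch only: cudaEventQuery() instead of cudaEventSynchronize(), so the
    // caller can keep driving other work and re-check later.
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      cudaEvent_t done;
      cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
      cudaEventRecord(done, 0);          // stands in for end_charges / end_forces[i]

      while (cudaEventQuery(done) == cudaErrorNotReady) {
        // a real poll callback would return here and re-arm itself instead
      }
      printf("event complete\n");

      cudaEventDestroy(done);
      return 0;
    }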

void ComputePmeMgr::procTrans ( PmeTransMsg *  ) 

Definition at line 2048 of file ComputePme.C.

References PmeGrid::dim3, gridCalc2(), PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PmeTransMsg::qgrid, PmeTransMsg::sequence, PmeTransMsg::x_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by recvSharedTrans(), and recvTrans().

02048                                               {
02049   // CkPrintf("procTrans on Pe(%d)\n",CkMyPe());
02050   if ( trans_count == numGridPes ) {
02051     lattice = msg->lattice;
02052     grid_sequence = msg->sequence;
02053   }
02054 
02055  if ( msg->nx ) {
02056   int zdim = myGrid.dim3;
02057   NodePmeInfo &nodeInfo(transNodeInfo[myTransNode]);
02058   int first_pe = nodeInfo.pe_start;
02059   int last_pe = first_pe+nodeInfo.npe-1;
02060   int y_skip = localInfo[myTransPe].y_start_after_transpose
02061              - localInfo[first_pe].y_start_after_transpose;
02062   int ny_msg = localInfo[last_pe].y_start_after_transpose
02063              + localInfo[last_pe].ny_after_transpose
02064              - localInfo[first_pe].y_start_after_transpose;
02065   int ny = localInfo[myTransPe].ny_after_transpose;
02066   int x_start = msg->x_start;
02067   int nx = msg->nx;
02068   for ( int g=0; g<numGrids; ++g ) {
02069     CmiMemcpy((void*)(kgrid + qgrid_size * g + x_start*ny*zdim),
02070         (void*)(msg->qgrid + nx*(ny_msg*g+y_skip)*zdim),
02071         nx*ny*zdim*sizeof(float));
02072   }
02073  }
02074 
02075   --trans_count;
02076 
02077   if ( trans_count == 0 ) {
02078     pmeProxyDir[CkMyPe()].gridCalc2();
02079   }
02080 }

void ComputePmeMgr::procUntrans ( PmeUntransMsg *  ) 

Definition at line 2302 of file ComputePme.C.

References PmeGrid::dim3, gridCalc3(), PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, NodePmeInfo::pe_start, PmeUntransMsg::qgrid, LocalPmeInfo::x_start, and PmeUntransMsg::y_start.

Referenced by recvSharedUntrans(), and recvUntrans().

02302                                                   {
02303   // CkPrintf("recvUntrans on Pe(%d)\n",CkMyPe());
02304 
02305 #if CMK_BLUEGENEL
02306   CmiNetworkProgressAfter (0);
02307 #endif
02308 
02309   NodePmeInfo &nodeInfo(gridNodeInfo[myGridNode]);
02310   int first_pe = nodeInfo.pe_start;
02311   int g;
02312 
02313  if ( msg->ny ) {
02314   int zdim = myGrid.dim3;
02315   int last_pe = first_pe+nodeInfo.npe-1;
02316   int x_skip = localInfo[myGridPe].x_start
02317              - localInfo[first_pe].x_start;
02318   int nx_msg = localInfo[last_pe].x_start
02319              + localInfo[last_pe].nx
02320              - localInfo[first_pe].x_start;
02321   int nx = localInfo[myGridPe].nx;
02322   int y_start = msg->y_start;
02323   int ny = msg->ny;
02324   int slicelen = myGrid.K2 * zdim;
02325   int cpylen = ny * zdim;
02326   for ( g=0; g<numGrids; ++g ) {
02327     float *q = qgrid + qgrid_size * g + y_start * zdim;
02328     float *qmsg = msg->qgrid + (nx_msg*g+x_skip) * cpylen;
02329     for ( int x = 0; x < nx; ++x ) {
02330       CmiMemcpy((void*)q, (void*)qmsg, cpylen*sizeof(float));
02331       q += slicelen;
02332       qmsg += cpylen;
02333     }
02334   }
02335  }
02336 
02337   --untrans_count;
02338 
02339   if ( untrans_count == 0 ) {
02340     pmeProxyDir[CkMyPe()].gridCalc3();
02341   }
02342 }

void ComputePmeMgr::recvAck ( PmeAckMsg *  ) 

Definition at line 2444 of file ComputePme.C.

References cuda_lock, master_pe, and NAMD_bug().

Referenced by recvUngrid().

02444                                           {
02445   if ( msg ) delete msg;
02446 #ifdef NAMD_CUDA
02447   if ( offload ) {
02448     CmiLock(cuda_lock);
02449     if ( ungrid_count == 0 ) {
02450       NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02451     }
02452     int uc = --ungrid_count;
02453     CmiUnlock(cuda_lock);
02454 
02455     if ( uc == 0 ) {
02456       pmeProxyDir[master_pe].ungridCalc();
02457     }
02458     return;
02459   }
02460 #endif
02461   --ungrid_count;
02462 
02463   if ( ungrid_count == 0 ) {
02464     pmeProxyDir[CkMyPe()].ungridCalc();
02465   }
02466 }

void ComputePmeMgr::recvArrays ( CProxy_PmeXPencil  ,
CProxy_PmeYPencil  ,
CProxy_PmeZPencil   
)

Definition at line 800 of file ComputePme.C.

00801                                                                        {
00802   xPencil = x;  yPencil = y;  zPencil = z;
00803   
00804     if(CmiMyRank()==0)
00805     {
00806       pmeNodeProxy.ckLocalBranch()->xPencil=x;
00807       pmeNodeProxy.ckLocalBranch()->yPencil=y;
00808       pmeNodeProxy.ckLocalBranch()->zPencil=z;
00809     }
00810 }

void ComputePmeMgr::recvChargeGridReady (  ) 

Definition at line 3518 of file ComputePme.C.

References chargeGridReady(), saved_lattice, and saved_sequence.

03518                                         {
03519   chargeGridReady(*saved_lattice,saved_sequence);
03520 }

void ComputePmeMgr::recvGrid ( PmeGridMsg *  ) 

Definition at line 1827 of file ComputePme.C.

References PmeGrid::dim3, f, PmeGridMsg::fgrid, gridCalc1(), PmeGridMsg::lattice, NAMD_bug(), ComputePmeUtil::numGrids, PmeGridMsg::qgrid, sendTransBarrier(), PmeGridMsg::sequence, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

01827                                             {
01828   // CkPrintf("recvGrid from %d on Pe(%d)\n",msg->sourceNode,CkMyPe());
01829   if ( grid_count == 0 ) {
01830     NAMD_bug("Message order failure in ComputePmeMgr::recvGrid\n");
01831   }
01832   if ( grid_count == numSources ) {
01833     lattice = msg->lattice;
01834     grid_sequence = msg->sequence;
01835   }
01836 
01837   int zdim = myGrid.dim3;
01838   int zlistlen = msg->zlistlen;
01839   int *zlist = msg->zlist;
01840   float *qmsg = msg->qgrid;
01841   for ( int g=0; g<numGrids; ++g ) {
01842     char *f = msg->fgrid + fgrid_len * g;
01843     float *q = qgrid + qgrid_size * g;
01844     for ( int i=0; i<fgrid_len; ++i ) {
01845       if ( f[i] ) {
01846         for ( int k=0; k<zlistlen; ++k ) {
01847           q[zlist[k]] += *(qmsg++);
01848         }
01849       }
01850       q += zdim;
01851     }
01852   }
01853 
01854   gridmsg_reuse[numSources-grid_count] = msg;
01855   --grid_count;
01856 
01857   if ( grid_count == 0 ) {
01858     pmeProxyDir[CkMyPe()].gridCalc1();
01859     if ( useBarrier ) pmeProxyDir[0].sendTransBarrier();
01860   }
01861 }
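
The accumulation loop above is driven by two levels of sparsity: fgrid flags which (x,y) columns carry data at all, and zlist records which z planes are populated, so the packed qgrid stream holds only those values. A small standalone sketch of the same scatter-add, with illustrative sizes:

    // Sketch only: two-level sparse accumulation of a packed message stream.
    #include <vector>
    #include <cstdio>

    int main() {
      const int zdim = 8;
      std::vector<float> q(2 * zdim, 0.0f);            // two local grid columns
      const char  fgrid[2] = { 1, 0 };                 // only column 0 sent data
      const int   zlist[3] = { 1, 4, 6 };              // populated z planes
      const float qmsg[3]  = { 0.5f, 1.5f, 2.5f };     // packed payload

      const float *p = qmsg;
      for (int i = 0; i < 2; ++i) {
        if (fgrid[i])
          for (int k = 0; k < 3; ++k) q[i * zdim + zlist[k]] += *(p++);
      }
      for (int z = 0; z < zdim; ++z) printf("%g ", q[z]);
      printf("\n");
      return 0;
    }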

void ComputePmeMgr::recvRecipEvir ( PmeEvirMsg *  ) 

Definition at line 3009 of file ComputePme.C.

References PmeEvirMsg::evir, NAMD_bug(), pmeComputes, ResizeArray< Elem >::size(), and submitReductions().

03009                                                  {
03010   if ( ! pmeComputes.size() ) NAMD_bug("ComputePmeMgr::recvRecipEvir() called on pe without patches");
03011   for ( int g=0; g<numGrids; ++g ) {
03012     evir[g] += msg->evir[g];
03013   }
03014   delete msg;
03015   // CkPrintf("recvRecipEvir pe %d %d %d\n", CkMyPe(), ungridForcesCount, recipEvirCount);
03016   if ( ! --recipEvirCount && ! ungridForcesCount ) submitReductions();
03017 }

void ComputePmeMgr::recvSharedTrans ( PmeSharedTransMsg *  ) 

Definition at line 2030 of file ComputePme.C.

References PmeSharedTransMsg::count, PmeSharedTransMsg::lock, PmeSharedTransMsg::msg, and procTrans().

02030                                                           {
02031   procTrans(msg->msg);
02032   CmiLock(msg->lock);
02033   int count = --(*msg->count);
02034   CmiUnlock(msg->lock);
02035   if ( count == 0 ) {
02036     CmiDestroyLock(msg->lock);
02037     delete msg->count;
02038     delete msg->msg;
02039   }
02040   delete msg;
02041 }
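
recvSharedTrans (and recvSharedUntrans below) use a node-shared message with a reference count: every rank on the node processes the same underlying message, and the last one to finish destroys the lock and frees it. A generic sketch of that idiom using std::mutex in place of CmiLock and plain structs in place of the Pme*Msg types:

    // Sketch only: the "last consumer frees" idiom for node-shared messages.
    #include <mutex>
    #include <cstdio>

    struct SharedMsg {
      int payload;            // stands in for the shared PmeTransMsg
      int *count;             // consumers still holding a reference
      std::mutex *lock;
    };

    void consume(SharedMsg *m) {
      // ... process m->payload (procTrans / procUntrans in the real code) ...
      m->lock->lock();
      int remaining = --(*m->count);
      m->lock->unlock();
      if (remaining == 0) {   // last consumer cleans up the shared state
        delete m->count;
        delete m->lock;
        printf("freed shared message %d\n", m->payload);
      }
    }

    int main() {
      int *count = new int(2);
      std::mutex *lock = new std::mutex;
      SharedMsg a = { 42, count, lock };
      SharedMsg b = { 42, count, lock };
      consume(&a);
      consume(&b);
      return 0;
    }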

void ComputePmeMgr::recvSharedUntrans ( PmeSharedUntransMsg *  ) 

Definition at line 2284 of file ComputePme.C.

References PmeSharedUntransMsg::count, PmeSharedUntransMsg::lock, PmeSharedUntransMsg::msg, and procUntrans().

02284                                                               {
02285   procUntrans(msg->msg);
02286   CmiLock(msg->lock);
02287   int count = --(*msg->count);
02288   CmiUnlock(msg->lock);
02289   if ( count == 0 ) {
02290     CmiDestroyLock(msg->lock);
02291     delete msg->count;
02292     delete msg->msg;
02293   }
02294   delete msg;
02295 }

void ComputePmeMgr::recvTrans ( PmeTransMsg *  ) 

Definition at line 2043 of file ComputePme.C.

References procTrans().

02043                                               {
02044   procTrans(msg);
02045   delete msg;
02046 }

void ComputePmeMgr::recvUngrid ( PmeGridMsg *  ) 

Definition at line 2429 of file ComputePme.C.

References copyPencils(), copyResults(), NAMD_bug(), and recvAck().

Referenced by NodePmeMgr::recvUngrid().

02429                                               {
02430   // CkPrintf("recvUngrid on Pe(%d)\n",CkMyPe());
02431 #ifdef NAMD_CUDA
02432   if ( ! offload )  // would need lock
02433 #endif
02434   if ( ungrid_count == 0 ) {
02435     NAMD_bug("Message order failure in ComputePmeMgr::recvUngrid\n");
02436   }
02437 
02438   if ( usePencils ) copyPencils(msg);
02439   else copyResults(msg);
02440   delete msg;
02441   recvAck(0);
02442 }

void ComputePmeMgr::recvUntrans ( PmeUntransMsg *  ) 

Definition at line 2297 of file ComputePme.C.

References procUntrans().

02297                                                   {
02298   procUntrans(msg);
02299   delete msg;
02300 }

void ComputePmeMgr::sendChargeGridReady (  ) 

Definition at line 3495 of file ComputePme.C.

References chargeGridSubmittedCount, master_pe, NodePmeMgr::mgrObjects, pmeComputes, and ResizeArray< Elem >::size().

Referenced by cuda_check_pme_charges().

03495                                         {
03496   for ( int i=0; i<CkMyNodeSize(); ++i ) {
03497     ComputePmeMgr *mgr = nodePmeMgr->mgrObjects[i];
03498     int cs = mgr->pmeComputes.size();
03499     if ( cs ) {
03500       mgr->ungridForcesCount = cs;
03501       mgr->recipEvirCount = mgr->recipEvirClients;
03502       masterPmeMgr->chargeGridSubmittedCount++;
03503     }
03504   }
03505   pmeProxy[master_pe].recvChargeGridReady();
03506 }

void ComputePmeMgr::sendData ( Lattice & ,
int  sequence 
)

Definition at line 3932 of file ComputePme.C.

References sendDataHelper_errors, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, and sendDataPart().

Referenced by chargeGridReady().

03932                                                            {
03933 
03934   sendDataHelper_lattice = &lattice;
03935   sendDataHelper_sequence = sequence;
03936   sendDataHelper_sourcepe = CkMyPe();
03937   sendDataHelper_errors = strayChargeErrors;
03938   strayChargeErrors = 0;
03939 
03940 #ifdef NAMD_CUDA
03941   if ( offload ) {
03942     for ( int i=0; i < numGridPes; ++i ) {
03943       int pe = gridPeOrder[i];  // different order
03944       if ( ! recipPeDest[pe] && ! sendDataHelper_errors ) continue;
03945 #if CMK_MULTICORE
03946       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03947       pmeProxy[gridPeMap[pe]].sendDataHelper(i);
03948 #else
03949       pmeNodeProxy[CkMyNode()].sendDataHelper(i);
03950 #endif
03951     }
03952   } else
03953 #endif
03954   {
03955     sendDataPart(0,numGridPes-1,lattice,sequence,CkMyPe(),sendDataHelper_errors);
03956   }
03957  
03958 }

void ComputePmeMgr::sendDataHelper ( int   ) 

Definition at line 3919 of file ComputePme.C.

References NodePmeMgr::sendDataHelper().

03919                                            {
03920   nodePmeMgr->sendDataHelper(iter);
03921 }

void ComputePmeMgr::sendDataPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe,
int  errors 
)

Definition at line 3810 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::dim2, PmeGrid::dim3, endi(), PmeGridMsg::fgrid, iERROR(), if(), iout, j, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, and PmeGridMsg::zlistlen.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

03810                                                                                                               {
03811 
03812   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03813 
03814   bsize = myGrid.block1 * myGrid.dim2 * myGrid.dim3;
03815 
03816   CProxy_ComputePmeMgr pmeProxy(CkpvAccess(BOCclass_group).computePmeMgr);
03817   for (int j=first; j<=last; j++) {
03818     int pe = gridPeOrder[j];  // different order
03819     if ( ! recipPeDest[pe] && ! errors ) continue;
03820     int start = pe * bsize;
03821     int len = bsize;
03822     if ( start >= qsize ) { start = 0; len = 0; }
03823     if ( start + len > qsize ) { len = qsize - start; }
03824     int zdim = myGrid.dim3;
03825     int fstart = start / zdim;
03826     int flen = len / zdim;
03827     int fcount = 0;
03828     int i;
03829 
03830     int g;
03831     for ( g=0; g<numGrids; ++g ) {
03832       char *f = f_arr + fstart + g*fsize;
03833 #ifdef NAMD_CUDA
03834      if ( offload ) {
03835       int errcount = 0;
03836       for ( i=0; i<flen; ++i ) {
03837         f[i] = ffz_host[fstart+i];
03838         fcount += f[i];
03839         if ( ffz_host[fstart+i] & ~1 ) ++errcount;
03840       }
03841       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendDataPart");
03842      } else
03843 #endif
03844       for ( i=0; i<flen; ++i ) {
03845         fcount += f[i];
03846       }
03847       if ( ! recipPeDest[pe] ) {
03848         int errfound = 0;
03849         for ( i=0; i<flen; ++i ) {
03850           if ( f[i] == 3 ) {
03851             errfound = 1;
03852             break;
03853           }
03854         }
03855         if ( errfound ) {
03856           iout << iERROR << "Stray PME grid charges detected: "
03857                 << sourcepe << " sending to " << gridPeMap[pe] << " for planes";
03858           int iz = -1;
03859           for ( i=0; i<flen; ++i ) {
03860             if ( f[i] == 3 ) {
03861               f[i] = 2;
03862               int jz = (i+fstart)/myGrid.K2;
03863               if ( iz != jz ) { iout << " " << jz;  iz = jz; }
03864             }
03865           }
03866           iout << "\n" << endi;
03867         }
03868       }
03869     }
03870 
03871 #ifdef NETWORK_PROGRESS
03872     CmiNetworkProgress();
03873 #endif
03874 
03875     if ( ! recipPeDest[pe] ) continue;
03876 
03877     int zlistlen = 0;
03878     for ( i=0; i<myGrid.K3; ++i ) {
03879       if ( fz_arr[i] ) ++zlistlen;
03880     }
03881 
03882     PmeGridMsg *msg = new (zlistlen, flen*numGrids,
03883                                 fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03884 
03885     msg->sourceNode = sourcepe;
03886     msg->lattice = lattice;
03887     msg->start = fstart;
03888     msg->len = flen;
03889     msg->zlistlen = zlistlen;
03890     int *zlist = msg->zlist;
03891     zlistlen = 0;
03892     for ( i=0; i<myGrid.K3; ++i ) {
03893       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03894     }
03895     float *qmsg = msg->qgrid;
03896     for ( g=0; g<numGrids; ++g ) {
03897       char *f = f_arr + fstart + g*fsize;
03898       CmiMemcpy((void*)(msg->fgrid+g*flen),(void*)f,flen*sizeof(char));
03899       float **q = q_arr + fstart + g*fsize;
03900       for ( i=0; i<flen; ++i ) {
03901         if ( f[i] ) {
03902           for (int h=0; h<myGrid.order-1; ++h) {
03903             q[i][h] += q[i][myGrid.K3+h];
03904           }
03905           for ( int k=0; k<zlistlen; ++k ) {
03906             *(qmsg++) = q[i][zlist[k]];
03907           }
03908         }
03909       }
03910     }
03911 
03912     msg->sequence = compute_sequence;
03913     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03914     pmeProxy[gridPeMap[pe]].recvGrid(msg);
03915   }
03916 
03917 }
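
Before packing, the loop above folds the order-1 spill-over planes of each padded z column back onto its start (q[i][h] += q[i][K3+h]), which implements the periodic wrap of the B-spline charge spreading. A small standalone sketch with illustrative K3 and order:

    // Sketch only: fold the (order-1) spill-over planes of a padded z column
    // back onto its start before packing.
    #include <vector>
    #include <cstdio>

    int main() {
      const int K3 = 8, order = 4;                    // illustrative sizes
      std::vector<float> col(K3 + order - 1, 0.0f);   // padded z column
      col[K3]     = 1.0f;                             // charge spread past the end
      col[K3 + 1] = 2.0f;

      for (int h = 0; h < order - 1; ++h) col[h] += col[K3 + h];  // periodic wrap

      for (int z = 0; z < K3; ++z) printf("%g ", col[z]);
      printf("\n");
      return 0;
    }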

void ComputePmeMgr::sendPencils ( Lattice & ,
int  sequence 
)

Definition at line 3705 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::dim2, endi(), iERROR(), iout, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, sendDataHelper_lattice, sendDataHelper_sequence, sendDataHelper_sourcepe, sendPencilsPart(), and NodePmeMgr::zm.

Referenced by chargeGridReady().

03705                                                               {
03706 
03707   sendDataHelper_lattice = &lattice;
03708   sendDataHelper_sequence = sequence;
03709   sendDataHelper_sourcepe = CkMyPe();
03710 
03711 #ifdef NAMD_CUDA
03712   if ( offload ) {
03713     for ( int ap=0; ap < numPencilsActive; ++ap ) {
03714 #if CMK_MULTICORE
03715       // nodegroup messages on multicore are delivered to sending pe, or pe 0 if expedited
03716       int ib = activePencils[ap].i;
03717       int jb = activePencils[ap].j;
03718       int destproc = nodePmeMgr->zm.ckLocalBranch()->procNum(0, CkArrayIndex3D(ib,jb,0));
03719       pmeProxy[destproc].sendPencilsHelper(ap);
03720 #else
03721       pmeNodeProxy[CkMyNode()].sendPencilsHelper(ap);
03722 #endif
03723     }
03724   } else
03725 #endif
03726   {
03727     sendPencilsPart(0,numPencilsActive-1,lattice,sequence,CkMyPe());
03728   }
03729 
03730   if ( strayChargeErrors ) {
03731    strayChargeErrors = 0;
03732    iout << iERROR << "Stray PME grid charges detected: "
03733         << CkMyPe() << " sending to (x,y)";
03734    int K1 = myGrid.K1;
03735    int K2 = myGrid.K2;
03736    int dim2 = myGrid.dim2;
03737    int block1 = myGrid.block1;
03738    int block2 = myGrid.block2;
03739    for (int ib=0; ib<xBlocks; ++ib) {
03740     for (int jb=0; jb<yBlocks; ++jb) {
03741      int ibegin = ib*block1;
03742      int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03743      int jbegin = jb*block2;
03744      int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03745      int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03746 
03747      for ( int g=0; g<numGrids; ++g ) {
03748        char *f = f_arr + g*fsize;
03749        if ( ! pencilActive[ib*yBlocks+jb] ) {
03750            for ( int i=ibegin; i<iend; ++i ) {
03751             for ( int j=jbegin; j<jend; ++j ) {
03752              if ( f[i*dim2+j] == 3 ) {
03753                f[i*dim2+j] = 2;
03754                iout << " (" << i << "," << j << ")";
03755              }
03756             }
03757            }
03758        }
03759      }
03760     }
03761    }
03762    iout << "\n" << endi;
03763   }
03764  
03765 }

void ComputePmeMgr::sendPencilsHelper ( int   ) 

Definition at line 3692 of file ComputePme.C.

References NodePmeMgr::sendPencilsHelper().

03692                                               {
03693   nodePmeMgr->sendPencilsHelper(iter);
03694 }

void ComputePmeMgr::sendPencilsPart ( int  first,
int  last,
Lattice & ,
int  sequence,
int  sourcepe 
)

Definition at line 3550 of file ComputePme.C.

References PmeGrid::block1, PmeGrid::block2, PmeGridMsg::destElem, PmeGrid::dim2, PmeGrid::dim3, PmeGridMsg::fgrid, PmeGridMsg::hasData, j, ijpair::j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, PmeGridMsg::lattice, PmeGridMsg::len, NAMD_bug(), PmeGrid::order, PME_GRID_PRIORITY, PRIORITY_SIZE, PmeGridMsg::qgrid, PmeGridMsg::sequence, SET_PRIORITY, PmeGridMsg::sourceNode, PmeGridMsg::start, PmeGridMsg::zlist, PmeGridMsg::zlistlen, and NodePmeMgr::zm.

Referenced by sendPencils(), and NodePmeMgr::sendPencilsHelper().

03550                                                                                                      {
03551 
03552   // iout << "Sending charge grid for " << numLocalAtoms << " atoms to FFT on " << iPE << ".\n" << endi;
03553 
03554 #if 0 && USE_PERSISTENT
03555     if (recvGrid_handle== NULL) setup_recvgrid_persistent();
03556 #endif
03557   int K1 = myGrid.K1;
03558   int K2 = myGrid.K2;
03559   int dim2 = myGrid.dim2;
03560   int dim3 = myGrid.dim3;
03561   int block1 = myGrid.block1;
03562   int block2 = myGrid.block2;
03563 
03564   // int savedMessages = 0;
03565   NodePmeMgr *npMgr = pmeNodeProxy[CkMyNode()].ckLocalBranch();
03566 
03567   for (int ap=first; ap<=last; ++ap) {
03568     int ib = activePencils[ap].i;
03569     int jb = activePencils[ap].j;
03570     int ibegin = ib*block1;
03571     int iend = ibegin + block1;  if ( iend > K1 ) iend = K1;
03572     int jbegin = jb*block2;
03573     int jend = jbegin + block2;  if ( jend > K2 ) jend = K2;
03574     int flen = numGrids * (iend - ibegin) * (jend - jbegin);
03575 
03576     int fcount = 0;
03577     for ( int g=0; g<numGrids; ++g ) {
03578       char *f = f_arr + g*fsize;
03579 #ifdef NAMD_CUDA
03580      if ( offload ) {
03581       int errcount = 0;
03582       for ( int i=ibegin; i<iend; ++i ) {
03583        for ( int j=jbegin; j<jend; ++j ) {
03584         int k = i*dim2+j;
03585         f[k] = ffz_host[k];
03586         fcount += f[k];
03587         if ( ffz_host[k] & ~1 ) ++errcount;
03588        }
03589       }
03590       if ( errcount ) NAMD_bug("bad flag in ComputePmeMgr::sendPencilsPart");
03591      } else
03592 #endif
03593       for ( int i=ibegin; i<iend; ++i ) {
03594        for ( int j=jbegin; j<jend; ++j ) {
03595         fcount += f[i*dim2+j];
03596        }
03597       }
03598     }
03599 
03600 #ifdef NETWORK_PROGRESS
03601     CmiNetworkProgress();
03602 #endif
03603 
03604     if ( ! pencilActive[ib*yBlocks+jb] )
03605       NAMD_bug("PME activePencils list inconsistent");
03606 
03607     int zlistlen = 0;
03608     for ( int i=0; i<myGrid.K3; ++i ) {
03609       if ( fz_arr[i] ) ++zlistlen;
03610     }
03611 
03612     int hd = ( fcount? 1 : 0 );  // has data?
03613     // if ( ! hd ) ++savedMessages;
03614 
03615     
03616     PmeGridMsg *msg = new ( hd*zlistlen, hd*flen,
03617         hd*fcount*zlistlen, PRIORITY_SIZE) PmeGridMsg;
03618     msg->sourceNode = sourcepe;
03619     msg->hasData = hd;
03620     msg->lattice = lattice;
03621    if ( hd ) {
03622 #if 0
03623     msg->start = fstart;
03624     msg->len = flen;
03625 #else
03626     msg->start = -1;   // obsolete?
03627     msg->len = -1;   // obsolete?
03628 #endif
03629     msg->zlistlen = zlistlen;
03630     int *zlist = msg->zlist;
03631     zlistlen = 0;
03632     for ( int i=0; i<myGrid.K3; ++i ) {
03633       if ( fz_arr[i] ) zlist[zlistlen++] = i;
03634     }
03635     char *fmsg = msg->fgrid;
03636     float *qmsg = msg->qgrid;
03637     for ( int g=0; g<numGrids; ++g ) {
03638       char *f = f_arr + g*fsize;
03639       float **q = q_arr + g*fsize;
03640       for ( int i=ibegin; i<iend; ++i ) {
03641        for ( int j=jbegin; j<jend; ++j ) {
03642         *(fmsg++) = f[i*dim2+j];
03643         if( f[i*dim2+j] ) {
03644           for (int h=0; h<myGrid.order-1; ++h) {
03645             q[i*dim2+j][h] += q[i*dim2+j][myGrid.K3+h];
03646           }
03647           for ( int k=0; k<zlistlen; ++k ) {
03648             *(qmsg++) = q[i*dim2+j][zlist[k]];
03649           }
03650         }
03651        }
03652       }
03653     }
03654    }
03655 
03656     msg->sequence = compute_sequence;
03657     SET_PRIORITY(msg,compute_sequence,PME_GRID_PRIORITY)
03658     CmiEnableUrgentSend(1);
03659 #if USE_NODE_PAR_RECEIVE
03660     msg->destElem=CkArrayIndex3D(ib,jb,0);
03661     CProxy_PmePencilMap lzm = npMgr->zm;
03662     int destproc = lzm.ckLocalBranch()->procNum(0, msg->destElem);
03663     int destnode = CmiNodeOf(destproc);
03664     
03665 #if  0 
03666     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03667 #endif
03668     pmeNodeProxy[destnode].recvZGrid(msg);
03669 #if 0 
03670     CmiUsePersistentHandle(NULL, 0);
03671 #endif
03672 #else
03673 #if 0 
03674     CmiUsePersistentHandle(&recvGrid_handle[ap], 1);
03675 #endif
03676     zPencil(ib,jb,0).recvGrid(msg);
03677 #if 0 
03678     CmiUsePersistentHandle(NULL, 0);
03679 #endif
03680 #endif
03681     CmiEnableUrgentSend(0);
03682   }
03683 
03684 
03685   // if ( savedMessages ) {
03686   //   CkPrintf("Pe %d eliminated %d PME messages\n",CkMyPe(),savedMessages);
03687   // }
03688 
03689 }

void ComputePmeMgr::sendTrans ( void   ) 

Definition at line 1939 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDTRANS, Node::Object(), PmeSlabSendTrans(), sendTransSubset(), Node::simParameters, and SimParameters::useCkLoop.

Referenced by sendTransBarrier().

01939                                   {
01940 
01941   untrans_count = numTransPes;
01942 
01943 #if     CMK_SMP && USE_CKLOOP
01944   int useCkLoop = Node::Object()->simParameters->useCkLoop;
01945   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDTRANS && CkNumPes() >= 2 * numGridPes) {
01946     CkLoop_Parallelize(PmeSlabSendTrans, 1, (void *)this, CkMyNodeSize(), 0, numTransNodes-1, 0); // no sync
01947   } else
01948 #endif
01949   {
01950     sendTransSubset(0, numTransNodes-1);
01951   }
01952 
01953 }

void ComputePmeMgr::sendTransBarrier ( void   ) 

Definition at line 1924 of file ComputePme.C.

References sendTrans().

Referenced by recvGrid().

01924                                          {
01925   sendTransBarrier_received += 1;
01926   // CkPrintf("sendTransBarrier on %d %d\n",myGridPe,numGridPes-sendTransBarrier_received);
01927   if ( sendTransBarrier_received < numGridPes ) return;
01928   sendTransBarrier_received = 0;
01929   for ( int i=0; i<numGridPes; ++i ) {
01930     pmeProxyDir[gridPeMap[i]].sendTrans();
01931   }
01932 }
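
sendTransBarrier is a simple counting barrier: each grid PE reports in, and only the final arrival resets the counter and broadcasts sendTrans to all grid PEs. A minimal serial sketch of that pattern:

    // Sketch only: a serial stand-in for the counting barrier above.
    #include <cstdio>

    static int received = 0;
    static const int numGridPes = 4;

    void sendTransBarrierSketch() {
      if (++received < numGridPes) return;  // still waiting for other grid PEs
      received = 0;                         // reset for the next step
      for (int i = 0; i < numGridPes; ++i)
        printf("trigger sendTrans on grid PE %d\n", i);
    }

    int main() {
      for (int pe = 0; pe < numGridPes; ++pe) sendTransBarrierSketch();
      return 0;
    }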

void ComputePmeMgr::sendTransSubset ( int  first,
int  last 
)

Definition at line 1955 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedTrans(), j, PmeGrid::K2, kgrid, PmeTransMsg::lattice, NodePmeInfo::npe, ComputePmeUtil::numGrids, PmeTransMsg::nx, LocalPmeInfo::nx, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_TRANS_PRIORITY, PRIORITY_SIZE, PmeTransMsg::qgrid, NodePmeInfo::real_node, PmeTransMsg::sequence, SET_PRIORITY, PmeTransMsg::sourceNode, PmeTransMsg::x_start, and LocalPmeInfo::x_start.

Referenced by PmeSlabSendTrans(), and sendTrans().

01955                                                        {
01956   // CkPrintf("sendTrans on Pe(%d)\n",CkMyPe());
01957 
01958   // send data for transpose
01959   int zdim = myGrid.dim3;
01960   int nx = localInfo[myGridPe].nx;
01961   int x_start = localInfo[myGridPe].x_start;
01962   int slicelen = myGrid.K2 * zdim;
01963 
01964   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
01965 
01966 #if CMK_BLUEGENEL
01967   CmiNetworkProgressAfter (0);
01968 #endif
01969 
01970   for (int j=first; j<=last; j++) {
01971     int node = transNodeOrder[j];  // different order on each node
01972     int pe = transNodeInfo[node].pe_start;
01973     int npe = transNodeInfo[node].npe;
01974     int totlen = 0;
01975     if ( node != myTransNode ) for (int i=0; i<npe; ++i, ++pe) {
01976       LocalPmeInfo &li = localInfo[pe];
01977       int cpylen = li.ny_after_transpose * zdim;
01978       totlen += cpylen;
01979     }
01980     PmeTransMsg *newmsg = new (nx * totlen * numGrids,
01981                                 PRIORITY_SIZE) PmeTransMsg;
01982     newmsg->sourceNode = myGridPe;
01983     newmsg->lattice = lattice;
01984     newmsg->x_start = x_start;
01985     newmsg->nx = nx;
01986     for ( int g=0; g<numGrids; ++g ) {
01987       float *qmsg = newmsg->qgrid + nx * totlen * g;
01988       pe = transNodeInfo[node].pe_start;
01989       for (int i=0; i<npe; ++i, ++pe) {
01990         LocalPmeInfo &li = localInfo[pe];
01991         int cpylen = li.ny_after_transpose * zdim;
01992         if ( node == myTransNode ) {
01993           ComputePmeMgr *m = mgrObjects[CkRankOf(transPeMap[pe])];
01994           qmsg = m->kgrid + m->qgrid_size * g + x_start*cpylen;
01995         }
01996         float *q = qgrid + qgrid_size * g + li.y_start_after_transpose * zdim;
01997         for ( int x = 0; x < nx; ++x ) {
01998           CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
01999           q += slicelen;
02000           qmsg += cpylen;
02001         }
02002       }
02003     }
02004     newmsg->sequence = grid_sequence;
02005     SET_PRIORITY(newmsg,grid_sequence,PME_TRANS_PRIORITY)
02006     if ( node == myTransNode ) newmsg->nx = 0;
02007     if ( npe > 1 ) {
02008       if ( node == myTransNode ) fwdSharedTrans(newmsg);
02009       else pmeNodeProxy[transNodeInfo[node].real_node].recvTrans(newmsg);
02010     } else pmeProxy[transPeMap[transNodeInfo[node].pe_start]].recvTrans(newmsg);
02011   }
02012 }
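
The inner copy loop above (mirrored in sendUntransSubset and procUntrans) is a strided slab gather: nx rows of cpylen floats are lifted out of a slab whose full row length is slicelen and packed contiguously into the message. A small standalone sketch with memcpy standing in for CmiMemcpy and illustrative sizes:

    // Sketch only: strided slab gather into a packed message buffer.
    #include <cstring>
    #include <cstdio>
    #include <vector>

    int main() {
      const int nx = 3, cpylen = 4, slicelen = 10;
      std::vector<float> slab(nx * slicelen);
      for (int i = 0; i < nx * slicelen; ++i) slab[i] = (float)i;

      std::vector<float> packed(nx * cpylen);
      const float *q = &slab[0];            // plus a per-PE y offset in the real code
      float *qmsg = &packed[0];
      for (int x = 0; x < nx; ++x) {
        std::memcpy(qmsg, q, cpylen * sizeof(float));
        q += slicelen;                      // next x slice of the full-width grid
        qmsg += cpylen;                     // packed message buffer, no gaps
      }
      for (int i = 0; i < nx * cpylen; ++i) printf("%g ", packed[i]);
      printf("\n");
      return 0;
    }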

void ComputePmeMgr::sendUngrid ( void   ) 

Definition at line 2369 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, Node::Object(), PmeSlabSendUngrid(), sendUngridSubset(), Node::simParameters, and SimParameters::useCkLoop.

02369                                    {
02370 
02371 #if     CMK_SMP && USE_CKLOOP
02372   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02373   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numGridPes) {
02374     CkLoop_Parallelize(PmeSlabSendUngrid, 1, (void *)this, CkMyNodeSize(), 0, numSources-1, 1); // sync
02375   } else
02376 #endif
02377   {
02378     sendUngridSubset(0, numSources-1);
02379   }
02380 
02381   grid_count = numSources;
02382   memset( (void*) qgrid, 0, qgrid_size * numGrids * sizeof(float) );
02383 }

void ComputePmeMgr::sendUngridSubset ( int  first,
int  last 
)

Definition at line 2385 of file ComputePme.C.

References PmeGrid::dim3, f, j, PME_OFFLOAD_UNGRID_PRIORITY, PME_UNGRID_PRIORITY, and SET_PRIORITY.

Referenced by PmeSlabSendUngrid(), and sendUngrid().

02385                                                         {
02386 
02387 #ifdef NAMD_CUDA
02388   const int UNGRID_PRIORITY = ( offload ? PME_OFFLOAD_UNGRID_PRIORITY : PME_UNGRID_PRIORITY );
02389 #else
02390   const int UNGRID_PRIORITY = PME_UNGRID_PRIORITY ;
02391 #endif
02392 
02393   for ( int j=first; j<=last; ++j ) {
02394     // int msglen = qgrid_len;
02395     PmeGridMsg *newmsg = gridmsg_reuse[j];
02396     int pe = newmsg->sourceNode;
02397     int zdim = myGrid.dim3;
02398     int flen = newmsg->len;
02399     int fstart = newmsg->start;
02400     int zlistlen = newmsg->zlistlen;
02401     int *zlist = newmsg->zlist;
02402     float *qmsg = newmsg->qgrid;
02403     for ( int g=0; g<numGrids; ++g ) {
02404       char *f = newmsg->fgrid + fgrid_len * g;
02405       float *q = qgrid + qgrid_size * g + (fstart-fgrid_start) * zdim;
02406       for ( int i=0; i<flen; ++i ) {
02407         if ( f[i] ) {
02408           for ( int k=0; k<zlistlen; ++k ) {
02409             *(qmsg++) = q[zlist[k]];
02410           }
02411         }
02412         q += zdim;
02413       }
02414     }
02415     newmsg->sourceNode = myGridPe;
02416 
02417     SET_PRIORITY(newmsg,grid_sequence,UNGRID_PRIORITY)
02418     CmiEnableUrgentSend(1);
02419 #ifdef NAMD_CUDA
02420     if ( offload ) {
02421       pmeNodeProxy[CkNodeOf(pe)].recvUngrid(newmsg);
02422     } else
02423 #endif
02424     pmeProxyDir[pe].recvUngrid(newmsg);
02425     CmiEnableUrgentSend(0);
02426   }
02427 }

void ComputePmeMgr::sendUntrans ( void   ) 

Definition at line 2183 of file ComputePme.C.

References CKLOOP_CTRL_PME_SENDUNTRANS, PmeEvirMsg::evir, Node::Object(), PME_UNGRID_PRIORITY, PmeSlabSendUntrans(), PRIORITY_SIZE, sendUntransSubset(), SET_PRIORITY, Node::simParameters, and SimParameters::useCkLoop.

02183                                     {
02184 
02185   trans_count = numGridPes;
02186 
02187   { // send energy and virial
02188     PmeEvirMsg *newmsg = new (numGrids, PRIORITY_SIZE) PmeEvirMsg;
02189     for ( int g=0; g<numGrids; ++g ) {
02190       newmsg->evir[g] = recip_evir2[g];
02191     }
02192     SET_PRIORITY(newmsg,grid_sequence,PME_UNGRID_PRIORITY)
02193     CmiEnableUrgentSend(1);
02194     pmeProxy[recipEvirPe].recvRecipEvir(newmsg);
02195     CmiEnableUrgentSend(0);
02196   }
02197 
02198 #if     CMK_SMP && USE_CKLOOP
02199   int useCkLoop = Node::Object()->simParameters->useCkLoop;
02200   if ( useCkLoop >= CKLOOP_CTRL_PME_SENDUNTRANS && CkNumPes() >= 2 * numTransPes) {
02201     CkLoop_Parallelize(PmeSlabSendUntrans, 1, (void *)this, CkMyNodeSize(), 0, numGridNodes-1, 0); // no sync
02202   } else
02203 #endif
02204   {
02205     sendUntransSubset(0, numGridNodes-1);
02206   }
02207 
02208 }

void ComputePmeMgr::sendUntransSubset ( int  first,
int  last 
)

Definition at line 2210 of file ComputePme.C.

References PmeGrid::dim3, fwdSharedUntrans(), j, PmeGrid::K2, NodePmeInfo::npe, LocalPmeInfo::nx, PmeUntransMsg::ny, LocalPmeInfo::ny_after_transpose, NodePmeInfo::pe_start, PME_UNTRANS_PRIORITY, PRIORITY_SIZE, qgrid, PmeUntransMsg::qgrid, NodePmeInfo::real_node, SET_PRIORITY, PmeUntransMsg::sourceNode, PmeUntransMsg::y_start, and LocalPmeInfo::y_start_after_transpose.

Referenced by PmeSlabSendUntrans(), and sendUntrans().

02210                                                          {
02211 
02212   int zdim = myGrid.dim3;
02213   int y_start = localInfo[myTransPe].y_start_after_transpose;
02214   int ny = localInfo[myTransPe].ny_after_transpose;
02215   int slicelen = myGrid.K2 * zdim;
02216 
02217   ComputePmeMgr **mgrObjects = pmeNodeProxy.ckLocalBranch()->mgrObjects;
02218 
02219 #if CMK_BLUEGENEL
02220   CmiNetworkProgressAfter (0);
02221 #endif
02222 
02223   // send data for reverse transpose
02224   for (int j=first; j<=last; j++) {
02225     int node = gridNodeOrder[j];  // different order on each node
02226     int pe = gridNodeInfo[node].pe_start;
02227     int npe = gridNodeInfo[node].npe;
02228     int totlen = 0;
02229     if ( node != myGridNode ) for (int i=0; i<npe; ++i, ++pe) {
02230       LocalPmeInfo &li = localInfo[pe];
02231       int cpylen = li.nx * zdim;
02232       totlen += cpylen;
02233     }
02234     PmeUntransMsg *newmsg = new (ny * totlen * numGrids, PRIORITY_SIZE) PmeUntransMsg;
02235     newmsg->sourceNode = myTransPe;
02236     newmsg->y_start = y_start;
02237     newmsg->ny = ny;
02238     for ( int g=0; g<numGrids; ++g ) {
02239       float *qmsg = newmsg->qgrid + ny * totlen * g;
02240       pe = gridNodeInfo[node].pe_start;
02241       for (int i=0; i<npe; ++i, ++pe) {
02242         LocalPmeInfo &li = localInfo[pe];
02243         if ( node == myGridNode ) {
02244           ComputePmeMgr *m = mgrObjects[CkRankOf(gridPeMap[pe])];
02245           qmsg = m->qgrid + m->qgrid_size * g + y_start * zdim;
02246           float *q = kgrid + qgrid_size*g + li.x_start*ny*zdim;
02247           int cpylen = ny * zdim;
02248           for ( int x = 0; x < li.nx; ++x ) {
02249             CmiMemcpy((void*)qmsg, (void*)q, cpylen*sizeof(float));
02250             q += cpylen;
02251             qmsg += slicelen;
02252           }
02253         } else {
02254           CmiMemcpy((void*)qmsg,
02255                 (void*)(kgrid + qgrid_size*g + li.x_start*ny*zdim),
02256                 li.nx*ny*zdim*sizeof(float));
02257           qmsg += li.nx*ny*zdim;
02258         }
02259       }
02260     }
02261     SET_PRIORITY(newmsg,grid_sequence,PME_UNTRANS_PRIORITY)
02262     if ( node == myGridNode ) newmsg->ny = 0;
02263     if ( npe > 1 ) {
02264       if ( node == myGridNode ) fwdSharedUntrans(newmsg);
02265       else pmeNodeProxy[gridNodeInfo[node].real_node].recvUntrans(newmsg);
02266     } else pmeProxy[gridPeMap[gridNodeInfo[node].pe_start]].recvUntrans(newmsg);
02267   }
02268 }

void ComputePmeMgr::submitReductions (  ) 

Definition at line 4197 of file ComputePme.C.

References ComputePmeUtil::alchDecouple, ComputePmeUtil::alchOn, NAMD_bug(), Node::Object(), Node::simParameters, and simParams.

Referenced by ComputePme::doWork(), and recvRecipEvir().

04197                                      {
04198 
04199     SimParameters *simParams = Node::Object()->simParameters;
04200 
04201     for ( int g=0; g<numGrids; ++g ) {
04202       float scale = 1.;
04203       if (alchOn) {
04204         BigReal elecLambdaUp, elecLambdaDown;
04205         if( simParams->alchFepWhamOn ) {
04206           if( simParams->alchFepElecOn ) {
04207             elecLambdaUp = simParams->alchElecLambda;
04208             elecLambdaDown = 1.0 - simParams->alchElecLambda;
04209           }
04210           else {
04211             elecLambdaUp = 0.0;
04212             elecLambdaDown = 1.0;
04213           }
04214         }
04215         else {
04216           // alchLambda set on each step in ComputePme::ungridForces()
04217           if ( alchLambda < 0 || alchLambda > 1 ) {
04218             NAMD_bug("ComputePmeMgr::submitReductions alchLambda out of range");
04219           }
04220           elecLambdaUp = simParams->getElecLambda(alchLambda);
04221           elecLambdaDown = simParams->getElecLambda(1-alchLambda);
04222         }
04223         if ( g == 0 ) scale = elecLambdaUp;
04224         else if ( g == 1 ) scale = elecLambdaDown;
04225         else if ( g == 2 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04226         if (alchDecouple) {
04227           if ( g == 2 ) scale = 1-elecLambdaUp;
04228           else if ( g == 3 ) scale = 1-elecLambdaDown;
04229           else if ( g == 4 ) scale = (elecLambdaUp + elecLambdaDown - 1)*(-1);
04230         }
04231       } else if ( lesOn ) {
04232         scale = 1.0 / lesFactor;
04233       } else if ( pairOn ) {
04234         scale = ( g == 0 ? 1. : -1. );
04235       }
04236       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += evir[g][0] * scale;
04237       reduction->item(REDUCTION_VIRIAL_SLOW_XX) += evir[g][1] * scale;
04238       reduction->item(REDUCTION_VIRIAL_SLOW_XY) += evir[g][2] * scale;
04239       reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += evir[g][3] * scale;
04240       reduction->item(REDUCTION_VIRIAL_SLOW_YX) += evir[g][2] * scale;
04241       reduction->item(REDUCTION_VIRIAL_SLOW_YY) += evir[g][4] * scale;
04242       reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += evir[g][5] * scale;
04243       reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += evir[g][3] * scale;
04244       reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += evir[g][5] * scale;
04245       reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += evir[g][6] * scale;
04246 
04247       float scale2 = 0.;
04248 
04249       // why is this declared/defined again here?
04250       SimParameters *simParams = Node::Object()->simParameters;
04251 
04252       if (alchFepOn) {
04253         BigReal elecLambda2Up=0.0, elecLambda2Down=0.0;
04254         if(simParams->alchFepWhamOn) {
04255           if(simParams->alchFepElecOn) {
04256             elecLambda2Up = simParams->alchElecLambda;
04257             elecLambda2Down =  1.0 - simParams->alchElecLambda;
04258           }
04259           else {
04260             elecLambda2Up = 0.0;
04261             elecLambda2Down =  1.0;
04262           }
04263         }
04264         else {
04265           elecLambda2Up = simParams->getElecLambda(alchLambda2);
04266           elecLambda2Down = simParams->getElecLambda(1.-alchLambda2);
04267         }
04268         
04269         if ( g == 0 ) scale2 = elecLambda2Up;
04270         else if ( g == 1 ) scale2 = elecLambda2Down;
04271         else if ( g == 2 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04272         if (alchDecouple && g == 2 ) scale2 = 1 - elecLambda2Up;
04273         else if (alchDecouple && g == 3 ) scale2 = 1 - elecLambda2Down;
04274         else if (alchDecouple && g == 4 ) scale2 = (elecLambda2Up + elecLambda2Down - 1)*(-1);
04275       }
04276       if(simParams->alchFepWhamOn && simParams->alchFepElecOn)  {       // FEP with wham post-process
04277         if( g==0 )      scale2 = scale + 1.0;
04278         else if( g==1 ) scale2 = scale - 1.0;
04279         else if( g==2 ) scale2 = scale - 1.0;
04280         else if( g==3 ) scale2 = scale + 1.0;
04281       }
04282       reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += evir[g][0] * scale2;
04283       
04284       if (alchThermIntOn) {
04285         
04286         // no decoupling:
04287         // part. 1 <-> all of system except partition 2: g[0] - g[2] 
04288         // (interactions between all atoms [partition 0 OR partition 1], 
04289         // minus all [within partition 0])
04290         // U = elecLambdaUp * (U[0] - U[2])
04291         // dU/dl = U[0] - U[2];
04292         
04293         // part. 2 <-> all of system except partition 1: g[1] - g[2] 
04294         // (interactions between all atoms [partition 0 OR partition 2], 
04295         // minus all [within partition 0])
04296         // U = elecLambdaDown * (U[1] - U[2])
04297         // dU/dl = U[1] - U[2];
04298 
04299         // alchDecouple:
04300         // part. 1 <-> part. 0: g[0] - g[2] - g[4] 
04301         // (interactions between all atoms [partition 0 OR partition 1]
04302         // minus all [within partition 1] minus all [within partition 0]
04303         // U = elecLambdaUp * (U[0] - U[4]) + (1-elecLambdaUp)* U[2]
04304         // dU/dl = U[0] - U[2] - U[4];
04305 
04306         // part. 2 <-> part. 0: g[1] - g[3] - g[4] 
04307         // (interactions between all atoms [partition 0 OR partition 2]
04308         // minus all [within partition 2] minus all [within partition 0]
04309         // U = elecLambdaDown * (U[1] - U[4]) + (1-elecLambdaDown)* U[3]
04310         // dU/dl = U[1] - U[3] - U[4];
04311         
04312         
04313         if ( g == 0 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) += evir[g][0];
04314         if ( g == 1 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) += evir[g][0];
04315         if (!alchDecouple) {
04316           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04317           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04318         }
04319         else {  // alchDecouple
04320           if ( g == 2 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04321           if ( g == 3 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04322           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_1) -= evir[g][0];
04323           if ( g == 4 ) reduction->item(REDUCTION_ELECT_ENERGY_PME_TI_2) -= evir[g][0];
04324         }
04325       }
04326     }
04327 
04328     alchLambda = -1.;  // illegal value to catch if not updated
04329 
04330     reduction->item(REDUCTION_STRAY_CHARGE_ERRORS) += strayChargeErrors;
04331     reduction->submit();
04332 
04333   for ( int i=0; i<heldComputes.size(); ++i ) {
04334     WorkDistrib::messageEnqueueWork(heldComputes[i]);
04335   }
04336   heldComputes.resize(0);
04337 }
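
The block comments above give the thermodynamic-integration bookkeeping in words. The helper below restates it as a standalone sketch (it is not part of ComputePme.C): U[g] stands for the slow electrostatic energy evir[g][0] of PME grid g, and the two outputs correspond to the totals accumulated into REDUCTION_ELECT_ENERGY_PME_TI_1 and REDUCTION_ELECT_ENERGY_PME_TI_2 by the loop above.

    // Illustrative sketch only; not NAMD source.
    // U[g] is the slow electrostatic energy from PME grid g (g = 0..4).
    static void pmeTIDerivatives(const double U[5], bool alchDecouple,
                                 double &dU_dlambda_up, double &dU_dlambda_down)
    {
      if ( ! alchDecouple ) {
        // each alchemical partition interacts with everything except the other one
        dU_dlambda_up   = U[0] - U[2];
        dU_dlambda_down = U[1] - U[2];
      } else {
        // alchDecouple: each alchemical partition interacts with partition 0 only
        dU_dlambda_up   = U[0] - U[2] - U[4];
        dU_dlambda_down = U[1] - U[3] - U[4];
      }
    }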

void ComputePmeMgr::ungridCalc ( void   ) 

Definition at line 2513 of file ComputePme.C.

References a_data_dev, cuda_errcheck(), CUDA_EVENT_ID_PME_COPY, CUDA_EVENT_ID_PME_KERNEL, CUDA_EVENT_ID_PME_TICK, deviceCUDA, end_forces, NodePmeMgr::end_potential_memcpy, EVENT_STRIDE, f_data_dev, f_data_host, forces_count, forces_done_count, forces_time, DeviceCUDA::getDeviceID(), j, PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, WorkDistrib::messageEnqueueWork(), NodePmeMgr::mgrObjects, PmeGrid::order, pmeComputes, ResizeArray< Elem >::size(), this_pe, and ungridCalc().

Referenced by ungridCalc().

02513                                    {
02514   // CkPrintf("ungridCalc on Pe(%d)\n",CkMyPe());
02515 
02516   ungridForcesCount = pmeComputes.size();
02517 
02518 #ifdef NAMD_CUDA
02519  if ( offload ) {
02520   //CmiLock(cuda_lock);
02521   cudaSetDevice(deviceCUDA->getDeviceID());
02522 
02523   if ( this == masterPmeMgr ) {
02524     double before = CmiWallTimer();
02525     cudaMemcpyAsync(v_data_dev, q_data_host, q_data_size, cudaMemcpyHostToDevice, 0 /*streams[stream]*/);
02526     cudaEventRecord(nodePmeMgr->end_potential_memcpy, 0 /*streams[stream]*/);
02527     traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02528 
02529     const int myrank = CkMyRank();
02530     for ( int i=0; i<CkMyNodeSize(); ++i ) {
02531       if ( myrank != i && nodePmeMgr->mgrObjects[i]->pmeComputes.size() ) {
02532         nodePmeMgr->mgrObjects[i]->ungridCalc();
02533       }
02534     }
02535     if ( ! pmeComputes.size() ) return;
02536   }
02537 
02538   if ( ! end_forces ) {
02539     int n=(pmeComputes.size()-1)/EVENT_STRIDE+1;
02540     end_forces = new cudaEvent_t[n];
02541     for ( int i=0; i<n; ++i ) {
02542       cudaEventCreateWithFlags(&end_forces[i],cudaEventDisableTiming);
02543     }
02544   }
02545 
02546   const int pcsz = pmeComputes.size();
02547   if ( ! afn_host ) {
02548     cudaMallocHost((void**) &afn_host, 3*pcsz*sizeof(float*));
02549     cudaMalloc((void**) &afn_dev, 3*pcsz*sizeof(float*));
02550     cuda_errcheck("malloc params for pme");
02551   }
02552   int totn = 0;
02553   for ( int i=0; i<pcsz; ++i ) {
02554     int n = pmeComputes[i]->numGridAtoms[0];
02555     totn += n;
02556   }
02557   if ( totn > f_data_mgr_alloc ) {
02558     if ( f_data_mgr_alloc ) {
02559       CkPrintf("Expanding CUDA forces allocation because %d > %d\n", totn, f_data_mgr_alloc);
02560       cudaFree(f_data_mgr_dev);
02561       cudaFreeHost(f_data_mgr_host);
02562     }
02563     f_data_mgr_alloc = 1.2 * (totn + 100);
02564     cudaMalloc((void**) &f_data_mgr_dev, 3*f_data_mgr_alloc*sizeof(float));
02565     cudaMallocHost((void**) &f_data_mgr_host, 3*f_data_mgr_alloc*sizeof(float));
02566     cuda_errcheck("malloc forces for pme");
02567   }
02568   // CkPrintf("pe %d pcsz %d totn %d alloc %d\n", CkMyPe(), pcsz, totn, f_data_mgr_alloc);
02569   float *f_dev = f_data_mgr_dev;
02570   float *f_host = f_data_mgr_host;
02571   for ( int i=0; i<pcsz; ++i ) {
02572     int n = pmeComputes[i]->numGridAtoms[0];
02573     pmeComputes[i]->f_data_dev = f_dev;
02574     pmeComputes[i]->f_data_host = f_host;
02575     afn_host[3*i  ] = a_data_dev + 7 * pmeComputes[i]->cuda_atoms_offset;
02576     afn_host[3*i+1] = f_dev;
02577     afn_host[3*i+2] = f_dev + n;  // avoid type conversion issues
02578     f_dev += 3*n;
02579     f_host += 3*n;
02580   }
02581   //CmiLock(cuda_lock);
02582   double before = CmiWallTimer();
02583   cudaMemcpyAsync(afn_dev, afn_host, 3*pcsz*sizeof(float*), cudaMemcpyHostToDevice, streams[stream]);
02584   traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02585   cudaStreamWaitEvent(streams[stream], nodePmeMgr->end_potential_memcpy, 0);
02586   traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02587 
02588   for ( int i=0; i<pcsz; ++i ) {
02589     // cudaMemsetAsync(pmeComputes[i]->f_data_dev, 0, 3*n*sizeof(float), streams[stream]);
02590     if ( i%EVENT_STRIDE == 0 ) {
02591       int dimy = pcsz - i;
02592       if ( dimy > EVENT_STRIDE ) dimy = EVENT_STRIDE;
02593       int maxn = 0;
02594       int subtotn = 0;
02595       for ( int j=0; j<dimy; ++j ) {
02596         int n = pmeComputes[i+j]->numGridAtoms[0];
02597         subtotn += n;
02598         if ( n > maxn ) maxn = n;
02599       }
02600       // CkPrintf("pe %d dimy %d maxn %d subtotn %d\n", CkMyPe(), dimy, maxn, subtotn);
02601       before = CmiWallTimer();
02602       cuda_pme_forces(
02603         bspline_coeffs_dev,
02604         v_arr_dev, afn_dev+3*i, dimy, maxn, /*
02605         pmeComputes[i]->a_data_dev,
02606         pmeComputes[i]->f_data_dev,
02607         n, */ myGrid.K1, myGrid.K2, myGrid.K3, myGrid.order,
02608         streams[stream]);
02609       traceUserBracketEvent(CUDA_EVENT_ID_PME_KERNEL,before,CmiWallTimer());
02610       before = CmiWallTimer();
02611       cudaMemcpyAsync(pmeComputes[i]->f_data_host, pmeComputes[i]->f_data_dev, 3*subtotn*sizeof(float),
02612         cudaMemcpyDeviceToHost, streams[stream]);
02613       traceUserBracketEvent(CUDA_EVENT_ID_PME_COPY,before,CmiWallTimer());
02614       cudaEventRecord(end_forces[i/EVENT_STRIDE], streams[stream]);
02615       traceUserEvent(CUDA_EVENT_ID_PME_TICK);
02616     }
02617     // CkPrintf("pe %d c %d natoms %d fdev %lld fhost %lld\n", CkMyPe(), i, (int64)afn_host[3*i+2], pmeComputes[i]->f_data_dev, pmeComputes[i]->f_data_host);
02618   }
02619   //CmiUnlock(cuda_lock);
02620  } else
02621 #endif // NAMD_CUDA
02622  {
02623   for ( int i=0; i<pmeComputes.size(); ++i ) {
02624     WorkDistrib::messageEnqueueWork(pmeComputes[i]);
02625     // pmeComputes[i]->ungridForces();
02626   }
02627  }
02628   // submitReductions();  // must follow all ungridForces()
02629 
02630 #ifdef NAMD_CUDA
02631  if ( offload ) {
02632   forces_time = CmiWallTimer();
02633   forces_count = ungridForcesCount;
02634   forces_done_count = 0;
02635   pmeProxy[this_pe].pollForcesReady();
02636  }
02637 #endif
02638 
02639   ungrid_count = (usePencils ? numPencilsActive : numDestRecipPes );
02640 }
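
The CUDA branch above batches the force work: computes are handled EVENT_STRIDE at a time, each batch issuing one cuda_pme_forces launch plus one asynchronous device-to-host copy, then recording end_forces[i/EVENT_STRIDE] on the stream. pollForcesReady() later checks those events from the host. The fragment below is a minimal sketch of that per-batch polling idea, assuming a cudaEventQuery-based check; every name other than end_forces and EVENT_STRIDE is hypothetical, and the real code re-enqueues a poll message through the Charm++ scheduler rather than spinning.

    // Illustrative sketch only; not NAMD source.
    #include <cuda_runtime.h>

    // Returns how many leading batches have finished, checking events in order.
    int countFinishedBatches(cudaEvent_t *end_forces, int numBatches) {
      int done = 0;
      while ( done < numBatches ) {
        cudaError_t err = cudaEventQuery(end_forces[done]);
        if ( err == cudaSuccess ) {
          ++done;            // forces for this batch have reached the host
        } else {
          break;             // cudaErrorNotReady (or an error): poll again later
        }
      }
      return done;
    }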


Friends And Related Function Documentation

friend class ComputePme [friend]

Definition at line 357 of file ComputePme.C.

friend class NodePmeMgr [friend]

Definition at line 358 of file ComputePme.C.


Member Data Documentation

float* ComputePmeMgr::a_data_dev

Definition at line 419 of file ComputePme.C.

Referenced by cuda_submit_charges(), and ungridCalc().

float* ComputePmeMgr::a_data_host

Definition at line 418 of file ComputePme.C.

int ComputePmeMgr::chargeGridSubmittedCount

Definition at line 444 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), initialize_computes(), and sendChargeGridReady().

double ComputePmeMgr::charges_time

Definition at line 430 of file ComputePme.C.

Referenced by cuda_check_pme_charges(), and cuda_submit_charges().

int ComputePmeMgr::check_charges_count

Definition at line 432 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_charges().

int ComputePmeMgr::check_forces_count

Definition at line 433 of file ComputePme.C.

Referenced by ComputePmeMgr(), and cuda_check_pme_forces().

int ComputePmeMgr::cuda_atoms_alloc

Definition at line 423 of file ComputePme.C.

Referenced by ComputePmeMgr().

int ComputePmeMgr::cuda_atoms_count

Definition at line 422 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_submit_charges(), and ComputePme::initialize().

bool ComputePmeMgr::cuda_busy [static]

Definition at line 442 of file ComputePme.C.

CmiNodeLock ComputePmeMgr::cuda_lock [static]

Definition at line 424 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize_computes(), and recvAck().

std::deque< ComputePmeMgr::cuda_submit_charges_args > ComputePmeMgr::cuda_submit_charges_deque [static]

Definition at line 441 of file ComputePme.C.

cudaEvent_t ComputePmeMgr::end_charges

Definition at line 426 of file ComputePme.C.

Referenced by chargeGridSubmitted(), ComputePmeMgr(), and cuda_check_pme_charges().

cudaEvent_t* ComputePmeMgr::end_forces

Definition at line 427 of file ComputePme.C.

Referenced by ComputePmeMgr(), cuda_check_pme_forces(), and ungridCalc().

float* ComputePmeMgr::f_data_dev

Definition at line 421 of file ComputePme.C.

Referenced by ungridCalc().

float* ComputePmeMgr::f_data_host

Definition at line 420 of file ComputePme.C.

Referenced by ungridCalc().

CmiNodeLock ComputePmeMgr::fftw_plan_lock [static]

Definition at line 414 of file ComputePme.C.

Referenced by ComputePmeMgr(), initialize(), and ~ComputePmeMgr().
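
FFTW's planner is not thread-safe, so on an SMP node plan creation and destruction must be serialized; this node-level lock exists for that purpose. A minimal sketch of the bracketing it implies (illustrative only; the actual planner calls live in initialize() and the destructor, which are not shown here):

    CmiLock(ComputePmeMgr::fftw_plan_lock);    // serialize FFTW planner calls
    // ... create or destroy FFTW plans ...
    CmiUnlock(ComputePmeMgr::fftw_plan_lock);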

int ComputePmeMgr::forces_count

Definition at line 428 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::forces_done_count

Definition at line 429 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

double ComputePmeMgr::forces_time

Definition at line 431 of file ComputePme.C.

Referenced by cuda_check_pme_forces(), and ungridCalc().

int ComputePmeMgr::master_pe

Definition at line 434 of file ComputePme.C.

Referenced by chargeGridSubmitted(), initialize_computes(), recvAck(), and sendChargeGridReady().

ResizeArray<ComputePme*> ComputePmeMgr::pmeComputes

Definition at line 454 of file ComputePme.C.

Referenced by chargeGridReady(), cuda_check_pme_forces(), getComputes(), ComputePme::noWork(), recvRecipEvir(), sendChargeGridReady(), and ungridCalc().

CmiNodeLock ComputePmeMgr::pmemgr_lock

Definition at line 415 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ~ComputePmeMgr().

Lattice* ComputePmeMgr::saved_lattice

Definition at line 447 of file ComputePme.C.

Referenced by chargeGridSubmitted(), and recvChargeGridReady().

int ComputePmeMgr::saved_sequence

Definition at line 448 of file ComputePme.C.

Referenced by chargeGridSubmitted(), cuda_check_pme_charges(), cuda_check_pme_forces(), and recvChargeGridReady().

int ComputePmeMgr::sendDataHelper_errors

Definition at line 373 of file ComputePme.C.

Referenced by sendData(), and NodePmeMgr::sendDataHelper().

Lattice* ComputePmeMgr::sendDataHelper_lattice

Definition at line 370 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sequence

Definition at line 371 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::sendDataHelper_sourcepe

Definition at line 372 of file ComputePme.C.

Referenced by sendData(), NodePmeMgr::sendDataHelper(), sendPencils(), and NodePmeMgr::sendPencilsHelper().

int ComputePmeMgr::this_pe

Definition at line 435 of file ComputePme.C.

Referenced by ComputePmeMgr(), and ungridCalc().


The documentation for this class was generated from the following file:
ComputePme.C

Generated on Fri Aug 17 01:17:18 2018 for NAMD by doxygen 1.4.7