fftlib.C

Go to the documentation of this file.
00001 
00002 #include "fftlib.h"
00003 
00004 #include <assert.h>
00005 
00006 void call_ck_cb (void *arg) {
00007   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00008   cw->cb.send (cw->msg);
00009 }
00010 
00011 void call_ck_cb_recv_grid (void *arg) {
00012   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00013   OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00014   OptPmeZPencil *zp = (OptPmeZPencil *)cw->array;
00015   zp->many_to_manyRecvGrid(msg);
00016 }
00017 
00018 void call_ck_cb_recv_trans_y (void *arg) {
00019   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00020   OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00021   OptPmeYPencil *yp = (OptPmeYPencil *)cw->array;
00022   yp->many_to_manyRecvTrans(msg);
00023 }
00024 
00025 void call_ck_cb_recv_trans_x (void *arg) {
00026   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00027   OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00028   OptPmeXPencil *xp = (OptPmeXPencil *)cw->array;
00029   xp->many_to_manyRecvTrans(msg);
00030 }
00031 
00032 void call_ck_cb_recv_untrans_y (void *arg) {
00033   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00034   OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00035   OptPmeYPencil *yp = (OptPmeYPencil *)cw->array;
00036   yp->many_to_manyRecvUntrans(msg);
00037 }
00038 
00039 void call_ck_cb_recv_untrans_z (void *arg) {
00040   CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00041   OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00042   OptPmeZPencil *zp = (OptPmeZPencil *)cw->array;
00043   zp->many_to_manyRecvUntrans(msg);
00044 }
00045 
00046 void OptPmeZPencil::fft_init() {
00047   
00048   //printf ("Initialize zpencil [%d,%d], on pd %d\n", thisIndex.x, thisIndex.y, CkMyPe());
00049 
00050   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00051   Node *node = nd.ckLocalBranch();
00052   SimParameters *simParams = node->simParameters;
00053 
00054   int K1 = initdata.grid.K1;
00055   int K2 = initdata.grid.K2;
00056   int K3 = initdata.grid.K3;
00057   int dim3 = initdata.grid.dim3;
00058   int block1 = initdata.grid.block1;
00059   int block2 = initdata.grid.block2;
00060 
00061   nx = block1;
00062   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00063   ny = block2;
00064   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00065 
00066   data = new float[nx*ny*dim3];
00067   many_to_many_data = new float[nx*ny*dim3];
00068   many_to_many_nb = new int [initdata.zBlocks];
00069   work = new float[dim3];
00070 
00071   memset(data, 0, sizeof(float) * nx*ny*dim3);
00072   memset(many_to_many_data, 0, sizeof(float) * nx*ny*dim3);
00073 
00074   order_init(initdata.zBlocks);
00075 
00076 #ifdef NAMD_FFTW
00077   CmiLock(fftw_plan_lock);
00078 #ifdef NAMD_FFTW_3
00079   /* need array of sizes for the how many */
00080   int numLines=nx*ny;
00081   int planLineSizes[1];
00082   planLineSizes[0]=K3;
00083   CkAbort("what are we doing in here?");
00084   forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, numLines,
00085                                      (float *) data, NULL, 1, 
00086                                          initdata.grid.dim3,
00087                                          (fftwf_complex *) data, NULL, 1, 0,
00088                                    ( simParams->FFTWEstimate ? FFTW_ESTIMATE 
00089                                      : FFTW_MEASURE ));
00090   backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, numLines,
00091                                      (fftwf_complex *) data, NULL, 1, 
00092                                          initdata.grid.dim3/2,
00093                                      (float *) data, NULL, 1, 0,
00094                                    ( simParams->FFTWEstimate ? FFTW_ESTIMATE 
00095                                      : FFTW_MEASURE ));
00096 #else
00097 
00098   forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
00099         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00100         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00101   backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
00102         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00103         | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00104 #endif
00105   CmiUnlock(fftw_plan_lock);
00106 #else
00107   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00108 #endif
00109 
00110   handle = CmiDirect_manytomany_allocate_handle();
00111 }
00112 
00113 void OptPmeYPencil::fft_init() {
00114   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00115   Node *node = nd.ckLocalBranch();
00116   SimParameters *simParams = node->simParameters;
00117 
00118   //  printf ("Initialize ypencil [%d,%d], on pd %d\n", thisIndex.x, thisIndex.y, CkMyPe());
00119 
00120   int K1 = initdata.grid.K1;
00121   int K2 = initdata.grid.K2;
00122   int dim2 = initdata.grid.dim2;
00123   int dim3 = initdata.grid.dim3;
00124   int block1 = initdata.grid.block1;
00125   int block3 = initdata.grid.block3;
00126 
00127   nx = block1;
00128   if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00129   nz = block3;
00130   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00131 
00132   data = new float[nx*dim2*nz*2];
00133   many_to_many_data = new float[nx*dim2*nz*2];
00134   many_to_many_nb = new int [initdata.yBlocks];
00135   work = new float[2*K2];
00136 
00137   memset(many_to_many_data, 0, sizeof(float) * nx*dim2*nz*2);
00138 
00139   order_init(initdata.yBlocks);
00140 
00141 #ifdef NAMD_FFTW
00142   CmiLock(fftw_plan_lock);
00143 #ifdef NAMD_FFTW_3
00144   /* need array of sizes for the dimensions */
00145   int numLines=nz;
00146   int planLineSizes[2];
00147   planLineSizes[0]=initdata.grid.K2;
00148   planLineSizes[1]=nz;
00149   forward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines, 
00150                                      (fftwf_complex *) data, NULL, nz, 1,
00151                                      (fftwf_complex *) data, NULL, 1, 0,
00152                                      FFTW_FORWARD, 
00153                                      ( simParams->FFTWEstimate 
00154                                        ? FFTW_ESTIMATE : FFTW_MEASURE ));
00155   backward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines, 
00156                                      (fftwf_complex *) data, NULL, nz, 1,
00157                                      (fftwf_complex *) data, NULL, 1, 0,
00158                                      FFTW_FORWARD, 
00159                                      ( simParams->FFTWEstimate 
00160                                        ? FFTW_ESTIMATE : FFTW_MEASURE ));
00161 #else
00162 
00163   forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
00164         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00165         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00166         nz, (fftw_complex *) work, 1);
00167   backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
00168         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00169         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00170         nz, (fftw_complex *) work, 1);
00171 #endif
00172   CmiUnlock(fftw_plan_lock);
00173 #else
00174   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00175 #endif
00176 
00177   handle = CmiDirect_manytomany_allocate_handle();
00178   initialize_manytomany();
00179 }
00180 
00181 
00182 void OptPmeXPencil::fft_init() {
00183   CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00184   Node *node = nd.ckLocalBranch();
00185   SimParameters *simParams = node->simParameters;
00186 
00187   //  printf ("Initialize xpencil [%d,%d], on pd %d\n", thisIndex.x, thisIndex.y, CkMyPe());
00188 
00189   lattice = simParams->lattice;
00190 
00191   int K1 = initdata.grid.K1;
00192   int K2 = initdata.grid.K2;
00193   int dim3 = initdata.grid.dim3;
00194   int block2 = initdata.grid.block2;
00195   int block3 = initdata.grid.block3;
00196 
00197   ny = block2;
00198   if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00199   nz = block3;
00200   if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00201 
00202   data = new float[K1*block2*block3*2];
00203   many_to_many_data = new float[K1*block2*block3*2];
00204   many_to_many_nb = new int [initdata.xBlocks];
00205   work = new float[2*K1];
00206 
00207   memset(many_to_many_data, 0, sizeof(float) * K1*block2*block3*2);
00208 
00209   order_init(initdata.xBlocks);
00210 
00211 #ifdef NAMD_FFTW
00212   CmiLock(fftw_plan_lock);
00213 #ifdef NAMD_FFTW_3
00214   /* need array of sizes for the how many */
00215   int numLines=ny*nz;
00216   int planLineSizes[1];
00217   planLineSizes[0]=K1;
00218   forward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00219                                      (fftwf_complex *) data, NULL, K1, 1,
00220                                      (fftwf_complex *) data, NULL, 1, 0,
00221                                    FFTW_FORWARD,
00222                                    ( simParams->FFTWEstimate ? FFTW_ESTIMATE 
00223                                      : FFTW_MEASURE ));
00224   backward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00225                                      (fftwf_complex *) data, NULL, K1, 1,
00226                                      (fftwf_complex *) data, NULL, 1, 0,
00227                                    FFTW_BACKWARD,
00228                                    ( simParams->FFTWEstimate ? FFTW_ESTIMATE 
00229                                      : FFTW_MEASURE ));
00230 #else
00231 
00232   forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
00233         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00234         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00235         ny*nz, (fftw_complex *) work, 1);
00236   backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
00237         ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00238         | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00239         ny*nz, (fftw_complex *) work, 1);
00240 
00241   CmiUnlock(fftw_plan_lock);
00242 #endif
00243 #else
00244   NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00245 #endif
00246 
00247   myKSpace = new PmeKSpace(initdata.grid,
00248                 thisIndex.y*block2, thisIndex.y*block2 + ny,
00249                 thisIndex.z*block3, thisIndex.z*block3 + nz);
00250 
00251   handle = CmiDirect_manytomany_allocate_handle();
00252   initialize_manytomany();
00253 
00254   constant_pressure = initdata.constant_pressure;
00255 
00256   reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00257 }
00258 
00259 //#define FFTCHECK   // run a grid of integers through the fft
00260 // #define ZEROCHECK  // check for suspicious zeros in fft
00261 
00262 void OptPmeZPencil::recv_grid(const OptPmeGridMsg *msg) {
00263 
00264   int dim3 = initdata.grid.dim3;
00265   if ( imsg == 0 ) {
00266     memset(data, 0, sizeof(float) * nx*ny*dim3);
00267   }
00268 
00269   int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00270   int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00271   assert (xstart >= 0);
00272   assert (ystart >= 0);
00273   int xlen   = msg->xlen;
00274   int ylen   = msg->ylen;    
00275   assert (xstart + xlen <= nx);
00276   assert (ystart + ylen <= ny);
00277   
00278   float *qmsg = msg->qgrid;
00279   float *d = data;
00280   int zlen = msg->zlen;
00281   int zstart = msg->zstart;
00282   int zmax = zstart + zlen - 1;
00283   int k = 0;
00284 
00285   int K3 = initdata.grid.K3; 
00286   int K3_1 = initdata.grid.K3 - 1; 
00287   while (zstart < 0) {
00288     zstart += K3;
00289     zmax += K3;
00290   }
00291 
00292   for ( int i=xstart; i<xstart+xlen; ++i ) {
00293     for ( int j=ystart; j<ystart+ylen; ++j) {
00294       float *d = data + (i * ny + j) * dim3; 
00295       for ( k=zstart; k<=zmax; ++k ) {
00296         int kz = k;
00297         //if (kz >= K3) kz -= K3;
00298         kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00299         //assert (kz >= 0);
00300         //assert (kz <  K3);
00301         d[kz] += *(qmsg++);
00302       }
00303     }
00304   }
00305 
00306   if (_iter == MANY_TO_MANY_SETUP) {
00307     if (imsg == 0)
00308       m2m_recv_grid = new PatchGridElem [grid_msgs.size()];    
00309     
00310     m2m_recv_grid[imsg].xstart = xstart;
00311     m2m_recv_grid[imsg].xlen   = xlen;
00312     m2m_recv_grid[imsg].ystart = ystart;
00313     m2m_recv_grid[imsg].ylen   = ylen;
00314     m2m_recv_grid[imsg].zstart = zstart;
00315     m2m_recv_grid[imsg].zlen   = zlen;
00316     m2m_recv_grid[imsg].patchID = msg->patchID;
00317     
00318     if ( imsg == grid_msgs.size() - 1) 
00319         initialize_manytomany ();
00320   }
00321 }
00322 
00323 
00324 void OptPmeZPencil::forward_fft() {
00325 #ifdef FFTCHECK
00326   int dim3 = initdata.grid.dim3;
00327   int K3 = initdata.grid.K3;
00328   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00329   float *d = data;
00330   for ( int i=0; i<nx; ++i ) {
00331    for ( int j=0; j<ny; ++j, d += dim3 ) {
00332     for ( int k=0; k<dim3; ++k ) {
00333       d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
00334     }
00335    }
00336   }
00337 #endif
00338 #ifdef NAMD_FFTW
00339 #ifdef NAMD_FFTW_3
00340   fftwf_execute(forward_plan);
00341 #else
00342   rfftwnd_real_to_complex(forward_plan, nx*ny,
00343                           data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
00344 #endif
00345 #endif
00346 }
00347 
00348 void OptPmeZPencil::send_trans() {
00349   int zBlocks = initdata.zBlocks;
00350   int block3 = initdata.grid.block3;
00351   int dim3 = initdata.grid.dim3;
00352 
00353   //int offset = 0;
00354   for ( int isend=0; isend<zBlocks; ++isend ) {
00355     int kb = send_order[isend];
00356     int nz = block3;
00357     if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00358     //assert (nz > 0);
00359     OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00360     msg->sourceNode = thisIndex.y;
00361     msg->nx = ny;
00362     float *md = msg->qgrid;
00363     const float *d = data;
00364     for ( int i=0; i<nx; ++i ) {
00365       for ( int j=0; j<ny; ++j, d += dim3 ) {
00366         for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00367           *(md++) = d[2*k];
00368           *(md++) = d[2*k+1];
00369         }
00370       }
00371     }          
00372     //    printf ("%d, %d: Zpencil Sending trans to %d, %d\n", thisIndex.x, thisIndex.y, thisIndex.x, kb);
00373     initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
00374   }
00375 }
00376 
00377 
00378 void OptPmeYPencil::recv_trans(const OptPmeFFTMsg *msg) {
00379 
00380   int block2 = initdata.grid.block2;
00381   int K2 = initdata.grid.K2;
00382   int jb = msg->sourceNode;
00383   int ny = msg->nx;
00384   const float *md = msg->qgrid;
00385   float *d = data;
00386   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00387     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00388       for ( int k=0; k<nz; ++k ) {
00389         d[2*(j*nz+k)] = *(md++);
00390         d[2*(j*nz+k)+1] = *(md++);
00391       }
00392     }
00393   }
00394 } 
00395 
00396 void OptPmeYPencil::forward_fft() {
00397 #ifdef NAMD_FFTW
00398 #ifdef NAMD_FFTW_3
00399   fftwf_execute(forward_plan);
00400 #else
00401   for ( int i=0; i<nx; ++i ) {
00402     fftw(forward_plan, nz,
00403          ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00404          nz, 1, (fftw_complex *) work, 1, 0);
00405   }
00406 #endif
00407 #endif
00408 }
00409 
00410 void OptPmeYPencil::send_trans() {
00411   int yBlocks = initdata.yBlocks;
00412   int block2 = initdata.grid.block2;
00413   int K2 = initdata.grid.K2;
00414   
00415   for ( int isend=0; isend<yBlocks; ++isend ) {
00416     int jb = send_order[isend];
00417     int ny = block2;
00418     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00419     OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00420     msg->sourceNode = thisIndex.x;
00421     msg->nx = nx;
00422     float *md = msg->qgrid;
00423     const float *d = data;
00424     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00425       for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00426         for ( int k=0; k<nz; ++k ) {
00427           *(md++) = d[2*(j*nz+k)];
00428           *(md++) = d[2*(j*nz+k)+1];
00429         }
00430       }
00431     }
00432     if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
00433                                                   thisIndex.x, jb, thisIndex.z);
00434     
00435     //printf ("%d, %d: Ypencil Sending trans to %d, %d\n", thisIndex.z, thisIndex.x, jb, thisIndex.z);
00436     initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
00437   }
00438 }
00439 
00440 
00441 
00442 void OptPmeXPencil::recv_trans(const OptPmeFFTMsg *msg) {
00443 
00444   int block1 = initdata.grid.block1;
00445   int K1 = initdata.grid.K1;
00446   int ib = msg->sourceNode;
00447   int nx = msg->nx;
00448   const float *md = msg->qgrid;
00449   for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00450     float *d = data + i*ny*nz*2;
00451     for ( int j=0; j<ny; ++j, d += nz*2 ) {
00452       for ( int k=0; k<nz; ++k ) {
00453         d[2*k] = *(md++);
00454         d[2*k+1] = *(md++);
00455       }
00456     }
00457   }
00458 }
00459 
00460 
00461 
00462 void OptPmeXPencil::forward_fft() {
00463 #ifdef NAMD_FFTW
00464 #ifdef NAMD_FFTW_3
00465   fftwf_execute(forward_plan);
00466 #else
00467 
00468   fftw(forward_plan, ny*nz,
00469        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00470 #endif
00471 #endif
00472 }
00473 
00474 void OptPmeXPencil::pme_kspace() {
00475   evir = 0.;  //set evir to 0
00476 
00477 #ifdef FFTCHECK
00478   return;
00479 #endif
00480 
00481   BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
00482   evir[0] = myKSpace->compute_energy(data,
00483                                      lattice, ewaldcof, &(evir[1]), 0);
00484 
00485   //contribute (7*sizeof(double), evir.begin(), CkReduction::sum_double, initdata.cb_energy);
00486 }
00487 
00488 void OptPmeXPencil::submit_evir() {
00489   double * cdata = (double *) evir.begin();
00490   reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += cdata[0];
00491   reduction->item(REDUCTION_VIRIAL_SLOW_XX) += cdata[1];
00492   reduction->item(REDUCTION_VIRIAL_SLOW_XY) += cdata[2];
00493   reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += cdata[3];
00494   reduction->item(REDUCTION_VIRIAL_SLOW_YX) += cdata[2];
00495   reduction->item(REDUCTION_VIRIAL_SLOW_YY) += cdata[4];
00496   reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += cdata[5];
00497   reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += cdata[3];
00498   reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += cdata[5];
00499   reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += cdata[6];   
00500 
00501   SimParameters *simParams = Node::Object()->simParameters;
00502   int fef = simParams->fullElectFrequency;
00503   for (int i = 0; i < fef; i++) {
00504     reduction->submit();
00505   }
00506 }
00507 
00508 void OptPmeXPencil::backward_fft() {
00509 #ifdef NAMD_FFTW
00510 #ifdef NAMD_FFTW_3
00511   fftwf_execute(backward_plan);
00512 #else
00513 
00514   fftw(backward_plan, ny*nz,
00515        ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00516 #endif
00517 #endif
00518 }
00519 
00520 void OptPmeXPencil::send_untrans() {
00521   int xBlocks = initdata.xBlocks;
00522   int block1 = initdata.grid.block1;
00523   int K1 = initdata.grid.K1;
00524 
00525   for ( int isend=0; isend<xBlocks; ++isend ) {
00526     int ib = send_order[isend];
00527     int nx = block1;
00528     if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00529     OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;    
00530     msg->sourceNode = thisIndex.y;
00531     msg->nx = ny;
00532     float *md = msg->qgrid;
00533     for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00534       float *d = data + i*ny*nz*2;
00535       for ( int j=0; j<ny; ++j, d += nz*2 ) {
00536         for ( int k=0; k<nz; ++k ) {
00537           *(md++) = d[2*k];
00538           *(md++) = d[2*k+1];
00539         }
00540       }
00541     }
00542     
00543     //printf ("%d, %d: Xpencil Sending untrans to %d, %d\n", thisIndex.y, thisIndex.z, ib, thisIndex.z);
00544     initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
00545   }
00546 }
00547 
00548 
00549 void OptPmeYPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00550 
00551   int block2 = initdata.grid.block2;
00552   int K2 = initdata.grid.K2;
00553   int jb = msg->sourceNode;
00554   int ny = msg->nx;
00555   const float *md = msg->qgrid;
00556   float *d = data;
00557   for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00558     for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00559       for ( int k=0; k<nz; ++k ) {
00560         d[2*(j*nz+k)] = *(md++);
00561         d[2*(j*nz+k)+1] = *(md++);
00562       }
00563     }
00564   }
00565 } 
00566 
00567 
00568 
00569 void OptPmeYPencil::backward_fft() {
00570 #ifdef NAMD_FFTW
00571 #ifdef NAMD_FFTW_3
00572   fftwf_execute(backward_plan);
00573 #else
00574   for ( int i=0; i<nx; ++i ) {
00575 #if CMK_BLUEGENEL
00576     CmiNetworkProgress();
00577 #endif
00578 
00579     fftw(backward_plan, nz,
00580          ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00581          nz, 1, (fftw_complex *) work, 1, 0);
00582   }
00583 #endif
00584 #endif
00585 }
00586 
00587 void OptPmeYPencil::send_untrans() {
00588   //printf ("%d, %d: In ypencil send_untrans called once \n", thisIndex.x, thisIndex.z);
00589 
00590   int yBlocks = initdata.yBlocks;
00591   int block2 = initdata.grid.block2;
00592   int K2 = initdata.grid.K2;
00593 
00594   for ( int isend=0; isend<yBlocks; ++isend ) {
00595     int jb = send_order[isend];
00596     //if ( ! needs_reply[jb] ) continue;
00597     int ny = block2;
00598     if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00599     OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00600     msg->sourceNode = thisIndex.z;
00601     msg->nx = nz;
00602     float *md = msg->qgrid;
00603     const float *d = data;
00604     for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00605      for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00606       for ( int k=0; k<nz; ++k ) {
00607         *(md++) = d[2*(j*nz+k)];
00608         *(md++) = d[2*(j*nz+k)+1];
00609       }
00610      }
00611     }
00612 
00613     //printf ("%d, %d: Sending untrans to %d, %d\n", thisIndex.z, thisIndex.x, thisIndex.x, jb);
00614     initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
00615   }
00616 }
00617 
00618 void OptPmeZPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00619 
00620   //printf ("%d, %d In recv untrans\n", thisIndex.x, thisIndex.y);
00621 
00622   int block3 = initdata.grid.block3;
00623   int dim3 = initdata.grid.dim3;
00624   int kb = msg->sourceNode;
00625   int nz = msg->nx;
00626   const float *md = msg->qgrid;
00627   float *d = data;
00628   for ( int i=0; i<nx; ++i ) {
00629     for ( int j=0; j<ny; ++j, d += dim3 ) {
00630       for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00631         d[2*k] = *(md++);
00632         d[2*k+1] = *(md++);
00633       }
00634     }
00635   }
00636 }
00637 
00638 
00639 void OptPmeZPencil::backward_fft() {
00640 #ifdef NAMD_FFTW
00641 #ifdef NAMD_FFTW_3
00642   fftwf_execute(backward_plan);
00643 #else
00644   rfftwnd_complex_to_real(backward_plan, nx*ny,
00645                           (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
00646 #endif
00647 #endif
00648   
00649 #if CMK_BLUEGENEL
00650   CmiNetworkProgress();
00651 #endif
00652 
00653 #ifdef FFTCHECK
00654   int dim3 = initdata.grid.dim3;
00655   int K1 = initdata.grid.K1;
00656   int K2 = initdata.grid.K2;
00657   int K3 = initdata.grid.K3;
00658   float scale = 1. / (1. * K1 * K2 * K3);
00659   float maxerr = 0.;
00660   float maxstd = 0.;
00661   int mi, mj, mk;  mi = mj = mk = -1;
00662   float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00663   const float *d = data;
00664   for ( int i=0; i<nx; ++i ) {
00665    for ( int j=0; j<ny; ++j, d += dim3 ) {
00666     for ( int k=0; k<K3; ++k ) {
00667       float std = 10. * (10. * (10. * std_base + i) + j) + k;
00668       float err = scale * d[k] - std;
00669       if ( fabsf(err) > fabsf(maxerr) ) {
00670         maxerr = err;
00671         maxstd = std;
00672         mi = i;  mj = j;  mk = k;
00673       }
00674     }
00675    }
00676   }
00677   CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
00678            thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
00679 #endif
00680 }
00681 
00682 void OptPmeZPencil::send_ungrid(OptPmeGridMsg *msg) {
00683   int pe = msg->sourceNode;
00684   msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
00685 
00686   int dim3 = initdata.grid.dim3;
00687   int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00688   int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00689   assert (xstart >= 0);
00690   assert (ystart >= 0);
00691   int xlen   = msg->xlen;
00692   int ylen   = msg->ylen;
00693   assert (xstart + xlen <= nx);
00694   assert (ystart + ylen <= ny);
00695 
00696   float *qmsg = msg->qgrid;
00697   float *d = data;
00698   int zstart = msg->zstart;
00699   int zlen = msg->zlen;
00700   int zmax = zstart + zlen - 1;
00701   
00702   int K3_1 = initdata.grid.K3 - 1; 
00703   int K3 = initdata.grid.K3;
00704   if (zstart < 0) {
00705     zstart += K3;
00706     zmax += K3;
00707   }
00708   
00709   int k = 0;
00710   for ( int i=xstart; i<xstart+xlen; ++i ) {
00711     for ( int j=ystart; j<ystart+ylen; ++j) {
00712       float *d = data + (i * ny + j) * dim3;
00713       for ( k=zstart; k<=zmax; ++k ) {
00714         int kz = k;
00715         //if (kz >= K3) kz -= K3;
00716         kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00717         *(qmsg++) = d[kz];
00718       }
00719     }
00720   }
00721   
00722   initdata.pmeProxy[pe].recvUngrid(msg);
00723 }
00724 
00725 
00729 
00730 void OptPmeZPencil::many_to_many_recv_grid () {
00731   int dim3 = initdata.grid.dim3;  
00732   memset(data, 0, sizeof(float) * nx*ny*dim3);
00733   int K3 = initdata.grid.K3;
00734   int K3_1 = initdata.grid.K3 - 1;
00735   int k = 0;
00736   for (int idx = 0; idx < grid_msgs.size(); idx++) {
00737     int xstart = m2m_recv_grid[idx].xstart; 
00738     int xlen   = m2m_recv_grid[idx].xlen; 
00739     int ystart = m2m_recv_grid[idx].ystart; 
00740     int ylen   = m2m_recv_grid[idx].ylen; 
00741     int zstart = m2m_recv_grid[idx].zstart; 
00742     int zlen   = m2m_recv_grid[idx].zlen;   
00743     
00744     float *qmsg = m2m_recv_grid[idx].data;
00745     for ( int i=xstart; i<xstart+xlen; ++i ) {
00746       for ( int j=ystart; j<ystart+ylen; ++j) {
00747         float *d = data + (i * ny + j) * dim3;  
00748 
00749 #pragma disjoint (*qmsg, *d)
00750 #pragma unroll (4)
00751         for ( k=zstart; k<zstart+zlen; ++k ) {
00752           int kz = k;
00753           //if (kz >= K3) kz -= K3;
00754           kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00755           //assert (kz >= 0);
00756           //assert (kz <  K3);
00757           d[kz] += *(qmsg++);
00758         }
00759       }
00760     } 
00761   }
00762 }
00763 
00764 void OptPmeZPencil::many_to_many_send_trans() {
00765   int zBlocks = initdata.zBlocks;
00766   int block3 = initdata.grid.block3;
00767   int dim3 = initdata.grid.dim3;
00768 
00769   float *md = many_to_many_data;
00770   if (single_pencil) {
00771     const float *d = data;
00772     for ( int kb=0; kb<zBlocks; ++kb ) {
00773       *(md++) = d[2*kb];
00774       *(md++) = d[2*kb+1];
00775     }
00776   }
00777   else {
00778     int nz = block3;
00779     for ( int kb=0; kb<zBlocks; ++kb ) {
00780       nz = block3;
00781       if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00782       
00783       const float *d = data;
00784       for ( int i=0; i<nx; ++i ) {
00785         for ( int j=0; j<ny; ++j, d += dim3 ) {
00786           for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00787             *(md++) = d[2*k];
00788             *(md++) = d[2*k+1];
00789           }
00790         }
00791       }
00792     }
00793   }
00794 
00795   CmiDirect_manytomany_start (handle, PHASE_YF);
00796 }
00797 
00798 void OptPmeYPencil::many_to_many_recv_trans () {  
00799   int block2 = initdata.grid.block2;
00800   int K2 = initdata.grid.K2;
00801   
00802   const float *md = many_to_many_data;
00803   if (single_pencil) {
00804     float *d = data;
00805     for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00806       d[2*jb]    = *(md++);
00807       d[2*jb +1] = *(md++);
00808     }
00809   }
00810   else {
00811     for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00812       int ny = many_to_many_nb[jb];  
00813       float *d = data;
00814       for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00815         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00816           for ( int k=0; k<nz; ++k ) {
00817             d[2*(j*nz+k)] = *(md++);
00818             d[2*(j*nz+k)+1] = *(md++);
00819           }
00820         }
00821       }
00822     }
00823   }
00824 }
00825 
00826 void OptPmeYPencil::many_to_many_send(int phase) {
00827   int yBlocks = initdata.yBlocks;
00828   int block2 = initdata.grid.block2;
00829   int K2 = initdata.grid.K2;
00830   
00831   float *md = many_to_many_data;
00832   int ny = block2;
00833   if (single_pencil) {
00834     const float *d = data;
00835     for ( int jb=0; jb<yBlocks; ++jb ) {
00836       *(md++) = d[2*jb];
00837       *(md++) = d[2*jb+1];
00838     }
00839   }
00840   else {
00841     for ( int jb=0; jb<yBlocks; ++jb ) {
00842       ny = block2;
00843       if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00844       
00845       const float *d = data;
00846       for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00847         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00848           for ( int k=0; k<nz; ++k ) {
00849             *(md++) = d[2*(j*nz+k)];
00850             *(md++) = d[2*(j*nz+k)+1];
00851           }
00852         }
00853       }
00854     }
00855   }
00856 
00857   CmiDirect_manytomany_start (handle, phase);  
00858 }
00859 
00860 void OptPmeXPencil::many_to_many_recv_trans () {
00861   
00862   int block1 = initdata.grid.block1;
00863   int K1 = initdata.grid.K1;
00864   
00865   const float *md = many_to_many_data;
00866   if (single_pencil) {
00867     for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00868       data[2*ib]   = *(md++);
00869       data[2*ib+1] = *(md++);
00870     }
00871   }
00872   else {
00873     for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00874       int nx = many_to_many_nb[ib];    
00875       for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00876         float *d = data + i*ny*nz*2;
00877         for ( int j=0; j<ny; ++j, d += nz*2 ) {
00878           for ( int k=0; k<nz; ++k ) {
00879             d[2*k] = *(md++);
00880             d[2*k+1] = *(md++);
00881           }
00882         }
00883       }
00884     }
00885   }
00886 }
00887 
00888 void OptPmeXPencil::many_to_many_send_untrans() {
00889   int xBlocks = initdata.xBlocks;
00890   int block1 = initdata.grid.block1;
00891   int K1 = initdata.grid.K1;
00892   
00893   int     nx = block1;
00894   float * md = many_to_many_data;
00895   if (single_pencil) {
00896     float *d = data;
00897     for ( int ib=0; ib<xBlocks; ++ib ) {
00898       *(md++) = d[2*ib];
00899       *(md++) = d[2*ib+1];
00900     }
00901   }
00902   else {
00903     for ( int ib=0; ib<xBlocks; ++ib ) {
00904       nx = block1;
00905       if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00906       
00907       for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00908         float *d = data + i*ny*nz*2;
00909         for ( int j=0; j<ny; ++j, d += nz*2 ) {
00910           for ( int k=0; k<nz; ++k ) {
00911             *(md++) = d[2*k];
00912             *(md++) = d[2*k+1];
00913           }
00914         }
00915       }
00916     }
00917   }
00918   CmiDirect_manytomany_start (handle, PHASE_YB);
00919 }
00920 
00921 void OptPmeYPencil::many_to_many_recv_untrans () {  
00922   int block2 = initdata.grid.block2;
00923   int K2 = initdata.grid.K2;
00924   
00925   const float *md = many_to_many_data;
00926   if (single_pencil) {
00927     float *d = data;
00928     for (int jb = 0; jb < initdata.yBlocks; jb++ ) {        
00929       d[2*jb]   = *(md++);
00930       d[2*jb+1] = *(md++);
00931     }
00932   }
00933   else {
00934     for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00935       int ny = many_to_many_nb[jb];  
00936       float *d = data;
00937       for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00938         for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00939           for ( int k=0; k<nz; ++k ) {
00940             d[2*(j*nz+k)] = *(md++);
00941             d[2*(j*nz+k)+1] = *(md++);
00942           }
00943         }
00944       }
00945     }
00946   }
00947 }
00948 
00949 void OptPmeZPencil::many_to_many_recv_untrans() {  
00950   //printf ("%d, %d In recv untrans\n", thisIndex.x, thisIndex.y);                                                                             
00951   int block3 = initdata.grid.block3;
00952   int dim3 = initdata.grid.dim3;
00953   
00954   const float *md = many_to_many_data;
00955   if (single_pencil) {
00956     float *d = data;
00957     for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00958       d[2*kb]   = *(md++);
00959       d[2*kb+1] = *(md++);
00960     }
00961   }
00962   else {
00963     for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00964       int nz = many_to_many_nb[kb];
00965       float *d = data;
00966       for ( int i=0; i<nx; ++i ) {
00967         for ( int j=0; j<ny; ++j, d += dim3 ) {
00968           for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00969             d[2*k] = *(md++);
00970             d[2*k+1] = *(md++);
00971           }
00972         }
00973       }
00974     }
00975   }
00976 }
00977 
00978 void OptPmeZPencil::many_to_many_send_ungrid () {
00979   int dim3 = initdata.grid.dim3;
00980   int K3 = initdata.grid.K3;
00981   int K3_1 = initdata.grid.K3 - 1;
00982   int k = 0;
00983   for (int idx = 0; idx < grid_msgs.size(); idx++) {
00984     int xstart = m2m_recv_grid[idx].xstart; 
00985     int xlen   = m2m_recv_grid[idx].xlen; 
00986     int ystart = m2m_recv_grid[idx].ystart; 
00987     int ylen   = m2m_recv_grid[idx].ylen; 
00988     int zstart = m2m_recv_grid[idx].zstart; 
00989     int zlen   = m2m_recv_grid[idx].zlen;   
00990     
00991     float *qmsg = m2m_recv_grid[idx].data;
00992     for ( int i=xstart; i<xstart+xlen; ++i ) {
00993       for ( int j=ystart; j<ystart+ylen; ++j) {
00994         float *d = data + (i * ny + j) * dim3;  
00995 
00996 #pragma disjoint (*d, *qmsg)
00997 #pragma unroll (4)
00998         for ( k=zstart; k<zstart+zlen; ++k ) {
00999           int kz = k;
01000           //if (kz >= K3) kz -= K3;
01001           kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
01002           //assert (kz >= 0);
01003           //assert (kz <  K3);
01004           *(qmsg++) = d[kz];
01005         }
01006       }
01007     } 
01008   }
01009   
01010   CmiDirect_manytomany_start (handle, PHASE_UG);
01011 }
01012 
01013 
01014 
01015 void  OptPmeZPencil::initialize_manytomany () {  
01016   int idx = 0;
01017   int totalcount = 0;
01018   for (idx = 0; idx < grid_msgs.size(); idx ++) 
01019     totalcount += m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
01020   many_to_many_gr_data = new float [totalcount];
01021   
01022   CkArrayIndex3D aidx (thisIndex.x, thisIndex.y, thisIndex.z);
01023   cbw_recvgrid.cb = CkCallback (CkIndex_OptPmeZPencil::many_to_manyRecvGrid(NULL), aidx, thisProxy.ckGetArrayID());
01024   cbw_recvgrid.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01025   cbw_recvgrid.array = this;
01026   PatchMap *patchMap = PatchMap::Object();
01027   CmiDirect_manytomany_initialize_recvbase (handle, PHASE_GR, 
01028                                             call_ck_cb_recv_grid, 
01029                                             &cbw_recvgrid, 
01030                                             (char *)many_to_many_gr_data, 
01031                                             patchMap->numPatches(), -1);
01032   
01033   CmiDirect_manytomany_initialize_sendbase (handle, PHASE_UG, NULL, NULL, (char *)many_to_many_gr_data, 
01034                                             grid_msgs.size(), thisIndex.x *initdata.yBlocks + thisIndex.y);
01035   
01036   int offset = 0;
01037   for (idx = 0; idx < grid_msgs.size(); idx ++) {
01038     m2m_recv_grid[idx].data = (float *) ((char *)many_to_many_gr_data + offset);
01039     int fcount = m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
01040     CmiDirect_manytomany_initialize_recv (handle, PHASE_GR, m2m_recv_grid[idx].patchID, 
01041                                           offset, fcount *sizeof(float), patchMap->node(m2m_recv_grid[idx].patchID));
01042     CmiDirect_manytomany_initialize_send (handle, PHASE_UG, idx, offset, fcount *sizeof(float), 
01043                                           patchMap->node(m2m_recv_grid[idx].patchID));
01044     offset += fcount * sizeof(float);
01045   }        
01046 
01047   int zBlocks = initdata.zBlocks;
01048   int block3 = initdata.grid.block3;
01049   int dim3 = initdata.grid.dim3;
01050 
01051   //Initialize send trans
01052   CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YF, NULL, NULL, (char *)many_to_many_data, zBlocks, thisIndex.y);
01053 
01054   //Initialize recv untrans
01055   cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeZPencil::many_to_manyRecvUntrans(NULL), aidx, thisProxy.ckGetArrayID());
01056   cbw_recvuntrans.msg    = new (PRIORITY_SIZE) OptPmeDummyMsg;
01057   cbw_recvuntrans.array  = this;
01058   CmiDirect_manytomany_initialize_recvbase (handle, PHASE_ZB, 
01059                                             call_ck_cb_recv_untrans_z, 
01060                                             &cbw_recvuntrans, 
01061                                             (char *)many_to_many_data, 
01062                                             initdata.zBlocks, -1);
01063   single_pencil = false;
01064   if (nx == 1 && ny == 1 && zBlocks <= dim3/2 && block3==1)
01065     single_pencil = true;
01066   
01067   for ( int kb=0; kb<zBlocks; ++kb ) {
01068     int nz = block3;
01069     if ( (kb+1)*block3 > dim3/2 ) {
01070       single_pencil = false;
01071       nz = dim3/2 - kb*block3;
01072     }
01073     
01074     //Initialize send trans
01075     CkArrayIndex3D index (thisIndex.x,0,kb);
01076     CProxy_OptPmePencilMapY yproxy (global_map_y);
01077     int pe = yproxy.ckLocalBranch()->procNum(0, index);
01078     CmiDirect_manytomany_initialize_send (handle, PHASE_YF, kb, kb*2*nx*ny*block3*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01079     
01080     //Initialize recv untrans
01081     CmiDirect_manytomany_initialize_recv (handle, PHASE_ZB, kb, kb*nx*ny*block3*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01082     many_to_many_nb [kb] = nz;
01083   }
01084 
01085 }
01086 
01087 void  OptPmeYPencil::initialize_manytomany () {
01088   int yBlocks = initdata.yBlocks;
01089   int block2 = initdata.grid.block2;
01090   int K2 = initdata.grid.K2;
01091 
01092   //send trans
01093   CmiDirect_manytomany_initialize_sendbase (handle, PHASE_XF, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.x);
01094   //send untrans
01095   CmiDirect_manytomany_initialize_sendbase (handle, PHASE_ZB, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.z);
01096 
01097   //recv trans
01098   CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
01099   cbw_recvtrans.cb = CkCallback (CkIndex_OptPmeYPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
01100   cbw_recvtrans.msg    = new (PRIORITY_SIZE) OptPmeDummyMsg;
01101   cbw_recvtrans.array  = this;
01102   CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YF, 
01103                                             call_ck_cb_recv_trans_y, 
01104                                             &cbw_recvtrans, 
01105                                             (char *)many_to_many_data, 
01106                                             initdata.yBlocks, -1);
01107 
01108   //recv untrans
01109   cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeYPencil::many_to_manyRecvUntrans(NULL), idx, thisProxy.ckGetArrayID());
01110   cbw_recvuntrans.msg    = new (PRIORITY_SIZE) OptPmeDummyMsg;
01111   cbw_recvuntrans.array  = this;
01112   CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YB, 
01113                                             call_ck_cb_recv_untrans_y, 
01114                                             &cbw_recvuntrans, 
01115                                             (char *)many_to_many_data, 
01116                                             initdata.yBlocks, -1);
01117 
01118   single_pencil = false;
01119   if (nz == 1 && nx == 1 && yBlocks <= K2 && block2==1)
01120     single_pencil = true;
01121   
01122   for ( int jb=0; jb<yBlocks; ++jb ) {
01123     int ny = block2;
01124     if ( (jb+1)*block2 > K2 ) {
01125       single_pencil = false;
01126       ny = K2 - jb*block2;
01127     }
01128 
01129     //send untrans
01130     CkArrayIndex3D index (thisIndex.x,jb,0);
01131     CProxy_OptPmePencilMapZ zproxy (global_map_z);
01132     int pe = zproxy.ckLocalBranch()->procNum(0, index);
01133     CmiDirect_manytomany_initialize_send (handle, PHASE_ZB, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01134 
01135     //recv trans
01136     CmiDirect_manytomany_initialize_recv (handle, PHASE_YF, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01137     
01138     //send trans
01139     index = CkArrayIndex3D (0,jb,thisIndex.z);
01140     CProxy_OptPmePencilMapX xproxy (global_map_x);
01141     pe = xproxy.ckLocalBranch()->procNum(0, index);
01142     CmiDirect_manytomany_initialize_send (handle, PHASE_XF, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01143     
01144     //Recv untrans
01145     CmiDirect_manytomany_initialize_recv (handle, PHASE_YB, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01146 
01147     many_to_many_nb [jb] = ny;
01148   }
01149 }
01150 
01151 void  OptPmeXPencil::initialize_manytomany () {  
01152   int xBlocks = initdata.xBlocks;
01153   int block1 = initdata.grid.block1;
01154   int K1 = initdata.grid.K1;
01155   
01156   CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YB, NULL, NULL, (char *)many_to_many_data, xBlocks, thisIndex.y);
01157   CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
01158   cbw_recvtrans.cb = CkCallback(CkIndex_OptPmeXPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
01159   cbw_recvtrans.msg    = new (PRIORITY_SIZE) OptPmeDummyMsg;
01160   cbw_recvtrans.array  = this;
01161   CmiDirect_manytomany_initialize_recvbase (handle, PHASE_XF, 
01162                                             call_ck_cb_recv_trans_x, 
01163                                             &cbw_recvtrans,  
01164                                             (char *)many_to_many_data, 
01165                                             initdata.xBlocks, -1);
01166 
01167   single_pencil = false;
01168   if (ny == 1 && nz == 1 && xBlocks <= K1 && block1==1)
01169     single_pencil = true;
01170 
01171   for ( int ib=0; ib<xBlocks; ++ib ) {
01172     int nx = block1;
01173     if ( (ib+1)*block1 > K1 ) {
01174       single_pencil = false;
01175       nx = K1 - ib*block1;
01176     }
01177     
01178     CkArrayIndex3D index (ib,0,thisIndex.z);
01179     CProxy_OptPmePencilMapY yproxy (global_map_y);
01180     int pe = yproxy.ckLocalBranch()->procNum(0, index);
01181     CmiDirect_manytomany_initialize_send (handle, PHASE_YB, ib, 2*block1*ny*nz*ib*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01182     
01183     CmiDirect_manytomany_initialize_recv (handle, PHASE_XF, ib, ib*block1*ny*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01184     many_to_many_nb [ib] = nx;
01185   }
01186 }
01187 
01188 
01192 
01193 
01194 #include "PmeFFTLib.def.h"

Generated on Wed Nov 22 01:17:14 2017 for NAMD by  doxygen 1.4.7