00001
00002 #include "fftlib.h"
00003
00004 #include <assert.h>
00005
00006 void call_ck_cb (void *arg) {
00007 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00008 cw->cb.send (cw->msg);
00009 }
00010
00011 void call_ck_cb_recv_grid (void *arg) {
00012 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00013 OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00014 OptPmeZPencil *zp = (OptPmeZPencil *)cw->array;
00015 zp->many_to_manyRecvGrid(msg);
00016 }
00017
00018 void call_ck_cb_recv_trans_y (void *arg) {
00019 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00020 OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00021 OptPmeYPencil *yp = (OptPmeYPencil *)cw->array;
00022 yp->many_to_manyRecvTrans(msg);
00023 }
00024
00025 void call_ck_cb_recv_trans_x (void *arg) {
00026 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00027 OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00028 OptPmeXPencil *xp = (OptPmeXPencil *)cw->array;
00029 xp->many_to_manyRecvTrans(msg);
00030 }
00031
00032 void call_ck_cb_recv_untrans_y (void *arg) {
00033 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00034 OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00035 OptPmeYPencil *yp = (OptPmeYPencil *)cw->array;
00036 yp->many_to_manyRecvUntrans(msg);
00037 }
00038
00039 void call_ck_cb_recv_untrans_z (void *arg) {
00040 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00041 OptPmeDummyMsg *msg = (OptPmeDummyMsg *)cw->msg;
00042 OptPmeZPencil *zp = (OptPmeZPencil *)cw->array;
00043 zp->many_to_manyRecvUntrans(msg);
00044 }
00045
00046 void OptPmeZPencil::fft_init() {
00047
00048
00049
00050 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00051 Node *node = nd.ckLocalBranch();
00052 SimParameters *simParams = node->simParameters;
00053
00054 int K1 = initdata.grid.K1;
00055 int K2 = initdata.grid.K2;
00056 int K3 = initdata.grid.K3;
00057 int dim3 = initdata.grid.dim3;
00058 int block1 = initdata.grid.block1;
00059 int block2 = initdata.grid.block2;
00060
00061 nx = block1;
00062 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00063 ny = block2;
00064 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00065
00066 data = new float[nx*ny*dim3];
00067 many_to_many_data = new float[nx*ny*dim3];
00068 many_to_many_nb = new int [initdata.zBlocks];
00069 work = new float[dim3];
00070
00071 memset(data, 0, sizeof(float) * nx*ny*dim3);
00072 memset(many_to_many_data, 0, sizeof(float) * nx*ny*dim3);
00073
00074 order_init(initdata.zBlocks);
00075
00076 #ifdef NAMD_FFTW
00077 CmiLock(fftw_plan_lock);
00078 #ifdef NAMD_FFTW_3
00079
00080 int numLines=nx*ny;
00081 int planLineSizes[1];
00082 planLineSizes[0]=K3;
00083 CkAbort("what are we doing in here?");
00084 forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, numLines,
00085 (float *) data, NULL, 1,
00086 initdata.grid.dim3,
00087 (fftwf_complex *) data, NULL, 1, 0,
00088 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00089 : FFTW_MEASURE ));
00090 backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, numLines,
00091 (fftwf_complex *) data, NULL, 1,
00092 initdata.grid.dim3/2,
00093 (float *) data, NULL, 1, 0,
00094 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00095 : FFTW_MEASURE ));
00096 #else
00097
00098 forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
00099 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00100 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00101 backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
00102 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00103 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00104 #endif
00105 CmiUnlock(fftw_plan_lock);
00106 #else
00107 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00108 #endif
00109
00110 handle = NULL;
00111 #if CHARM_VERSION > 60000
00112 handle = CmiDirect_manytomany_allocate_handle();
00113 #endif
00114 }
00115
00116 void OptPmeYPencil::fft_init() {
00117 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00118 Node *node = nd.ckLocalBranch();
00119 SimParameters *simParams = node->simParameters;
00120
00121
00122
00123 int K1 = initdata.grid.K1;
00124 int K2 = initdata.grid.K2;
00125 int dim2 = initdata.grid.dim2;
00126 int dim3 = initdata.grid.dim3;
00127 int block1 = initdata.grid.block1;
00128 int block3 = initdata.grid.block3;
00129
00130 nx = block1;
00131 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00132 nz = block3;
00133 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00134
00135 data = new float[nx*dim2*nz*2];
00136 many_to_many_data = new float[nx*dim2*nz*2];
00137 many_to_many_nb = new int [initdata.yBlocks];
00138 work = new float[2*K2];
00139
00140 memset(many_to_many_data, 0, sizeof(float) * nx*dim2*nz*2);
00141
00142 order_init(initdata.yBlocks);
00143
00144 #ifdef NAMD_FFTW
00145 CmiLock(fftw_plan_lock);
00146 #ifdef NAMD_FFTW_3
00147
00148 int numLines=nz;
00149 int planLineSizes[2];
00150 planLineSizes[0]=initdata.grid.K2;
00151 planLineSizes[1]=nz;
00152 forward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines,
00153 (fftwf_complex *) data, NULL, nz, 1,
00154 (fftwf_complex *) data, NULL, 1, 0,
00155 FFTW_FORWARD,
00156 ( simParams->FFTWEstimate
00157 ? FFTW_ESTIMATE : FFTW_MEASURE ));
00158 backward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines,
00159 (fftwf_complex *) data, NULL, nz, 1,
00160 (fftwf_complex *) data, NULL, 1, 0,
00161 FFTW_FORWARD,
00162 ( simParams->FFTWEstimate
00163 ? FFTW_ESTIMATE : FFTW_MEASURE ));
00164 #else
00165
00166 forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
00167 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00168 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00169 nz, (fftw_complex *) work, 1);
00170 backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
00171 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00172 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00173 nz, (fftw_complex *) work, 1);
00174 #endif
00175 CmiUnlock(fftw_plan_lock);
00176 #else
00177 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00178 #endif
00179
00180 handle = NULL;
00181 #if CHARM_VERSION > 60000
00182 handle = CmiDirect_manytomany_allocate_handle();
00183 #endif
00184 initialize_manytomany();
00185 }
00186
00187
00188 void OptPmeXPencil::fft_init() {
00189 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00190 Node *node = nd.ckLocalBranch();
00191 SimParameters *simParams = node->simParameters;
00192
00193
00194
00195 lattice = simParams->lattice;
00196
00197 int K1 = initdata.grid.K1;
00198 int K2 = initdata.grid.K2;
00199 int dim3 = initdata.grid.dim3;
00200 int block2 = initdata.grid.block2;
00201 int block3 = initdata.grid.block3;
00202
00203 ny = block2;
00204 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00205 nz = block3;
00206 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00207
00208 data = new float[K1*block2*block3*2];
00209 many_to_many_data = new float[K1*block2*block3*2];
00210 many_to_many_nb = new int [initdata.xBlocks];
00211 work = new float[2*K1];
00212
00213 memset(many_to_many_data, 0, sizeof(float) * K1*block2*block3*2);
00214
00215 order_init(initdata.xBlocks);
00216
00217 #ifdef NAMD_FFTW
00218 CmiLock(fftw_plan_lock);
00219 #ifdef NAMD_FFTW_3
00220
00221 int numLines=ny*nz;
00222 int planLineSizes[1];
00223 planLineSizes[0]=K1;
00224 forward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00225 (fftwf_complex *) data, NULL, K1, 1,
00226 (fftwf_complex *) data, NULL, 1, 0,
00227 FFTW_FORWARD,
00228 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00229 : FFTW_MEASURE ));
00230 backward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00231 (fftwf_complex *) data, NULL, K1, 1,
00232 (fftwf_complex *) data, NULL, 1, 0,
00233 FFTW_BACKWARD,
00234 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00235 : FFTW_MEASURE ));
00236 #else
00237
00238 forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
00239 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00240 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00241 ny*nz, (fftw_complex *) work, 1);
00242 backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
00243 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00244 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00245 ny*nz, (fftw_complex *) work, 1);
00246
00247 CmiUnlock(fftw_plan_lock);
00248 #endif
00249 #else
00250 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00251 #endif
00252
00253 myKSpace = new PmeKSpace(initdata.grid,
00254 thisIndex.y*block2, thisIndex.y*block2 + ny,
00255 thisIndex.z*block3, thisIndex.z*block3 + nz);
00256
00257 handle = NULL;
00258 #if CHARM_VERSION > 60000
00259 handle = CmiDirect_manytomany_allocate_handle();
00260 #endif
00261 initialize_manytomany();
00262
00263 constant_pressure = initdata.constant_pressure;
00264
00265 reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00266 if (simParams->accelMDOn) {
00267 amd_reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_AMD);
00268 } else {
00269 amd_reduction = NULL;
00270 }
00271 }
00272
00273
00274
00275
00276 void OptPmeZPencil::recv_grid(const OptPmeGridMsg *msg) {
00277
00278 int dim3 = initdata.grid.dim3;
00279 if ( imsg == 0 ) {
00280 memset(data, 0, sizeof(float) * nx*ny*dim3);
00281 }
00282
00283 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00284 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00285 assert (xstart >= 0);
00286 assert (ystart >= 0);
00287 int xlen = msg->xlen;
00288 int ylen = msg->ylen;
00289 assert (xstart + xlen <= nx);
00290 assert (ystart + ylen <= ny);
00291
00292 float *qmsg = msg->qgrid;
00293 float *d = data;
00294 int zlen = msg->zlen;
00295 int zstart = msg->zstart;
00296 int zmax = zstart + zlen - 1;
00297 int k = 0;
00298
00299 int K3 = initdata.grid.K3;
00300 int K3_1 = initdata.grid.K3 - 1;
00301 while (zstart < 0) {
00302 zstart += K3;
00303 zmax += K3;
00304 }
00305
00306 for ( int i=xstart; i<xstart+xlen; ++i ) {
00307 for ( int j=ystart; j<ystart+ylen; ++j) {
00308 float *d = data + (i * ny + j) * dim3;
00309 for ( k=zstart; k<=zmax; ++k ) {
00310 int kz = k;
00311
00312 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00313
00314
00315 d[kz] += *(qmsg++);
00316 }
00317 }
00318 }
00319
00320 if (_iter == MANY_TO_MANY_SETUP) {
00321 if (imsg == 0)
00322 m2m_recv_grid = new PatchGridElem [grid_msgs.size()];
00323
00324 m2m_recv_grid[imsg].xstart = xstart;
00325 m2m_recv_grid[imsg].xlen = xlen;
00326 m2m_recv_grid[imsg].ystart = ystart;
00327 m2m_recv_grid[imsg].ylen = ylen;
00328 m2m_recv_grid[imsg].zstart = zstart;
00329 m2m_recv_grid[imsg].zlen = zlen;
00330 m2m_recv_grid[imsg].patchID = msg->patchID;
00331
00332 if ( imsg == grid_msgs.size() - 1)
00333 initialize_manytomany ();
00334 }
00335 }
00336
00337
00338 void OptPmeZPencil::forward_fft() {
00339 #ifdef FFTCHECK
00340 int dim3 = initdata.grid.dim3;
00341 int K3 = initdata.grid.K3;
00342 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00343 float *d = data;
00344 for ( int i=0; i<nx; ++i ) {
00345 for ( int j=0; j<ny; ++j, d += dim3 ) {
00346 for ( int k=0; k<dim3; ++k ) {
00347 d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
00348 }
00349 }
00350 }
00351 #endif
00352 #ifdef NAMD_FFTW
00353 #ifdef NAMD_FFTW_3
00354 fftwf_execute(forward_plan);
00355 #else
00356 rfftwnd_real_to_complex(forward_plan, nx*ny,
00357 data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
00358 #endif
00359 #endif
00360 }
00361
00362 void OptPmeZPencil::send_trans() {
00363 int zBlocks = initdata.zBlocks;
00364 int block3 = initdata.grid.block3;
00365 int dim3 = initdata.grid.dim3;
00366
00367
00368 for ( int isend=0; isend<zBlocks; ++isend ) {
00369 int kb = send_order[isend];
00370 int nz = block3;
00371 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00372
00373 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00374 msg->sourceNode = thisIndex.y;
00375 msg->nx = ny;
00376 float *md = msg->qgrid;
00377 const float *d = data;
00378 for ( int i=0; i<nx; ++i ) {
00379 for ( int j=0; j<ny; ++j, d += dim3 ) {
00380 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00381 *(md++) = d[2*k];
00382 *(md++) = d[2*k+1];
00383 }
00384 }
00385 }
00386
00387 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
00388 }
00389 }
00390
00391
00392 void OptPmeYPencil::recv_trans(const OptPmeFFTMsg *msg) {
00393
00394 int block2 = initdata.grid.block2;
00395 int K2 = initdata.grid.K2;
00396 int jb = msg->sourceNode;
00397 int ny = msg->nx;
00398 const float *md = msg->qgrid;
00399 float *d = data;
00400 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00401 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00402 for ( int k=0; k<nz; ++k ) {
00403 d[2*(j*nz+k)] = *(md++);
00404 d[2*(j*nz+k)+1] = *(md++);
00405 }
00406 }
00407 }
00408 }
00409
00410 void OptPmeYPencil::forward_fft() {
00411 #ifdef NAMD_FFTW
00412 #ifdef NAMD_FFTW_3
00413 fftwf_execute(forward_plan);
00414 #else
00415 for ( int i=0; i<nx; ++i ) {
00416 fftw(forward_plan, nz,
00417 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00418 nz, 1, (fftw_complex *) work, 1, 0);
00419 }
00420 #endif
00421 #endif
00422 }
00423
00424 void OptPmeYPencil::send_trans() {
00425 int yBlocks = initdata.yBlocks;
00426 int block2 = initdata.grid.block2;
00427 int K2 = initdata.grid.K2;
00428
00429 for ( int isend=0; isend<yBlocks; ++isend ) {
00430 int jb = send_order[isend];
00431 int ny = block2;
00432 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00433 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00434 msg->sourceNode = thisIndex.x;
00435 msg->nx = nx;
00436 float *md = msg->qgrid;
00437 const float *d = data;
00438 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00439 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00440 for ( int k=0; k<nz; ++k ) {
00441 *(md++) = d[2*(j*nz+k)];
00442 *(md++) = d[2*(j*nz+k)+1];
00443 }
00444 }
00445 }
00446 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
00447 thisIndex.x, jb, thisIndex.z);
00448
00449
00450 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
00451 }
00452 }
00453
00454
00455
00456 void OptPmeXPencil::recv_trans(const OptPmeFFTMsg *msg) {
00457
00458 int block1 = initdata.grid.block1;
00459 int K1 = initdata.grid.K1;
00460 int ib = msg->sourceNode;
00461 int nx = msg->nx;
00462 const float *md = msg->qgrid;
00463 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00464 float *d = data + i*ny*nz*2;
00465 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00466 for ( int k=0; k<nz; ++k ) {
00467 d[2*k] = *(md++);
00468 d[2*k+1] = *(md++);
00469 }
00470 }
00471 }
00472 }
00473
00474
00475
00476 void OptPmeXPencil::forward_fft() {
00477 #ifdef NAMD_FFTW
00478 #ifdef NAMD_FFTW_3
00479 fftwf_execute(forward_plan);
00480 #else
00481
00482 fftw(forward_plan, ny*nz,
00483 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00484 #endif
00485 #endif
00486 }
00487
00488 void OptPmeXPencil::pme_kspace() {
00489 evir = 0.;
00490
00491 #ifdef FFTCHECK
00492 return;
00493 #endif
00494
00495 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
00496 evir[0] = myKSpace->compute_energy(data,
00497 lattice, ewaldcof, &(evir[1]));
00498
00499
00500 }
00501
00502 void OptPmeXPencil::submit_evir() {
00503 double * cdata = (double *) evir.begin();
00504 reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += cdata[0];
00505 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += cdata[1];
00506 reduction->item(REDUCTION_VIRIAL_SLOW_XY) += cdata[2];
00507 reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += cdata[3];
00508 reduction->item(REDUCTION_VIRIAL_SLOW_YX) += cdata[2];
00509 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += cdata[4];
00510 reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += cdata[5];
00511 reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += cdata[3];
00512 reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += cdata[5];
00513 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += cdata[6];
00514
00515 if (amd_reduction) {
00516 amd_reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += cdata[0];
00517 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XX) += cdata[1];
00518 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XY) += cdata[2];
00519 amd_reduction->item(REDUCTION_VIRIAL_SLOW_XZ) += cdata[3];
00520 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YX) += cdata[2];
00521 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YY) += cdata[4];
00522 amd_reduction->item(REDUCTION_VIRIAL_SLOW_YZ) += cdata[5];
00523 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZX) += cdata[3];
00524 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZY) += cdata[5];
00525 amd_reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += cdata[6];
00526 }
00527
00528 SimParameters *simParams = Node::Object()->simParameters;
00529 int fef = simParams->fullElectFrequency;
00530 for (int i = 0; i < fef; i++) {
00531 reduction->submit();
00532 if (amd_reduction) amd_reduction->submit();
00533 }
00534 }
00535
00536 void OptPmeXPencil::backward_fft() {
00537 #ifdef NAMD_FFTW
00538 #ifdef NAMD_FFTW_3
00539 fftwf_execute(backward_plan);
00540 #else
00541
00542 fftw(backward_plan, ny*nz,
00543 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00544 #endif
00545 #endif
00546 }
00547
00548 void OptPmeXPencil::send_untrans() {
00549 int xBlocks = initdata.xBlocks;
00550 int block1 = initdata.grid.block1;
00551 int K1 = initdata.grid.K1;
00552
00553 for ( int isend=0; isend<xBlocks; ++isend ) {
00554 int ib = send_order[isend];
00555 int nx = block1;
00556 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00557 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00558 msg->sourceNode = thisIndex.y;
00559 msg->nx = ny;
00560 float *md = msg->qgrid;
00561 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00562 float *d = data + i*ny*nz*2;
00563 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00564 for ( int k=0; k<nz; ++k ) {
00565 *(md++) = d[2*k];
00566 *(md++) = d[2*k+1];
00567 }
00568 }
00569 }
00570
00571
00572 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
00573 }
00574 }
00575
00576
00577 void OptPmeYPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00578
00579 int block2 = initdata.grid.block2;
00580 int K2 = initdata.grid.K2;
00581 int jb = msg->sourceNode;
00582 int ny = msg->nx;
00583 const float *md = msg->qgrid;
00584 float *d = data;
00585 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00586 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00587 for ( int k=0; k<nz; ++k ) {
00588 d[2*(j*nz+k)] = *(md++);
00589 d[2*(j*nz+k)+1] = *(md++);
00590 }
00591 }
00592 }
00593 }
00594
00595
00596
00597 void OptPmeYPencil::backward_fft() {
00598 #ifdef NAMD_FFTW
00599 #ifdef NAMD_FFTW_3
00600 fftwf_execute(backward_plan);
00601 #else
00602 for ( int i=0; i<nx; ++i ) {
00603 #if CMK_BLUEGENEL
00604 CmiNetworkProgress();
00605 #endif
00606
00607 fftw(backward_plan, nz,
00608 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00609 nz, 1, (fftw_complex *) work, 1, 0);
00610 }
00611 #endif
00612 #endif
00613 }
00614
00615 void OptPmeYPencil::send_untrans() {
00616
00617
00618 int yBlocks = initdata.yBlocks;
00619 int block2 = initdata.grid.block2;
00620 int K2 = initdata.grid.K2;
00621
00622 for ( int isend=0; isend<yBlocks; ++isend ) {
00623 int jb = send_order[isend];
00624
00625 int ny = block2;
00626 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00627 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00628 msg->sourceNode = thisIndex.z;
00629 msg->nx = nz;
00630 float *md = msg->qgrid;
00631 const float *d = data;
00632 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00633 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00634 for ( int k=0; k<nz; ++k ) {
00635 *(md++) = d[2*(j*nz+k)];
00636 *(md++) = d[2*(j*nz+k)+1];
00637 }
00638 }
00639 }
00640
00641
00642 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
00643 }
00644 }
00645
00646 void OptPmeZPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00647
00648
00649
00650 int block3 = initdata.grid.block3;
00651 int dim3 = initdata.grid.dim3;
00652 int kb = msg->sourceNode;
00653 int nz = msg->nx;
00654 const float *md = msg->qgrid;
00655 float *d = data;
00656 for ( int i=0; i<nx; ++i ) {
00657 for ( int j=0; j<ny; ++j, d += dim3 ) {
00658 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00659 d[2*k] = *(md++);
00660 d[2*k+1] = *(md++);
00661 }
00662 }
00663 }
00664 }
00665
00666
00667 void OptPmeZPencil::backward_fft() {
00668 #ifdef NAMD_FFTW
00669 #ifdef NAMD_FFTW_3
00670 fftwf_execute(backward_plan);
00671 #else
00672 rfftwnd_complex_to_real(backward_plan, nx*ny,
00673 (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
00674 #endif
00675 #endif
00676
00677 #if CMK_BLUEGENEL
00678 CmiNetworkProgress();
00679 #endif
00680
00681 #ifdef FFTCHECK
00682 int dim3 = initdata.grid.dim3;
00683 int K1 = initdata.grid.K1;
00684 int K2 = initdata.grid.K2;
00685 int K3 = initdata.grid.K3;
00686 float scale = 1. / (1. * K1 * K2 * K3);
00687 float maxerr = 0.;
00688 float maxstd = 0.;
00689 int mi, mj, mk; mi = mj = mk = -1;
00690 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00691 const float *d = data;
00692 for ( int i=0; i<nx; ++i ) {
00693 for ( int j=0; j<ny; ++j, d += dim3 ) {
00694 for ( int k=0; k<K3; ++k ) {
00695 float std = 10. * (10. * (10. * std_base + i) + j) + k;
00696 float err = scale * d[k] - std;
00697 if ( fabsf(err) > fabsf(maxerr) ) {
00698 maxerr = err;
00699 maxstd = std;
00700 mi = i; mj = j; mk = k;
00701 }
00702 }
00703 }
00704 }
00705 CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
00706 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
00707 #endif
00708 }
00709
00710 void OptPmeZPencil::send_ungrid(OptPmeGridMsg *msg) {
00711 int pe = msg->sourceNode;
00712 msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
00713
00714 int dim3 = initdata.grid.dim3;
00715 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00716 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00717 assert (xstart >= 0);
00718 assert (ystart >= 0);
00719 int xlen = msg->xlen;
00720 int ylen = msg->ylen;
00721 assert (xstart + xlen <= nx);
00722 assert (ystart + ylen <= ny);
00723
00724 float *qmsg = msg->qgrid;
00725 float *d = data;
00726 int zstart = msg->zstart;
00727 int zlen = msg->zlen;
00728 int zmax = zstart + zlen - 1;
00729
00730 int K3_1 = initdata.grid.K3 - 1;
00731 int K3 = initdata.grid.K3;
00732 if (zstart < 0) {
00733 zstart += K3;
00734 zmax += K3;
00735 }
00736
00737 int k = 0;
00738 for ( int i=xstart; i<xstart+xlen; ++i ) {
00739 for ( int j=ystart; j<ystart+ylen; ++j) {
00740 float *d = data + (i * ny + j) * dim3;
00741 for ( k=zstart; k<=zmax; ++k ) {
00742 int kz = k;
00743
00744 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00745 *(qmsg++) = d[kz];
00746 }
00747 }
00748 }
00749
00750 initdata.pmeProxy[pe].recvUngrid(msg);
00751 }
00752
00753
00757
00758 void OptPmeZPencil::many_to_many_recv_grid () {
00759 int dim3 = initdata.grid.dim3;
00760 memset(data, 0, sizeof(float) * nx*ny*dim3);
00761 int K3 = initdata.grid.K3;
00762 int K3_1 = initdata.grid.K3 - 1;
00763 int k = 0;
00764 for (int idx = 0; idx < grid_msgs.size(); idx++) {
00765 int xstart = m2m_recv_grid[idx].xstart;
00766 int xlen = m2m_recv_grid[idx].xlen;
00767 int ystart = m2m_recv_grid[idx].ystart;
00768 int ylen = m2m_recv_grid[idx].ylen;
00769 int zstart = m2m_recv_grid[idx].zstart;
00770 int zlen = m2m_recv_grid[idx].zlen;
00771
00772 float *qmsg = m2m_recv_grid[idx].data;
00773 for ( int i=xstart; i<xstart+xlen; ++i ) {
00774 for ( int j=ystart; j<ystart+ylen; ++j) {
00775 float *d = data + (i * ny + j) * dim3;
00776
00777 #pragma disjoint (*qmsg, *d)
00778 #pragma unroll (4)
00779 for ( k=zstart; k<zstart+zlen; ++k ) {
00780 int kz = k;
00781
00782 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00783
00784
00785 d[kz] += *(qmsg++);
00786 }
00787 }
00788 }
00789 }
00790 }
00791
00792 void OptPmeZPencil::many_to_many_send_trans() {
00793 int zBlocks = initdata.zBlocks;
00794 int block3 = initdata.grid.block3;
00795 int dim3 = initdata.grid.dim3;
00796
00797 float *md = many_to_many_data;
00798 if (single_pencil) {
00799 const float *d = data;
00800 for ( int kb=0; kb<zBlocks; ++kb ) {
00801 *(md++) = d[2*kb];
00802 *(md++) = d[2*kb+1];
00803 }
00804 }
00805 else {
00806 int nz = block3;
00807 for ( int kb=0; kb<zBlocks; ++kb ) {
00808 nz = block3;
00809 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00810
00811 const float *d = data;
00812 for ( int i=0; i<nx; ++i ) {
00813 for ( int j=0; j<ny; ++j, d += dim3 ) {
00814 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00815 *(md++) = d[2*k];
00816 *(md++) = d[2*k+1];
00817 }
00818 }
00819 }
00820 }
00821 }
00822
00823 #if CHARM_VERSION > 60000
00824 CmiDirect_manytomany_start (handle, PHASE_YF);
00825 #endif
00826 }
00827
00828 void OptPmeYPencil::many_to_many_recv_trans () {
00829 int block2 = initdata.grid.block2;
00830 int K2 = initdata.grid.K2;
00831
00832 const float *md = many_to_many_data;
00833 if (single_pencil) {
00834 float *d = data;
00835 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00836 d[2*jb] = *(md++);
00837 d[2*jb +1] = *(md++);
00838 }
00839 }
00840 else {
00841 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00842 int ny = many_to_many_nb[jb];
00843 float *d = data;
00844 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00845 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00846 for ( int k=0; k<nz; ++k ) {
00847 d[2*(j*nz+k)] = *(md++);
00848 d[2*(j*nz+k)+1] = *(md++);
00849 }
00850 }
00851 }
00852 }
00853 }
00854 }
00855
00856 void OptPmeYPencil::many_to_many_send(int phase) {
00857 int yBlocks = initdata.yBlocks;
00858 int block2 = initdata.grid.block2;
00859 int K2 = initdata.grid.K2;
00860
00861 float *md = many_to_many_data;
00862 int ny = block2;
00863 if (single_pencil) {
00864 const float *d = data;
00865 for ( int jb=0; jb<yBlocks; ++jb ) {
00866 *(md++) = d[2*jb];
00867 *(md++) = d[2*jb+1];
00868 }
00869 }
00870 else {
00871 for ( int jb=0; jb<yBlocks; ++jb ) {
00872 ny = block2;
00873 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00874
00875 const float *d = data;
00876 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00877 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00878 for ( int k=0; k<nz; ++k ) {
00879 *(md++) = d[2*(j*nz+k)];
00880 *(md++) = d[2*(j*nz+k)+1];
00881 }
00882 }
00883 }
00884 }
00885 }
00886
00887 #if CHARM_VERSION > 60000
00888 CmiDirect_manytomany_start (handle, phase);
00889 #endif
00890 }
00891
00892 void OptPmeXPencil::many_to_many_recv_trans () {
00893
00894 int block1 = initdata.grid.block1;
00895 int K1 = initdata.grid.K1;
00896
00897 const float *md = many_to_many_data;
00898 if (single_pencil) {
00899 for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00900 data[2*ib] = *(md++);
00901 data[2*ib+1] = *(md++);
00902 }
00903 }
00904 else {
00905 for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00906 int nx = many_to_many_nb[ib];
00907 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00908 float *d = data + i*ny*nz*2;
00909 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00910 for ( int k=0; k<nz; ++k ) {
00911 d[2*k] = *(md++);
00912 d[2*k+1] = *(md++);
00913 }
00914 }
00915 }
00916 }
00917 }
00918 }
00919
00920 void OptPmeXPencil::many_to_many_send_untrans() {
00921 int xBlocks = initdata.xBlocks;
00922 int block1 = initdata.grid.block1;
00923 int K1 = initdata.grid.K1;
00924
00925 int nx = block1;
00926 float * md = many_to_many_data;
00927 if (single_pencil) {
00928 float *d = data;
00929 for ( int ib=0; ib<xBlocks; ++ib ) {
00930 *(md++) = d[2*ib];
00931 *(md++) = d[2*ib+1];
00932 }
00933 }
00934 else {
00935 for ( int ib=0; ib<xBlocks; ++ib ) {
00936 nx = block1;
00937 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00938
00939 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00940 float *d = data + i*ny*nz*2;
00941 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00942 for ( int k=0; k<nz; ++k ) {
00943 *(md++) = d[2*k];
00944 *(md++) = d[2*k+1];
00945 }
00946 }
00947 }
00948 }
00949 }
00950 #if CHARM_VERSION > 60000
00951 CmiDirect_manytomany_start (handle, PHASE_YB);
00952 #endif
00953 }
00954
00955 void OptPmeYPencil::many_to_many_recv_untrans () {
00956 int block2 = initdata.grid.block2;
00957 int K2 = initdata.grid.K2;
00958
00959 const float *md = many_to_many_data;
00960 if (single_pencil) {
00961 float *d = data;
00962 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00963 d[2*jb] = *(md++);
00964 d[2*jb+1] = *(md++);
00965 }
00966 }
00967 else {
00968 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00969 int ny = many_to_many_nb[jb];
00970 float *d = data;
00971 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00972 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00973 for ( int k=0; k<nz; ++k ) {
00974 d[2*(j*nz+k)] = *(md++);
00975 d[2*(j*nz+k)+1] = *(md++);
00976 }
00977 }
00978 }
00979 }
00980 }
00981 }
00982
00983 void OptPmeZPencil::many_to_many_recv_untrans() {
00984
00985 int block3 = initdata.grid.block3;
00986 int dim3 = initdata.grid.dim3;
00987
00988 const float *md = many_to_many_data;
00989 if (single_pencil) {
00990 float *d = data;
00991 for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00992 d[2*kb] = *(md++);
00993 d[2*kb+1] = *(md++);
00994 }
00995 }
00996 else {
00997 for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00998 int nz = many_to_many_nb[kb];
00999 float *d = data;
01000 for ( int i=0; i<nx; ++i ) {
01001 for ( int j=0; j<ny; ++j, d += dim3 ) {
01002 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
01003 d[2*k] = *(md++);
01004 d[2*k+1] = *(md++);
01005 }
01006 }
01007 }
01008 }
01009 }
01010 }
01011
01012 void OptPmeZPencil::many_to_many_send_ungrid () {
01013 int dim3 = initdata.grid.dim3;
01014 int K3 = initdata.grid.K3;
01015 int K3_1 = initdata.grid.K3 - 1;
01016 int k = 0;
01017 for (int idx = 0; idx < grid_msgs.size(); idx++) {
01018 int xstart = m2m_recv_grid[idx].xstart;
01019 int xlen = m2m_recv_grid[idx].xlen;
01020 int ystart = m2m_recv_grid[idx].ystart;
01021 int ylen = m2m_recv_grid[idx].ylen;
01022 int zstart = m2m_recv_grid[idx].zstart;
01023 int zlen = m2m_recv_grid[idx].zlen;
01024
01025 float *qmsg = m2m_recv_grid[idx].data;
01026 for ( int i=xstart; i<xstart+xlen; ++i ) {
01027 for ( int j=ystart; j<ystart+ylen; ++j) {
01028 float *d = data + (i * ny + j) * dim3;
01029
01030 #pragma disjoint (*d, *qmsg)
01031 #pragma unroll (4)
01032 for ( k=zstart; k<zstart+zlen; ++k ) {
01033 int kz = k;
01034
01035 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
01036
01037
01038 *(qmsg++) = d[kz];
01039 }
01040 }
01041 }
01042 }
01043
01044 #if CHARM_VERSION > 60000
01045 CmiDirect_manytomany_start (handle, PHASE_UG);
01046 #endif
01047 }
01048
01049
01050
01051 void OptPmeZPencil::initialize_manytomany () {
01052 #if CHARM_VERSION > 60000
01053 int idx = 0;
01054 int totalcount = 0;
01055 for (idx = 0; idx < grid_msgs.size(); idx ++)
01056 totalcount += m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
01057 many_to_many_gr_data = new float [totalcount];
01058
01059 CkArrayIndex3D aidx (thisIndex.x, thisIndex.y, thisIndex.z);
01060 cbw_recvgrid.cb = CkCallback (CkIndex_OptPmeZPencil::many_to_manyRecvGrid(NULL), aidx, thisProxy.ckGetArrayID());
01061 cbw_recvgrid.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01062 cbw_recvgrid.array = this;
01063 PatchMap *patchMap = PatchMap::Object();
01064 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_GR,
01065 call_ck_cb_recv_grid,
01066 &cbw_recvgrid,
01067 (char *)many_to_many_gr_data,
01068 patchMap->numPatches(), -1);
01069
01070 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_UG, NULL, NULL, (char *)many_to_many_gr_data,
01071 grid_msgs.size(), thisIndex.x *initdata.yBlocks + thisIndex.y);
01072
01073 int offset = 0;
01074 for (idx = 0; idx < grid_msgs.size(); idx ++) {
01075 m2m_recv_grid[idx].data = (float *) ((char *)many_to_many_gr_data + offset);
01076 int fcount = m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
01077 CmiDirect_manytomany_initialize_recv (handle, PHASE_GR, m2m_recv_grid[idx].patchID,
01078 offset, fcount *sizeof(float), patchMap->node(m2m_recv_grid[idx].patchID));
01079 CmiDirect_manytomany_initialize_send (handle, PHASE_UG, idx, offset, fcount *sizeof(float),
01080 patchMap->node(m2m_recv_grid[idx].patchID));
01081 offset += fcount * sizeof(float);
01082 }
01083
01084 int zBlocks = initdata.zBlocks;
01085 int block3 = initdata.grid.block3;
01086 int dim3 = initdata.grid.dim3;
01087
01088
01089 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YF, NULL, NULL, (char *)many_to_many_data, zBlocks, thisIndex.y);
01090
01091
01092 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeZPencil::many_to_manyRecvUntrans(NULL), aidx, thisProxy.ckGetArrayID());
01093 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01094 cbw_recvuntrans.array = this;
01095 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_ZB,
01096 call_ck_cb_recv_untrans_z,
01097 &cbw_recvuntrans,
01098 (char *)many_to_many_data,
01099 initdata.zBlocks, -1);
01100 single_pencil = false;
01101 if (nx == 1 && ny == 1 && zBlocks <= dim3/2 && block3==1)
01102 single_pencil = true;
01103
01104 for ( int kb=0; kb<zBlocks; ++kb ) {
01105 int nz = block3;
01106 if ( (kb+1)*block3 > dim3/2 ) {
01107 single_pencil = false;
01108 nz = dim3/2 - kb*block3;
01109 }
01110
01111
01112 CkArrayIndex3D index (thisIndex.x,0,kb);
01113 CProxy_OptPmePencilMapY yproxy (global_map_y);
01114 int pe = yproxy.ckLocalBranch()->procNum(0, index);
01115 CmiDirect_manytomany_initialize_send (handle, PHASE_YF, kb, kb*2*nx*ny*block3*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01116
01117
01118 CmiDirect_manytomany_initialize_recv (handle, PHASE_ZB, kb, kb*nx*ny*block3*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01119 many_to_many_nb [kb] = nz;
01120 }
01121
01122 #endif
01123 }
01124
01125 void OptPmeYPencil::initialize_manytomany () {
01126 #if CHARM_VERSION > 60000
01127 int yBlocks = initdata.yBlocks;
01128 int block2 = initdata.grid.block2;
01129 int K2 = initdata.grid.K2;
01130
01131
01132 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_XF, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.x);
01133
01134 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_ZB, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.z);
01135
01136
01137 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
01138 cbw_recvtrans.cb = CkCallback (CkIndex_OptPmeYPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
01139 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01140 cbw_recvtrans.array = this;
01141 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YF,
01142 call_ck_cb_recv_trans_y,
01143 &cbw_recvtrans,
01144 (char *)many_to_many_data,
01145 initdata.yBlocks, -1);
01146
01147
01148 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeYPencil::many_to_manyRecvUntrans(NULL), idx, thisProxy.ckGetArrayID());
01149 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01150 cbw_recvuntrans.array = this;
01151 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YB,
01152 call_ck_cb_recv_untrans_y,
01153 &cbw_recvuntrans,
01154 (char *)many_to_many_data,
01155 initdata.yBlocks, -1);
01156
01157 single_pencil = false;
01158 if (nz == 1 && nx == 1 && yBlocks <= K2 && block2==1)
01159 single_pencil = true;
01160
01161 for ( int jb=0; jb<yBlocks; ++jb ) {
01162 int ny = block2;
01163 if ( (jb+1)*block2 > K2 ) {
01164 single_pencil = false;
01165 ny = K2 - jb*block2;
01166 }
01167
01168
01169 CkArrayIndex3D index (thisIndex.x,jb,0);
01170 CProxy_OptPmePencilMapZ zproxy (global_map_z);
01171 int pe = zproxy.ckLocalBranch()->procNum(0, index);
01172 CmiDirect_manytomany_initialize_send (handle, PHASE_ZB, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01173
01174
01175 CmiDirect_manytomany_initialize_recv (handle, PHASE_YF, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01176
01177
01178 index = CkArrayIndex3D (0,jb,thisIndex.z);
01179 CProxy_OptPmePencilMapX xproxy (global_map_x);
01180 pe = xproxy.ckLocalBranch()->procNum(0, index);
01181 CmiDirect_manytomany_initialize_send (handle, PHASE_XF, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01182
01183
01184 CmiDirect_manytomany_initialize_recv (handle, PHASE_YB, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01185
01186 many_to_many_nb [jb] = ny;
01187 }
01188 #endif
01189 }
01190
01191 void OptPmeXPencil::initialize_manytomany () {
01192 #if CHARM_VERSION > 60000
01193 int xBlocks = initdata.xBlocks;
01194 int block1 = initdata.grid.block1;
01195 int K1 = initdata.grid.K1;
01196
01197 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YB, NULL, NULL, (char *)many_to_many_data, xBlocks, thisIndex.y);
01198 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
01199 cbw_recvtrans.cb = CkCallback(CkIndex_OptPmeXPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
01200 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01201 cbw_recvtrans.array = this;
01202 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_XF,
01203 call_ck_cb_recv_trans_x,
01204 &cbw_recvtrans,
01205 (char *)many_to_many_data,
01206 initdata.xBlocks, -1);
01207
01208 single_pencil = false;
01209 if (ny == 1 && nz == 1 && xBlocks <= K1 && block1==1)
01210 single_pencil = true;
01211
01212 for ( int ib=0; ib<xBlocks; ++ib ) {
01213 int nx = block1;
01214 if ( (ib+1)*block1 > K1 ) {
01215 single_pencil = false;
01216 nx = K1 - ib*block1;
01217 }
01218
01219 CkArrayIndex3D index (ib,0,thisIndex.z);
01220 CProxy_OptPmePencilMapY yproxy (global_map_y);
01221 int pe = yproxy.ckLocalBranch()->procNum(0, index);
01222 CmiDirect_manytomany_initialize_send (handle, PHASE_YB, ib, 2*block1*ny*nz*ib*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01223
01224 CmiDirect_manytomany_initialize_recv (handle, PHASE_XF, ib, ib*block1*ny*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01225 many_to_many_nb [ib] = nx;
01226 }
01227 #endif
01228 }
01229
01230
01234
01235
01236 #include "PmeFFTLib.def.h"