00001
00002 #include "fftlib.h"
00003
00004 #include <assert.h>
00005
00006 void call_ck_cb (void *arg) {
00007 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00008 cw->cb.send (cw->msg);
00009 }
00010
00011 void OptPmeZPencil::fft_init() {
00012
00013
00014
00015 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00016 Node *node = nd.ckLocalBranch();
00017 SimParameters *simParams = node->simParameters;
00018
00019 int K1 = initdata.grid.K1;
00020 int K2 = initdata.grid.K2;
00021 int K3 = initdata.grid.K3;
00022 int dim3 = initdata.grid.dim3;
00023 int block1 = initdata.grid.block1;
00024 int block2 = initdata.grid.block2;
00025
00026 nx = block1;
00027 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00028 ny = block2;
00029 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00030
00031 data = new float[nx*ny*dim3];
00032 many_to_many_data = new float[nx*ny*dim3];
00033 many_to_many_nb = new int [initdata.zBlocks];
00034 work = new float[dim3];
00035
00036 memset(data, 0, sizeof(float) * nx*ny*dim3);
00037 memset(many_to_many_data, 0, sizeof(float) * nx*ny*dim3);
00038
00039 order_init(initdata.zBlocks);
00040
00041 #ifdef NAMD_FFTW
00042 CmiLock(fftw_plan_lock);
00043
00044 forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
00045 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00046 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00047 backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
00048 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00049 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00050
00051 CmiUnlock(fftw_plan_lock);
00052 #else
00053 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00054 #endif
00055
00056 handle = NULL;
00057 #if CHARM_VERSION > 60000
00058 handle = CmiDirect_manytomany_allocate_handle();
00059 #endif
00060 }
00061
00062 void OptPmeYPencil::fft_init() {
00063 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00064 Node *node = nd.ckLocalBranch();
00065 SimParameters *simParams = node->simParameters;
00066
00067
00068
00069 int K1 = initdata.grid.K1;
00070 int K2 = initdata.grid.K2;
00071 int dim2 = initdata.grid.dim2;
00072 int dim3 = initdata.grid.dim3;
00073 int block1 = initdata.grid.block1;
00074 int block3 = initdata.grid.block3;
00075
00076 nx = block1;
00077 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00078 nz = block3;
00079 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00080
00081 data = new float[nx*dim2*nz*2];
00082 many_to_many_data = new float[nx*dim2*nz*2];
00083 many_to_many_nb = new int [initdata.yBlocks];
00084 work = new float[2*K2];
00085
00086 memset(many_to_many_data, 0, sizeof(float) * nx*dim2*nz*2);
00087
00088 order_init(initdata.yBlocks);
00089
00090 #ifdef NAMD_FFTW
00091 CmiLock(fftw_plan_lock);
00092
00093 forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
00094 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00095 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00096 nz, (fftw_complex *) work, 1);
00097 backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
00098 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00099 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00100 nz, (fftw_complex *) work, 1);
00101
00102 CmiUnlock(fftw_plan_lock);
00103 #else
00104 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00105 #endif
00106
00107 handle = NULL;
00108 #if CHARM_VERSION > 60000
00109 handle = CmiDirect_manytomany_allocate_handle();
00110 #endif
00111 initialize_manytomany();
00112 }
00113
00114
00115 void OptPmeXPencil::fft_init() {
00116 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00117 Node *node = nd.ckLocalBranch();
00118 SimParameters *simParams = node->simParameters;
00119
00120
00121
00122 lattice = simParams->lattice;
00123
00124 int K1 = initdata.grid.K1;
00125 int K2 = initdata.grid.K2;
00126 int dim3 = initdata.grid.dim3;
00127 int block2 = initdata.grid.block2;
00128 int block3 = initdata.grid.block3;
00129
00130 ny = block2;
00131 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00132 nz = block3;
00133 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00134
00135 data = new float[K1*block2*block3*2];
00136 many_to_many_data = new float[K1*block2*block3*2];
00137 many_to_many_nb = new int [initdata.xBlocks];
00138 work = new float[2*K1];
00139
00140 memset(many_to_many_data, 0, sizeof(float) * K1*block2*block3*2);
00141
00142 order_init(initdata.xBlocks);
00143
00144 #ifdef NAMD_FFTW
00145 CmiLock(fftw_plan_lock);
00146
00147 forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
00148 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00149 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00150 ny*nz, (fftw_complex *) work, 1);
00151 backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
00152 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00153 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00154 ny*nz, (fftw_complex *) work, 1);
00155
00156 CmiUnlock(fftw_plan_lock);
00157 #else
00158 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00159 #endif
00160
00161 myKSpace = new PmeKSpace(initdata.grid,
00162 thisIndex.y*block2, thisIndex.y*block2 + ny,
00163 thisIndex.z*block3, thisIndex.z*block3 + nz);
00164
00165 handle = NULL;
00166 #if CHARM_VERSION > 60000
00167 handle = CmiDirect_manytomany_allocate_handle();
00168 #endif
00169 initialize_manytomany();
00170
00171 constant_pressure = initdata.constant_pressure;
00172 }
00173
00174
00175
00176
00177 void OptPmeZPencil::recv_grid(const OptPmeGridMsg *msg) {
00178
00179 int dim3 = initdata.grid.dim3;
00180 if ( imsg == 0 ) {
00181 memset(data, 0, sizeof(float) * nx*ny*dim3);
00182 }
00183
00184 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00185 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00186 assert (xstart >= 0);
00187 assert (ystart >= 0);
00188 int xlen = msg->xlen;
00189 int ylen = msg->ylen;
00190 assert (xstart + xlen <= nx);
00191 assert (ystart + ylen <= ny);
00192
00193 float *qmsg = msg->qgrid;
00194 float *d = data;
00195 int zlen = msg->zlen;
00196 int zstart = msg->zstart;
00197 int zmax = zstart + zlen - 1;
00198 int k = 0;
00199
00200 int K3 = initdata.grid.K3;
00201 int K3_1 = initdata.grid.K3 - 1;
00202 while (zstart < 0) {
00203 zstart += K3;
00204 zmax += K3;
00205 }
00206
00207 for ( int i=xstart; i<xstart+xlen; ++i ) {
00208 for ( int j=ystart; j<ystart+ylen; ++j) {
00209 float *d = data + (i * ny + j) * dim3;
00210 for ( k=zstart; k<=zmax; ++k ) {
00211 int kz = k;
00212
00213 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00214
00215
00216 d[kz] += *(qmsg++);
00217 }
00218 }
00219 }
00220
00221 if (_iter == MANY_TO_MANY_SETUP) {
00222 if (imsg == 0)
00223 m2m_recv_grid = new PatchGridElem [grid_msgs.size()];
00224
00225 m2m_recv_grid[imsg].xstart = xstart;
00226 m2m_recv_grid[imsg].xlen = xlen;
00227 m2m_recv_grid[imsg].ystart = ystart;
00228 m2m_recv_grid[imsg].ylen = ylen;
00229 m2m_recv_grid[imsg].zstart = zstart;
00230 m2m_recv_grid[imsg].zlen = zlen;
00231 m2m_recv_grid[imsg].patchID = msg->patchID;
00232
00233 if ( imsg == grid_msgs.size() - 1)
00234 initialize_manytomany ();
00235 }
00236 }
00237
00238
00239 void OptPmeZPencil::forward_fft() {
00240 #ifdef FFTCHECK
00241 int dim3 = initdata.grid.dim3;
00242 int K3 = initdata.grid.K3;
00243 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00244 float *d = data;
00245 for ( int i=0; i<nx; ++i ) {
00246 for ( int j=0; j<ny; ++j, d += dim3 ) {
00247 for ( int k=0; k<dim3; ++k ) {
00248 d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
00249 }
00250 }
00251 }
00252 #endif
00253 #ifdef NAMD_FFTW
00254 rfftwnd_real_to_complex(forward_plan, nx*ny,
00255 data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
00256 #endif
00257 }
00258
00259 void OptPmeZPencil::send_trans() {
00260 int zBlocks = initdata.zBlocks;
00261 int block3 = initdata.grid.block3;
00262 int dim3 = initdata.grid.dim3;
00263
00264
00265 for ( int isend=0; isend<zBlocks; ++isend ) {
00266 int kb = send_order[isend];
00267 int nz = block3;
00268 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00269
00270 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00271 msg->sourceNode = thisIndex.y;
00272 msg->nx = ny;
00273 float *md = msg->qgrid;
00274 const float *d = data;
00275 for ( int i=0; i<nx; ++i ) {
00276 for ( int j=0; j<ny; ++j, d += dim3 ) {
00277 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00278 *(md++) = d[2*k];
00279 *(md++) = d[2*k+1];
00280 }
00281 }
00282 }
00283
00284
00285 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
00286 }
00287 }
00288
00289
00290 void OptPmeYPencil::recv_trans(const OptPmeFFTMsg *msg) {
00291
00292 int block2 = initdata.grid.block2;
00293 int K2 = initdata.grid.K2;
00294 int jb = msg->sourceNode;
00295 int ny = msg->nx;
00296 const float *md = msg->qgrid;
00297 float *d = data;
00298 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00299 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00300 for ( int k=0; k<nz; ++k ) {
00301 d[2*(j*nz+k)] = *(md++);
00302 d[2*(j*nz+k)+1] = *(md++);
00303 }
00304 }
00305 }
00306 }
00307
00308 void OptPmeYPencil::forward_fft() {
00309 #ifdef NAMD_FFTW
00310 for ( int i=0; i<nx; ++i ) {
00311 fftw(forward_plan, nz,
00312 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00313 nz, 1, (fftw_complex *) work, 1, 0);
00314 }
00315 #endif
00316 }
00317
00318 void OptPmeYPencil::send_trans() {
00319 int yBlocks = initdata.yBlocks;
00320 int block2 = initdata.grid.block2;
00321 int K2 = initdata.grid.K2;
00322
00323 for ( int isend=0; isend<yBlocks; ++isend ) {
00324 int jb = send_order[isend];
00325 int ny = block2;
00326 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00327 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00328 msg->sourceNode = thisIndex.x;
00329 msg->nx = nx;
00330 float *md = msg->qgrid;
00331 const float *d = data;
00332 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00333 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00334 for ( int k=0; k<nz; ++k ) {
00335 *(md++) = d[2*(j*nz+k)];
00336 *(md++) = d[2*(j*nz+k)+1];
00337 }
00338 }
00339 }
00340 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
00341 thisIndex.x, jb, thisIndex.z);
00342
00343
00344 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
00345 }
00346 }
00347
00348
00349
00350 void OptPmeXPencil::recv_trans(const OptPmeFFTMsg *msg) {
00351
00352 int block1 = initdata.grid.block1;
00353 int K1 = initdata.grid.K1;
00354 int ib = msg->sourceNode;
00355 int nx = msg->nx;
00356 const float *md = msg->qgrid;
00357 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00358 float *d = data + i*ny*nz*2;
00359 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00360 for ( int k=0; k<nz; ++k ) {
00361 d[2*k] = *(md++);
00362 d[2*k+1] = *(md++);
00363 }
00364 }
00365 }
00366 }
00367
00368
00369
00370 void OptPmeXPencil::forward_fft() {
00371 #ifdef NAMD_FFTW
00372 fftw(forward_plan, ny*nz,
00373 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00374 #endif
00375 }
00376
00377 void OptPmeXPencil::pme_kspace() {
00378
00379 evir = 0.;
00380
00381 #ifdef FFTCHECK
00382 return;
00383 #endif
00384
00385 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
00386 evir[0] = myKSpace->compute_energy(data,
00387 lattice, ewaldcof, &(evir[1]));
00388
00389 contribute (7*sizeof(double), evir.begin(), CkReduction::sum_double, initdata.cb_energy);
00390
00391 }
00392
00393 void OptPmeXPencil::backward_fft() {
00394 #ifdef NAMD_FFTW
00395 fftw(backward_plan, ny*nz,
00396 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00397 #endif
00398 }
00399
00400 void OptPmeXPencil::send_untrans() {
00401 int xBlocks = initdata.xBlocks;
00402 int block1 = initdata.grid.block1;
00403 int K1 = initdata.grid.K1;
00404
00405 for ( int isend=0; isend<xBlocks; ++isend ) {
00406 int ib = send_order[isend];
00407 int nx = block1;
00408 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00409 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00410 msg->sourceNode = thisIndex.y;
00411 msg->nx = ny;
00412 float *md = msg->qgrid;
00413 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00414 float *d = data + i*ny*nz*2;
00415 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00416 for ( int k=0; k<nz; ++k ) {
00417 *(md++) = d[2*k];
00418 *(md++) = d[2*k+1];
00419 }
00420 }
00421 }
00422
00423
00424 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
00425 }
00426 }
00427
00428
00429 void OptPmeYPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00430
00431 int block2 = initdata.grid.block2;
00432 int K2 = initdata.grid.K2;
00433 int jb = msg->sourceNode;
00434 int ny = msg->nx;
00435 const float *md = msg->qgrid;
00436 float *d = data;
00437 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00438 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00439 for ( int k=0; k<nz; ++k ) {
00440 d[2*(j*nz+k)] = *(md++);
00441 d[2*(j*nz+k)+1] = *(md++);
00442 }
00443 }
00444 }
00445 }
00446
00447
00448
00449 void OptPmeYPencil::backward_fft() {
00450 #ifdef NAMD_FFTW
00451 for ( int i=0; i<nx; ++i ) {
00452 #if CMK_BLUEGENEL
00453 CmiNetworkProgress();
00454 #endif
00455
00456 fftw(backward_plan, nz,
00457 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00458 nz, 1, (fftw_complex *) work, 1, 0);
00459 }
00460 #endif
00461 }
00462
00463 void OptPmeYPencil::send_untrans() {
00464
00465
00466 int yBlocks = initdata.yBlocks;
00467 int block2 = initdata.grid.block2;
00468 int K2 = initdata.grid.K2;
00469
00470 for ( int isend=0; isend<yBlocks; ++isend ) {
00471 int jb = send_order[isend];
00472
00473 int ny = block2;
00474 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00475 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00476 msg->sourceNode = thisIndex.z;
00477 msg->nx = nz;
00478 float *md = msg->qgrid;
00479 const float *d = data;
00480 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00481 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00482 for ( int k=0; k<nz; ++k ) {
00483 *(md++) = d[2*(j*nz+k)];
00484 *(md++) = d[2*(j*nz+k)+1];
00485 }
00486 }
00487 }
00488
00489
00490 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
00491 }
00492 }
00493
00494 void OptPmeZPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00495
00496
00497
00498 int block3 = initdata.grid.block3;
00499 int dim3 = initdata.grid.dim3;
00500 int kb = msg->sourceNode;
00501 int nz = msg->nx;
00502 const float *md = msg->qgrid;
00503 float *d = data;
00504 for ( int i=0; i<nx; ++i ) {
00505 for ( int j=0; j<ny; ++j, d += dim3 ) {
00506 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00507 d[2*k] = *(md++);
00508 d[2*k+1] = *(md++);
00509 }
00510 }
00511 }
00512 }
00513
00514
00515 void OptPmeZPencil::backward_fft() {
00516 #ifdef NAMD_FFTW
00517 rfftwnd_complex_to_real(backward_plan, nx*ny,
00518 (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
00519 #endif
00520
00521 #if CMK_BLUEGENEL
00522 CmiNetworkProgress();
00523 #endif
00524
00525 #ifdef FFTCHECK
00526 int dim3 = initdata.grid.dim3;
00527 int K1 = initdata.grid.K1;
00528 int K2 = initdata.grid.K2;
00529 int K3 = initdata.grid.K3;
00530 float scale = 1. / (1. * K1 * K2 * K3);
00531 float maxerr = 0.;
00532 float maxstd = 0.;
00533 int mi, mj, mk; mi = mj = mk = -1;
00534 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00535 const float *d = data;
00536 for ( int i=0; i<nx; ++i ) {
00537 for ( int j=0; j<ny; ++j, d += dim3 ) {
00538 for ( int k=0; k<K3; ++k ) {
00539 float std = 10. * (10. * (10. * std_base + i) + j) + k;
00540 float err = scale * d[k] - std;
00541 if ( fabsf(err) > fabsf(maxerr) ) {
00542 maxerr = err;
00543 maxstd = std;
00544 mi = i; mj = j; mk = k;
00545 }
00546 }
00547 }
00548 }
00549 CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
00550 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
00551 #endif
00552 }
00553
00554 void OptPmeZPencil::send_ungrid(OptPmeGridMsg *msg) {
00555 int pe = msg->sourceNode;
00556 msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
00557
00558 int dim3 = initdata.grid.dim3;
00559 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00560 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00561 assert (xstart >= 0);
00562 assert (ystart >= 0);
00563 int xlen = msg->xlen;
00564 int ylen = msg->ylen;
00565 assert (xstart + xlen <= nx);
00566 assert (ystart + ylen <= ny);
00567
00568 float *qmsg = msg->qgrid;
00569 float *d = data;
00570 int zstart = msg->zstart;
00571 int zlen = msg->zlen;
00572 int zmax = zstart + zlen - 1;
00573
00574 int K3_1 = initdata.grid.K3 - 1;
00575 int K3 = initdata.grid.K3;
00576 if (zstart < 0) {
00577 zstart += K3;
00578 zmax += K3;
00579 }
00580
00581 int k = 0;
00582 for ( int i=xstart; i<xstart+xlen; ++i ) {
00583 for ( int j=ystart; j<ystart+ylen; ++j) {
00584 float *d = data + (i * ny + j) * dim3;
00585 for ( k=zstart; k<=zmax; ++k ) {
00586 int kz = k;
00587
00588 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00589 *(qmsg++) = d[kz];
00590 }
00591 }
00592 }
00593
00594 initdata.pmeProxy[pe].recvUngrid(msg);
00595 }
00596
00597
00601
00602 void OptPmeZPencil::many_to_many_recv_grid () {
00603 int dim3 = initdata.grid.dim3;
00604 memset(data, 0, sizeof(float) * nx*ny*dim3);
00605 int K3 = initdata.grid.K3;
00606 int K3_1 = initdata.grid.K3 - 1;
00607 int k = 0;
00608 for (int idx = 0; idx < grid_msgs.size(); idx++) {
00609 int xstart = m2m_recv_grid[idx].xstart;
00610 int xlen = m2m_recv_grid[idx].xlen;
00611 int ystart = m2m_recv_grid[idx].ystart;
00612 int ylen = m2m_recv_grid[idx].ylen;
00613 int zstart = m2m_recv_grid[idx].zstart;
00614 int zlen = m2m_recv_grid[idx].zlen;
00615
00616 float *qmsg = m2m_recv_grid[idx].data;
00617 for ( int i=xstart; i<xstart+xlen; ++i ) {
00618 for ( int j=ystart; j<ystart+ylen; ++j) {
00619 float *d = data + (i * ny + j) * dim3;
00620
00621 #pragma disjoint (*qmsg, *d)
00622 #pragma unroll (4)
00623 for ( k=zstart; k<zstart+zlen; ++k ) {
00624 int kz = k;
00625
00626 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00627
00628
00629 d[kz] += *(qmsg++);
00630 }
00631 }
00632 }
00633 }
00634 }
00635
00636 void OptPmeZPencil::many_to_many_send_trans() {
00637 int zBlocks = initdata.zBlocks;
00638 int block3 = initdata.grid.block3;
00639 int dim3 = initdata.grid.dim3;
00640
00641 float *md = many_to_many_data;
00642 int nz = block3;
00643 for ( int kb=0; kb<zBlocks; ++kb ) {
00644 nz = block3;
00645 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00646
00647 const float *d = data;
00648 for ( int i=0; i<nx; ++i ) {
00649 for ( int j=0; j<ny; ++j, d += dim3 ) {
00650 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00651 *(md++) = d[2*k];
00652 *(md++) = d[2*k+1];
00653 }
00654 }
00655 }
00656 }
00657
00658 #if CHARM_VERSION > 60000
00659 CmiDirect_manytomany_start (handle, PHASE_YF);
00660 #endif
00661 }
00662
00663 void OptPmeYPencil::many_to_many_recv_trans () {
00664 int block2 = initdata.grid.block2;
00665 int K2 = initdata.grid.K2;
00666
00667 const float *md = many_to_many_data;
00668 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00669 int ny = many_to_many_nb[jb];
00670 float *d = data;
00671 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00672 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00673 for ( int k=0; k<nz; ++k ) {
00674 d[2*(j*nz+k)] = *(md++);
00675 d[2*(j*nz+k)+1] = *(md++);
00676 }
00677 }
00678 }
00679 }
00680 }
00681
00682 void OptPmeYPencil::many_to_many_send(int phase) {
00683 int yBlocks = initdata.yBlocks;
00684 int block2 = initdata.grid.block2;
00685 int K2 = initdata.grid.K2;
00686
00687 float *md = many_to_many_data;
00688 int ny = block2;
00689 for ( int jb=0; jb<yBlocks; ++jb ) {
00690 ny = block2;
00691 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00692
00693 const float *d = data;
00694 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00695 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00696 for ( int k=0; k<nz; ++k ) {
00697 *(md++) = d[2*(j*nz+k)];
00698 *(md++) = d[2*(j*nz+k)+1];
00699 }
00700 }
00701 }
00702 }
00703
00704 #if CHARM_VERSION > 60000
00705 CmiDirect_manytomany_start (handle, phase);
00706 #endif
00707 }
00708
00709 void OptPmeXPencil::many_to_many_recv_trans () {
00710
00711 int block1 = initdata.grid.block1;
00712 int K1 = initdata.grid.K1;
00713
00714 const float *md = many_to_many_data;
00715 for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00716 int nx = many_to_many_nb[ib];
00717 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00718 float *d = data + i*ny*nz*2;
00719 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00720 for ( int k=0; k<nz; ++k ) {
00721 d[2*k] = *(md++);
00722 d[2*k+1] = *(md++);
00723 }
00724 }
00725 }
00726 }
00727 }
00728
00729 void OptPmeXPencil::many_to_many_send_untrans() {
00730 int xBlocks = initdata.xBlocks;
00731 int block1 = initdata.grid.block1;
00732 int K1 = initdata.grid.K1;
00733
00734 int nx = block1;
00735 float * md = many_to_many_data;
00736 for ( int ib=0; ib<xBlocks; ++ib ) {
00737 nx = block1;
00738 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00739
00740 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00741 float *d = data + i*ny*nz*2;
00742 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00743 for ( int k=0; k<nz; ++k ) {
00744 *(md++) = d[2*k];
00745 *(md++) = d[2*k+1];
00746 }
00747 }
00748 }
00749 }
00750 #if CHARM_VERSION > 60000
00751 CmiDirect_manytomany_start (handle, PHASE_YB);
00752 #endif
00753 }
00754
00755 void OptPmeYPencil::many_to_many_recv_untrans () {
00756 int block2 = initdata.grid.block2;
00757 int K2 = initdata.grid.K2;
00758
00759 const float *md = many_to_many_data;
00760 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00761 int ny = many_to_many_nb[jb];
00762 float *d = data;
00763 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00764 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00765 for ( int k=0; k<nz; ++k ) {
00766 d[2*(j*nz+k)] = *(md++);
00767 d[2*(j*nz+k)+1] = *(md++);
00768 }
00769 }
00770 }
00771 }
00772 }
00773
00774 void OptPmeZPencil::many_to_many_recv_untrans() {
00775
00776 int block3 = initdata.grid.block3;
00777 int dim3 = initdata.grid.dim3;
00778
00779 const float *md = many_to_many_data;
00780 for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00781 int nz = many_to_many_nb[kb];
00782 float *d = data;
00783 for ( int i=0; i<nx; ++i ) {
00784 for ( int j=0; j<ny; ++j, d += dim3 ) {
00785 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00786 d[2*k] = *(md++);
00787 d[2*k+1] = *(md++);
00788 }
00789 }
00790 }
00791 }
00792 }
00793
00794 void OptPmeZPencil::many_to_many_send_ungrid () {
00795 int dim3 = initdata.grid.dim3;
00796 int K3 = initdata.grid.K3;
00797 int K3_1 = initdata.grid.K3 - 1;
00798 int k = 0;
00799 for (int idx = 0; idx < grid_msgs.size(); idx++) {
00800 int xstart = m2m_recv_grid[idx].xstart;
00801 int xlen = m2m_recv_grid[idx].xlen;
00802 int ystart = m2m_recv_grid[idx].ystart;
00803 int ylen = m2m_recv_grid[idx].ylen;
00804 int zstart = m2m_recv_grid[idx].zstart;
00805 int zlen = m2m_recv_grid[idx].zlen;
00806
00807 float *qmsg = m2m_recv_grid[idx].data;
00808 for ( int i=xstart; i<xstart+xlen; ++i ) {
00809 for ( int j=ystart; j<ystart+ylen; ++j) {
00810 float *d = data + (i * ny + j) * dim3;
00811
00812 #pragma disjoint (*d, *qmsg)
00813 #pragma unroll (4)
00814 for ( k=zstart; k<zstart+zlen; ++k ) {
00815 int kz = k;
00816
00817 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00818
00819
00820 *(qmsg++) = d[kz];
00821 }
00822 }
00823 }
00824 }
00825
00826 #if CHARM_VERSION > 60000
00827 CmiDirect_manytomany_start (handle, PHASE_UG);
00828 #endif
00829 }
00830
00831
00832
00833 void OptPmeZPencil::initialize_manytomany () {
00834 #if CHARM_VERSION > 60000
00835 int idx = 0;
00836 int totalcount = 0;
00837 for (idx = 0; idx < grid_msgs.size(); idx ++)
00838 totalcount += m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
00839 many_to_many_gr_data = new float [totalcount];
00840
00841 CkArrayIndex3D aidx (thisIndex.x, thisIndex.y, thisIndex.z);
00842 cbw_recvgrid.cb = CkCallback (CkIndex_OptPmeZPencil::many_to_manyRecvGrid(NULL), aidx, thisProxy.ckGetArrayID());
00843 cbw_recvgrid.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00844 PatchMap *patchMap = PatchMap::Object();
00845 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_GR,
00846 call_ck_cb, &cbw_recvgrid,
00847 (char *)many_to_many_gr_data, patchMap->numPatches(), -1);
00848
00849 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_UG, NULL, NULL, (char *)many_to_many_gr_data,
00850 grid_msgs.size(), thisIndex.x *initdata.yBlocks + thisIndex.y);
00851
00852 int offset = 0;
00853 for (idx = 0; idx < grid_msgs.size(); idx ++) {
00854 m2m_recv_grid[idx].data = (float *) ((char *)many_to_many_gr_data + offset);
00855 int fcount = m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
00856 CmiDirect_manytomany_initialize_recv (handle, PHASE_GR, m2m_recv_grid[idx].patchID,
00857 offset, fcount *sizeof(float), patchMap->node(m2m_recv_grid[idx].patchID));
00858 CmiDirect_manytomany_initialize_send (handle, PHASE_UG, idx, offset, fcount *sizeof(float),
00859 patchMap->node(m2m_recv_grid[idx].patchID));
00860 offset += fcount * sizeof(float);
00861 }
00862
00863 int zBlocks = initdata.zBlocks;
00864 int block3 = initdata.grid.block3;
00865 int dim3 = initdata.grid.dim3;
00866
00867
00868 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YF, NULL, NULL, (char *)many_to_many_data, zBlocks, thisIndex.y);
00869
00870
00871 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeZPencil::many_to_manyRecvUntrans(NULL), aidx, thisProxy.ckGetArrayID());
00872 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00873 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_ZB, call_ck_cb, &cbw_recvuntrans,
00874 (char *)many_to_many_data, initdata.zBlocks, -1);
00875
00876 for ( int kb=0; kb<zBlocks; ++kb ) {
00877 int nz = block3;
00878 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00879
00880
00881 CkArrayIndex3D index (thisIndex.x,0,kb);
00882 CProxy_OptPmePencilMapY yproxy (global_map_y);
00883 int pe = yproxy.ckLocalBranch()->procNum(0, index);
00884 CmiDirect_manytomany_initialize_send (handle, PHASE_YF, kb, kb*2*nx*ny*block3*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
00885
00886
00887 CmiDirect_manytomany_initialize_recv (handle, PHASE_ZB, kb, kb*nx*ny*block3*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
00888 many_to_many_nb [kb] = nz;
00889 }
00890 #endif
00891 }
00892
00893 void OptPmeYPencil::initialize_manytomany () {
00894 #if CHARM_VERSION > 60000
00895 int yBlocks = initdata.yBlocks;
00896 int block2 = initdata.grid.block2;
00897 int K2 = initdata.grid.K2;
00898
00899
00900 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_XF, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.x);
00901
00902 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_ZB, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.z);
00903
00904
00905 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
00906 cbw_recvtrans.cb = CkCallback (CkIndex_OptPmeYPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
00907 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00908 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YF, call_ck_cb, &cbw_recvtrans,
00909 (char *)many_to_many_data, initdata.yBlocks, -1);
00910
00911
00912 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeYPencil::many_to_manyRecvUntrans(NULL), idx, thisProxy.ckGetArrayID());
00913 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00914 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YB, call_ck_cb,
00915 &cbw_recvuntrans, (char *)many_to_many_data, initdata.yBlocks, -1);
00916
00917 for ( int jb=0; jb<yBlocks; ++jb ) {
00918 int ny = block2;
00919 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00920
00921
00922 CkArrayIndex3D index (thisIndex.x,jb,0);
00923 CProxy_OptPmePencilMapZ zproxy (global_map_z);
00924 int pe = zproxy.ckLocalBranch()->procNum(0, index);
00925 CmiDirect_manytomany_initialize_send (handle, PHASE_ZB, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
00926
00927
00928 CmiDirect_manytomany_initialize_recv (handle, PHASE_YF, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
00929
00930
00931 index = CkArrayIndex3D (0,jb,thisIndex.z);
00932 CProxy_OptPmePencilMapX xproxy (global_map_x);
00933 pe = xproxy.ckLocalBranch()->procNum(0, index);
00934 CmiDirect_manytomany_initialize_send (handle, PHASE_XF, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
00935
00936
00937 CmiDirect_manytomany_initialize_recv (handle, PHASE_YB, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
00938
00939 many_to_many_nb [jb] = ny;
00940 }
00941 #endif
00942 }
00943
00944 void OptPmeXPencil::initialize_manytomany () {
00945 #if CHARM_VERSION > 60000
00946 int xBlocks = initdata.xBlocks;
00947 int block1 = initdata.grid.block1;
00948 int K1 = initdata.grid.K1;
00949
00950 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YB, NULL, NULL, (char *)many_to_many_data, xBlocks, thisIndex.y);
00951 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
00952 cbw_recvtrans.cb = CkCallback(CkIndex_OptPmeXPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
00953 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00954 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_XF, call_ck_cb,
00955 &cbw_recvtrans, (char *)many_to_many_data,
00956 initdata.xBlocks, -1);
00957
00958 for ( int ib=0; ib<xBlocks; ++ib ) {
00959 int nx = block1;
00960 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00961
00962 CkArrayIndex3D index (ib,0,thisIndex.z);
00963 CProxy_OptPmePencilMapY yproxy (global_map_y);
00964 int pe = yproxy.ckLocalBranch()->procNum(0, index);
00965 CmiDirect_manytomany_initialize_send (handle, PHASE_YB, ib, 2*block1*ny*nz*ib*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
00966
00967 CmiDirect_manytomany_initialize_recv (handle, PHASE_XF, ib, ib*block1*ny*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
00968 many_to_many_nb [ib] = nx;
00969 }
00970 #endif
00971 }
00972
00973
00977
00978
00979 #include "PmeFFTLib.def.h"