00001
00002 #include "fftlib.h"
00003
00004 #include <assert.h>
00005
00006 void call_ck_cb (void *arg) {
00007 CkCallbackWrapper *cw = (CkCallbackWrapper *)arg;
00008 cw->cb.send (cw->msg);
00009 }
00010
00011 void OptPmeZPencil::fft_init() {
00012
00013
00014
00015 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00016 Node *node = nd.ckLocalBranch();
00017 SimParameters *simParams = node->simParameters;
00018
00019 int K1 = initdata.grid.K1;
00020 int K2 = initdata.grid.K2;
00021 int K3 = initdata.grid.K3;
00022 int dim3 = initdata.grid.dim3;
00023 int block1 = initdata.grid.block1;
00024 int block2 = initdata.grid.block2;
00025
00026 nx = block1;
00027 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00028 ny = block2;
00029 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00030
00031 data = new float[nx*ny*dim3];
00032 many_to_many_data = new float[nx*ny*dim3];
00033 many_to_many_nb = new int [initdata.zBlocks];
00034 work = new float[dim3];
00035
00036 memset(data, 0, sizeof(float) * nx*ny*dim3);
00037 memset(many_to_many_data, 0, sizeof(float) * nx*ny*dim3);
00038
00039 order_init(initdata.zBlocks);
00040
00041 #ifdef NAMD_FFTW
00042 CmiLock(fftw_plan_lock);
00043 #ifdef NAMD_FFTW_3
00044
00045 int numLines=nx*ny;
00046 int planLineSizes[1];
00047 planLineSizes[0]=K3;
00048 CkAbort("what are we doing in here?");
00049 forward_plan = fftwf_plan_many_dft_r2c(1, planLineSizes, numLines,
00050 (float *) data, NULL, 1,
00051 initdata.grid.dim3,
00052 (fftwf_complex *) data, NULL, 1, 0,
00053 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00054 : FFTW_MEASURE ));
00055 backward_plan = fftwf_plan_many_dft_c2r(1, planLineSizes, numLines,
00056 (fftwf_complex *) data, NULL, 1,
00057 initdata.grid.dim3/2,
00058 (float *) data, NULL, 1, 0,
00059 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00060 : FFTW_MEASURE ));
00061 #else
00062
00063 forward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_REAL_TO_COMPLEX,
00064 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00065 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00066 backward_plan = rfftwnd_create_plan_specific(1, &K3, FFTW_COMPLEX_TO_REAL,
00067 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00068 | FFTW_IN_PLACE | FFTW_USE_WISDOM, data, 1, work, 1);
00069 #endif
00070 CmiUnlock(fftw_plan_lock);
00071 #else
00072 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00073 #endif
00074
00075 handle = NULL;
00076 #if CHARM_VERSION > 60000
00077 handle = CmiDirect_manytomany_allocate_handle();
00078 #endif
00079 }
00080
00081 void OptPmeYPencil::fft_init() {
00082 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00083 Node *node = nd.ckLocalBranch();
00084 SimParameters *simParams = node->simParameters;
00085
00086
00087
00088 int K1 = initdata.grid.K1;
00089 int K2 = initdata.grid.K2;
00090 int dim2 = initdata.grid.dim2;
00091 int dim3 = initdata.grid.dim3;
00092 int block1 = initdata.grid.block1;
00093 int block3 = initdata.grid.block3;
00094
00095 nx = block1;
00096 if ( (thisIndex.x + 1) * block1 > K1 ) nx = K1 - thisIndex.x * block1;
00097 nz = block3;
00098 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00099
00100 data = new float[nx*dim2*nz*2];
00101 many_to_many_data = new float[nx*dim2*nz*2];
00102 many_to_many_nb = new int [initdata.yBlocks];
00103 work = new float[2*K2];
00104
00105 memset(many_to_many_data, 0, sizeof(float) * nx*dim2*nz*2);
00106
00107 order_init(initdata.yBlocks);
00108
00109 #ifdef NAMD_FFTW
00110 CmiLock(fftw_plan_lock);
00111 #ifdef NAMD_FFTW_3
00112
00113 int numLines=nz;
00114 int planLineSizes[2];
00115 planLineSizes[0]=initdata.grid.K2;
00116 planLineSizes[1]=nz;
00117 forward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines,
00118 (fftwf_complex *) data, NULL, nz, 1,
00119 (fftwf_complex *) data, NULL, 1, 0,
00120 FFTW_FORWARD,
00121 ( simParams->FFTWEstimate
00122 ? FFTW_ESTIMATE : FFTW_MEASURE ));
00123 backward_plan = fftwf_plan_many_dft(2, planLineSizes, numLines,
00124 (fftwf_complex *) data, NULL, nz, 1,
00125 (fftwf_complex *) data, NULL, 1, 0,
00126 FFTW_FORWARD,
00127 ( simParams->FFTWEstimate
00128 ? FFTW_ESTIMATE : FFTW_MEASURE ));
00129 #else
00130
00131 forward_plan = fftw_create_plan_specific(K2, FFTW_FORWARD,
00132 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00133 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00134 nz, (fftw_complex *) work, 1);
00135 backward_plan = fftw_create_plan_specific(K2, FFTW_BACKWARD,
00136 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00137 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00138 nz, (fftw_complex *) work, 1);
00139 #endif
00140 CmiUnlock(fftw_plan_lock);
00141 #else
00142 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00143 #endif
00144
00145 handle = NULL;
00146 #if CHARM_VERSION > 60000
00147 handle = CmiDirect_manytomany_allocate_handle();
00148 #endif
00149 initialize_manytomany();
00150 }
00151
00152
00153 void OptPmeXPencil::fft_init() {
00154 CProxy_Node nd(CkpvAccess(BOCclass_group).node);
00155 Node *node = nd.ckLocalBranch();
00156 SimParameters *simParams = node->simParameters;
00157
00158
00159
00160 lattice = simParams->lattice;
00161
00162 int K1 = initdata.grid.K1;
00163 int K2 = initdata.grid.K2;
00164 int dim3 = initdata.grid.dim3;
00165 int block2 = initdata.grid.block2;
00166 int block3 = initdata.grid.block3;
00167
00168 ny = block2;
00169 if ( (thisIndex.y + 1) * block2 > K2 ) ny = K2 - thisIndex.y * block2;
00170 nz = block3;
00171 if ( (thisIndex.z+1)*block3 > dim3/2 ) nz = dim3/2 - thisIndex.z*block3;
00172
00173 data = new float[K1*block2*block3*2];
00174 many_to_many_data = new float[K1*block2*block3*2];
00175 many_to_many_nb = new int [initdata.xBlocks];
00176 work = new float[2*K1];
00177
00178 memset(many_to_many_data, 0, sizeof(float) * K1*block2*block3*2);
00179
00180 order_init(initdata.xBlocks);
00181
00182 #ifdef NAMD_FFTW
00183 CmiLock(fftw_plan_lock);
00184 #ifdef NAMD_FFTW_3
00185
00186 int numLines=ny*nz;
00187 int planLineSizes[1];
00188 planLineSizes[0]=K1;
00189 forward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00190 (fftwf_complex *) data, NULL, K1, 1,
00191 (fftwf_complex *) data, NULL, 1, 0,
00192 FFTW_FORWARD,
00193 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00194 : FFTW_MEASURE ));
00195 backward_plan = fftwf_plan_many_dft(1, planLineSizes, numLines,
00196 (fftwf_complex *) data, NULL, K1, 1,
00197 (fftwf_complex *) data, NULL, 1, 0,
00198 FFTW_BACKWARD,
00199 ( simParams->FFTWEstimate ? FFTW_ESTIMATE
00200 : FFTW_MEASURE ));
00201 #else
00202
00203 forward_plan = fftw_create_plan_specific(K1, FFTW_FORWARD,
00204 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00205 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00206 ny*nz, (fftw_complex *) work, 1);
00207 backward_plan = fftw_create_plan_specific(K1, FFTW_BACKWARD,
00208 ( simParams->FFTWEstimate ? FFTW_ESTIMATE : FFTW_MEASURE )
00209 | FFTW_IN_PLACE | FFTW_USE_WISDOM, (fftw_complex *) data,
00210 ny*nz, (fftw_complex *) work, 1);
00211
00212 CmiUnlock(fftw_plan_lock);
00213 #endif
00214 #else
00215 NAMD_die("Sorry, FFTW must be compiled in to use PME.");
00216 #endif
00217
00218 myKSpace = new PmeKSpace(initdata.grid,
00219 thisIndex.y*block2, thisIndex.y*block2 + ny,
00220 thisIndex.z*block3, thisIndex.z*block3 + nz);
00221
00222 handle = NULL;
00223 #if CHARM_VERSION > 60000
00224 handle = CmiDirect_manytomany_allocate_handle();
00225 #endif
00226 initialize_manytomany();
00227
00228 constant_pressure = initdata.constant_pressure;
00229 }
00230
00231
00232
00233
00234 void OptPmeZPencil::recv_grid(const OptPmeGridMsg *msg) {
00235
00236 int dim3 = initdata.grid.dim3;
00237 if ( imsg == 0 ) {
00238 memset(data, 0, sizeof(float) * nx*ny*dim3);
00239 }
00240
00241 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00242 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00243 assert (xstart >= 0);
00244 assert (ystart >= 0);
00245 int xlen = msg->xlen;
00246 int ylen = msg->ylen;
00247 assert (xstart + xlen <= nx);
00248 assert (ystart + ylen <= ny);
00249
00250 float *qmsg = msg->qgrid;
00251 float *d = data;
00252 int zlen = msg->zlen;
00253 int zstart = msg->zstart;
00254 int zmax = zstart + zlen - 1;
00255 int k = 0;
00256
00257 int K3 = initdata.grid.K3;
00258 int K3_1 = initdata.grid.K3 - 1;
00259 while (zstart < 0) {
00260 zstart += K3;
00261 zmax += K3;
00262 }
00263
00264 for ( int i=xstart; i<xstart+xlen; ++i ) {
00265 for ( int j=ystart; j<ystart+ylen; ++j) {
00266 float *d = data + (i * ny + j) * dim3;
00267 for ( k=zstart; k<=zmax; ++k ) {
00268 int kz = k;
00269
00270 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00271
00272
00273 d[kz] += *(qmsg++);
00274 }
00275 }
00276 }
00277
00278 if (_iter == MANY_TO_MANY_SETUP) {
00279 if (imsg == 0)
00280 m2m_recv_grid = new PatchGridElem [grid_msgs.size()];
00281
00282 m2m_recv_grid[imsg].xstart = xstart;
00283 m2m_recv_grid[imsg].xlen = xlen;
00284 m2m_recv_grid[imsg].ystart = ystart;
00285 m2m_recv_grid[imsg].ylen = ylen;
00286 m2m_recv_grid[imsg].zstart = zstart;
00287 m2m_recv_grid[imsg].zlen = zlen;
00288 m2m_recv_grid[imsg].patchID = msg->patchID;
00289
00290 if ( imsg == grid_msgs.size() - 1)
00291 initialize_manytomany ();
00292 }
00293 }
00294
00295
00296 void OptPmeZPencil::forward_fft() {
00297 #ifdef FFTCHECK
00298 int dim3 = initdata.grid.dim3;
00299 int K3 = initdata.grid.K3;
00300 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00301 float *d = data;
00302 for ( int i=0; i<nx; ++i ) {
00303 for ( int j=0; j<ny; ++j, d += dim3 ) {
00304 for ( int k=0; k<dim3; ++k ) {
00305 d[k] = 10. * (10. * (10. * std_base + i) + j) + k;
00306 }
00307 }
00308 }
00309 #endif
00310 #ifdef NAMD_FFTW
00311 #ifdef NAMD_FFTW_3
00312 fftwf_execute(forward_plan);
00313 #else
00314 rfftwnd_real_to_complex(forward_plan, nx*ny,
00315 data, 1, initdata.grid.dim3, (fftw_complex *) work, 1, 0);
00316 #endif
00317 #endif
00318 }
00319
00320 void OptPmeZPencil::send_trans() {
00321 int zBlocks = initdata.zBlocks;
00322 int block3 = initdata.grid.block3;
00323 int dim3 = initdata.grid.dim3;
00324
00325
00326 for ( int isend=0; isend<zBlocks; ++isend ) {
00327 int kb = send_order[isend];
00328 int nz = block3;
00329 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00330
00331 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00332 msg->sourceNode = thisIndex.y;
00333 msg->nx = ny;
00334 float *md = msg->qgrid;
00335 const float *d = data;
00336 for ( int i=0; i<nx; ++i ) {
00337 for ( int j=0; j<ny; ++j, d += dim3 ) {
00338 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00339 *(md++) = d[2*k];
00340 *(md++) = d[2*k+1];
00341 }
00342 }
00343 }
00344
00345
00346 initdata.yPencil(thisIndex.x,0,kb).recvTrans(msg);
00347 }
00348 }
00349
00350
00351 void OptPmeYPencil::recv_trans(const OptPmeFFTMsg *msg) {
00352
00353 int block2 = initdata.grid.block2;
00354 int K2 = initdata.grid.K2;
00355 int jb = msg->sourceNode;
00356 int ny = msg->nx;
00357 const float *md = msg->qgrid;
00358 float *d = data;
00359 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00360 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00361 for ( int k=0; k<nz; ++k ) {
00362 d[2*(j*nz+k)] = *(md++);
00363 d[2*(j*nz+k)+1] = *(md++);
00364 }
00365 }
00366 }
00367 }
00368
00369 void OptPmeYPencil::forward_fft() {
00370 #ifdef NAMD_FFTW
00371 #ifdef NAMD_FFTW_3
00372 fftwf_execute(forward_plan);
00373 #else
00374 for ( int i=0; i<nx; ++i ) {
00375 fftw(forward_plan, nz,
00376 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00377 nz, 1, (fftw_complex *) work, 1, 0);
00378 }
00379 #endif
00380 #endif
00381 }
00382
00383 void OptPmeYPencil::send_trans() {
00384 int yBlocks = initdata.yBlocks;
00385 int block2 = initdata.grid.block2;
00386 int K2 = initdata.grid.K2;
00387
00388 for ( int isend=0; isend<yBlocks; ++isend ) {
00389 int jb = send_order[isend];
00390 int ny = block2;
00391 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00392 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00393 msg->sourceNode = thisIndex.x;
00394 msg->nx = nx;
00395 float *md = msg->qgrid;
00396 const float *d = data;
00397 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00398 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00399 for ( int k=0; k<nz; ++k ) {
00400 *(md++) = d[2*(j*nz+k)];
00401 *(md++) = d[2*(j*nz+k)+1];
00402 }
00403 }
00404 }
00405 if ( md != msg->qgrid + nx*ny*nz*2 ) CkPrintf("error in YX at %d %d %d\n",
00406 thisIndex.x, jb, thisIndex.z);
00407
00408
00409 initdata.xPencil(0,jb,thisIndex.z).recvTrans(msg);
00410 }
00411 }
00412
00413
00414
00415 void OptPmeXPencil::recv_trans(const OptPmeFFTMsg *msg) {
00416
00417 int block1 = initdata.grid.block1;
00418 int K1 = initdata.grid.K1;
00419 int ib = msg->sourceNode;
00420 int nx = msg->nx;
00421 const float *md = msg->qgrid;
00422 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00423 float *d = data + i*ny*nz*2;
00424 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00425 for ( int k=0; k<nz; ++k ) {
00426 d[2*k] = *(md++);
00427 d[2*k+1] = *(md++);
00428 }
00429 }
00430 }
00431 }
00432
00433
00434
00435 void OptPmeXPencil::forward_fft() {
00436 #ifdef NAMD_FFTW
00437 #ifdef NAMD_FFTW_3
00438 fftwf_execute(forward_plan);
00439 #else
00440
00441 fftw(forward_plan, ny*nz,
00442 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00443 #endif
00444 #endif
00445 }
00446
00447 void OptPmeXPencil::pme_kspace() {
00448
00449 evir = 0.;
00450
00451 #ifdef FFTCHECK
00452 return;
00453 #endif
00454
00455 BigReal ewaldcof = ComputeNonbondedUtil::ewaldcof;
00456 evir[0] = myKSpace->compute_energy(data,
00457 lattice, ewaldcof, &(evir[1]));
00458
00459 contribute (7*sizeof(double), evir.begin(), CkReduction::sum_double, initdata.cb_energy);
00460
00461 }
00462
00463 void OptPmeXPencil::backward_fft() {
00464 #ifdef NAMD_FFTW
00465 #ifdef NAMD_FFTW_3
00466 fftwf_execute(backward_plan);
00467 #else
00468
00469 fftw(backward_plan, ny*nz,
00470 ((fftw_complex *) data), ny*nz, 1, (fftw_complex *) work, 1, 0);
00471 #endif
00472 #endif
00473 }
00474
00475 void OptPmeXPencil::send_untrans() {
00476 int xBlocks = initdata.xBlocks;
00477 int block1 = initdata.grid.block1;
00478 int K1 = initdata.grid.K1;
00479
00480 for ( int isend=0; isend<xBlocks; ++isend ) {
00481 int ib = send_order[isend];
00482 int nx = block1;
00483 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00484 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00485 msg->sourceNode = thisIndex.y;
00486 msg->nx = ny;
00487 float *md = msg->qgrid;
00488 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00489 float *d = data + i*ny*nz*2;
00490 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00491 for ( int k=0; k<nz; ++k ) {
00492 *(md++) = d[2*k];
00493 *(md++) = d[2*k+1];
00494 }
00495 }
00496 }
00497
00498
00499 initdata.yPencil(ib,0,thisIndex.z).recvUntrans(msg);
00500 }
00501 }
00502
00503
00504 void OptPmeYPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00505
00506 int block2 = initdata.grid.block2;
00507 int K2 = initdata.grid.K2;
00508 int jb = msg->sourceNode;
00509 int ny = msg->nx;
00510 const float *md = msg->qgrid;
00511 float *d = data;
00512 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00513 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00514 for ( int k=0; k<nz; ++k ) {
00515 d[2*(j*nz+k)] = *(md++);
00516 d[2*(j*nz+k)+1] = *(md++);
00517 }
00518 }
00519 }
00520 }
00521
00522
00523
00524 void OptPmeYPencil::backward_fft() {
00525 #ifdef NAMD_FFTW
00526 #ifdef NAMD_FFTW_3
00527 fftwf_execute(backward_plan);
00528 #else
00529 for ( int i=0; i<nx; ++i ) {
00530 #if CMK_BLUEGENEL
00531 CmiNetworkProgress();
00532 #endif
00533
00534 fftw(backward_plan, nz,
00535 ((fftw_complex *) data) + i * nz * initdata.grid.K2,
00536 nz, 1, (fftw_complex *) work, 1, 0);
00537 }
00538 #endif
00539 #endif
00540 }
00541
00542 void OptPmeYPencil::send_untrans() {
00543
00544
00545 int yBlocks = initdata.yBlocks;
00546 int block2 = initdata.grid.block2;
00547 int K2 = initdata.grid.K2;
00548
00549 for ( int isend=0; isend<yBlocks; ++isend ) {
00550 int jb = send_order[isend];
00551
00552 int ny = block2;
00553 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00554 OptPmeFFTMsg *msg = new (nx*ny*nz*2,PRIORITY_SIZE) OptPmeFFTMsg;
00555 msg->sourceNode = thisIndex.z;
00556 msg->nx = nz;
00557 float *md = msg->qgrid;
00558 const float *d = data;
00559 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00560 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00561 for ( int k=0; k<nz; ++k ) {
00562 *(md++) = d[2*(j*nz+k)];
00563 *(md++) = d[2*(j*nz+k)+1];
00564 }
00565 }
00566 }
00567
00568
00569 initdata.zPencil(thisIndex.x,jb,0).recvUntrans(msg);
00570 }
00571 }
00572
00573 void OptPmeZPencil::recv_untrans(const OptPmeFFTMsg *msg) {
00574
00575
00576
00577 int block3 = initdata.grid.block3;
00578 int dim3 = initdata.grid.dim3;
00579 int kb = msg->sourceNode;
00580 int nz = msg->nx;
00581 const float *md = msg->qgrid;
00582 float *d = data;
00583 for ( int i=0; i<nx; ++i ) {
00584 for ( int j=0; j<ny; ++j, d += dim3 ) {
00585 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00586 d[2*k] = *(md++);
00587 d[2*k+1] = *(md++);
00588 }
00589 }
00590 }
00591 }
00592
00593
00594 void OptPmeZPencil::backward_fft() {
00595 #ifdef NAMD_FFTW
00596 #ifdef NAMD_FFTW_3
00597 fftwf_execute(backward_plan);
00598 #else
00599 rfftwnd_complex_to_real(backward_plan, nx*ny,
00600 (fftw_complex *) data, 1, initdata.grid.dim3/2, work, 1, 0);
00601 #endif
00602 #endif
00603
00604 #if CMK_BLUEGENEL
00605 CmiNetworkProgress();
00606 #endif
00607
00608 #ifdef FFTCHECK
00609 int dim3 = initdata.grid.dim3;
00610 int K1 = initdata.grid.K1;
00611 int K2 = initdata.grid.K2;
00612 int K3 = initdata.grid.K3;
00613 float scale = 1. / (1. * K1 * K2 * K3);
00614 float maxerr = 0.;
00615 float maxstd = 0.;
00616 int mi, mj, mk; mi = mj = mk = -1;
00617 float std_base = 100. * (thisIndex.x+1.) + 10. * (thisIndex.y+1.);
00618 const float *d = data;
00619 for ( int i=0; i<nx; ++i ) {
00620 for ( int j=0; j<ny; ++j, d += dim3 ) {
00621 for ( int k=0; k<K3; ++k ) {
00622 float std = 10. * (10. * (10. * std_base + i) + j) + k;
00623 float err = scale * d[k] - std;
00624 if ( fabsf(err) > fabsf(maxerr) ) {
00625 maxerr = err;
00626 maxstd = std;
00627 mi = i; mj = j; mk = k;
00628 }
00629 }
00630 }
00631 }
00632 CkPrintf("pencil %d %d max error %f at %d %d %d (should be %f)\n",
00633 thisIndex.x, thisIndex.y, maxerr, mi, mj, mk, maxstd);
00634 #endif
00635 }
00636
00637 void OptPmeZPencil::send_ungrid(OptPmeGridMsg *msg) {
00638 int pe = msg->sourceNode;
00639 msg->sourceNode = thisIndex.x * initdata.yBlocks + thisIndex.y;
00640
00641 int dim3 = initdata.grid.dim3;
00642 int xstart = msg->xstart - thisIndex.x * initdata.grid.block1;
00643 int ystart = msg->ystart - thisIndex.y * initdata.grid.block2;
00644 assert (xstart >= 0);
00645 assert (ystart >= 0);
00646 int xlen = msg->xlen;
00647 int ylen = msg->ylen;
00648 assert (xstart + xlen <= nx);
00649 assert (ystart + ylen <= ny);
00650
00651 float *qmsg = msg->qgrid;
00652 float *d = data;
00653 int zstart = msg->zstart;
00654 int zlen = msg->zlen;
00655 int zmax = zstart + zlen - 1;
00656
00657 int K3_1 = initdata.grid.K3 - 1;
00658 int K3 = initdata.grid.K3;
00659 if (zstart < 0) {
00660 zstart += K3;
00661 zmax += K3;
00662 }
00663
00664 int k = 0;
00665 for ( int i=xstart; i<xstart+xlen; ++i ) {
00666 for ( int j=ystart; j<ystart+ylen; ++j) {
00667 float *d = data + (i * ny + j) * dim3;
00668 for ( k=zstart; k<=zmax; ++k ) {
00669 int kz = k;
00670
00671 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00672 *(qmsg++) = d[kz];
00673 }
00674 }
00675 }
00676
00677 initdata.pmeProxy[pe].recvUngrid(msg);
00678 }
00679
00680
00684
00685 void OptPmeZPencil::many_to_many_recv_grid () {
00686 int dim3 = initdata.grid.dim3;
00687 memset(data, 0, sizeof(float) * nx*ny*dim3);
00688 int K3 = initdata.grid.K3;
00689 int K3_1 = initdata.grid.K3 - 1;
00690 int k = 0;
00691 for (int idx = 0; idx < grid_msgs.size(); idx++) {
00692 int xstart = m2m_recv_grid[idx].xstart;
00693 int xlen = m2m_recv_grid[idx].xlen;
00694 int ystart = m2m_recv_grid[idx].ystart;
00695 int ylen = m2m_recv_grid[idx].ylen;
00696 int zstart = m2m_recv_grid[idx].zstart;
00697 int zlen = m2m_recv_grid[idx].zlen;
00698
00699 float *qmsg = m2m_recv_grid[idx].data;
00700 for ( int i=xstart; i<xstart+xlen; ++i ) {
00701 for ( int j=ystart; j<ystart+ylen; ++j) {
00702 float *d = data + (i * ny + j) * dim3;
00703
00704 #pragma disjoint (*qmsg, *d)
00705 #pragma unroll (4)
00706 for ( k=zstart; k<zstart+zlen; ++k ) {
00707 int kz = k;
00708
00709 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00710
00711
00712 d[kz] += *(qmsg++);
00713 }
00714 }
00715 }
00716 }
00717 }
00718
00719 void OptPmeZPencil::many_to_many_send_trans() {
00720 int zBlocks = initdata.zBlocks;
00721 int block3 = initdata.grid.block3;
00722 int dim3 = initdata.grid.dim3;
00723
00724 float *md = many_to_many_data;
00725 int nz = block3;
00726 for ( int kb=0; kb<zBlocks; ++kb ) {
00727 nz = block3;
00728 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00729
00730 const float *d = data;
00731 for ( int i=0; i<nx; ++i ) {
00732 for ( int j=0; j<ny; ++j, d += dim3 ) {
00733 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00734 *(md++) = d[2*k];
00735 *(md++) = d[2*k+1];
00736 }
00737 }
00738 }
00739 }
00740
00741 #if CHARM_VERSION > 60000
00742 CmiDirect_manytomany_start (handle, PHASE_YF);
00743 #endif
00744 }
00745
00746 void OptPmeYPencil::many_to_many_recv_trans () {
00747 int block2 = initdata.grid.block2;
00748 int K2 = initdata.grid.K2;
00749
00750 const float *md = many_to_many_data;
00751 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00752 int ny = many_to_many_nb[jb];
00753 float *d = data;
00754 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00755 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00756 for ( int k=0; k<nz; ++k ) {
00757 d[2*(j*nz+k)] = *(md++);
00758 d[2*(j*nz+k)+1] = *(md++);
00759 }
00760 }
00761 }
00762 }
00763 }
00764
00765 void OptPmeYPencil::many_to_many_send(int phase) {
00766 int yBlocks = initdata.yBlocks;
00767 int block2 = initdata.grid.block2;
00768 int K2 = initdata.grid.K2;
00769
00770 float *md = many_to_many_data;
00771 int ny = block2;
00772 for ( int jb=0; jb<yBlocks; ++jb ) {
00773 ny = block2;
00774 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
00775
00776 const float *d = data;
00777 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00778 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00779 for ( int k=0; k<nz; ++k ) {
00780 *(md++) = d[2*(j*nz+k)];
00781 *(md++) = d[2*(j*nz+k)+1];
00782 }
00783 }
00784 }
00785 }
00786
00787 #if CHARM_VERSION > 60000
00788 CmiDirect_manytomany_start (handle, phase);
00789 #endif
00790 }
00791
00792 void OptPmeXPencil::many_to_many_recv_trans () {
00793
00794 int block1 = initdata.grid.block1;
00795 int K1 = initdata.grid.K1;
00796
00797 const float *md = many_to_many_data;
00798 for (int ib =0; ib < initdata.xBlocks; ib++ ) {
00799 int nx = many_to_many_nb[ib];
00800 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00801 float *d = data + i*ny*nz*2;
00802 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00803 for ( int k=0; k<nz; ++k ) {
00804 d[2*k] = *(md++);
00805 d[2*k+1] = *(md++);
00806 }
00807 }
00808 }
00809 }
00810 }
00811
00812 void OptPmeXPencil::many_to_many_send_untrans() {
00813 int xBlocks = initdata.xBlocks;
00814 int block1 = initdata.grid.block1;
00815 int K1 = initdata.grid.K1;
00816
00817 int nx = block1;
00818 float * md = many_to_many_data;
00819 for ( int ib=0; ib<xBlocks; ++ib ) {
00820 nx = block1;
00821 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
00822
00823 for ( int i=ib*block1; i<(ib*block1+nx); ++i ) {
00824 float *d = data + i*ny*nz*2;
00825 for ( int j=0; j<ny; ++j, d += nz*2 ) {
00826 for ( int k=0; k<nz; ++k ) {
00827 *(md++) = d[2*k];
00828 *(md++) = d[2*k+1];
00829 }
00830 }
00831 }
00832 }
00833 #if CHARM_VERSION > 60000
00834 CmiDirect_manytomany_start (handle, PHASE_YB);
00835 #endif
00836 }
00837
00838 void OptPmeYPencil::many_to_many_recv_untrans () {
00839 int block2 = initdata.grid.block2;
00840 int K2 = initdata.grid.K2;
00841
00842 const float *md = many_to_many_data;
00843 for (int jb = 0; jb < initdata.yBlocks; jb++ ) {
00844 int ny = many_to_many_nb[jb];
00845 float *d = data;
00846 for ( int i=0; i<nx; ++i, d += K2*nz*2 ) {
00847 for ( int j=jb*block2; j<(jb*block2+ny); ++j ) {
00848 for ( int k=0; k<nz; ++k ) {
00849 d[2*(j*nz+k)] = *(md++);
00850 d[2*(j*nz+k)+1] = *(md++);
00851 }
00852 }
00853 }
00854 }
00855 }
00856
00857 void OptPmeZPencil::many_to_many_recv_untrans() {
00858
00859 int block3 = initdata.grid.block3;
00860 int dim3 = initdata.grid.dim3;
00861
00862 const float *md = many_to_many_data;
00863 for (int kb = 0; kb < initdata.zBlocks; kb++ ) {
00864 int nz = many_to_many_nb[kb];
00865 float *d = data;
00866 for ( int i=0; i<nx; ++i ) {
00867 for ( int j=0; j<ny; ++j, d += dim3 ) {
00868 for ( int k=kb*block3; k<(kb*block3+nz); ++k ) {
00869 d[2*k] = *(md++);
00870 d[2*k+1] = *(md++);
00871 }
00872 }
00873 }
00874 }
00875 }
00876
00877 void OptPmeZPencil::many_to_many_send_ungrid () {
00878 int dim3 = initdata.grid.dim3;
00879 int K3 = initdata.grid.K3;
00880 int K3_1 = initdata.grid.K3 - 1;
00881 int k = 0;
00882 for (int idx = 0; idx < grid_msgs.size(); idx++) {
00883 int xstart = m2m_recv_grid[idx].xstart;
00884 int xlen = m2m_recv_grid[idx].xlen;
00885 int ystart = m2m_recv_grid[idx].ystart;
00886 int ylen = m2m_recv_grid[idx].ylen;
00887 int zstart = m2m_recv_grid[idx].zstart;
00888 int zlen = m2m_recv_grid[idx].zlen;
00889
00890 float *qmsg = m2m_recv_grid[idx].data;
00891 for ( int i=xstart; i<xstart+xlen; ++i ) {
00892 for ( int j=ystart; j<ystart+ylen; ++j) {
00893 float *d = data + (i * ny + j) * dim3;
00894
00895 #pragma disjoint (*d, *qmsg)
00896 #pragma unroll (4)
00897 for ( k=zstart; k<zstart+zlen; ++k ) {
00898 int kz = k;
00899
00900 kz = kz - ((unsigned)(K3_1 - kz)>>31)*K3;
00901
00902
00903 *(qmsg++) = d[kz];
00904 }
00905 }
00906 }
00907 }
00908
00909 #if CHARM_VERSION > 60000
00910 CmiDirect_manytomany_start (handle, PHASE_UG);
00911 #endif
00912 }
00913
00914
00915
00916 void OptPmeZPencil::initialize_manytomany () {
00917 #if CHARM_VERSION > 60000
00918 int idx = 0;
00919 int totalcount = 0;
00920 for (idx = 0; idx < grid_msgs.size(); idx ++)
00921 totalcount += m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
00922 many_to_many_gr_data = new float [totalcount];
00923
00924 CkArrayIndex3D aidx (thisIndex.x, thisIndex.y, thisIndex.z);
00925 cbw_recvgrid.cb = CkCallback (CkIndex_OptPmeZPencil::many_to_manyRecvGrid(NULL), aidx, thisProxy.ckGetArrayID());
00926 cbw_recvgrid.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00927 PatchMap *patchMap = PatchMap::Object();
00928 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_GR,
00929 call_ck_cb, &cbw_recvgrid,
00930 (char *)many_to_many_gr_data, patchMap->numPatches(), -1);
00931
00932 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_UG, NULL, NULL, (char *)many_to_many_gr_data,
00933 grid_msgs.size(), thisIndex.x *initdata.yBlocks + thisIndex.y);
00934
00935 int offset = 0;
00936 for (idx = 0; idx < grid_msgs.size(); idx ++) {
00937 m2m_recv_grid[idx].data = (float *) ((char *)many_to_many_gr_data + offset);
00938 int fcount = m2m_recv_grid[idx].xlen * m2m_recv_grid[idx].ylen * m2m_recv_grid[idx].zlen;
00939 CmiDirect_manytomany_initialize_recv (handle, PHASE_GR, m2m_recv_grid[idx].patchID,
00940 offset, fcount *sizeof(float), patchMap->node(m2m_recv_grid[idx].patchID));
00941 CmiDirect_manytomany_initialize_send (handle, PHASE_UG, idx, offset, fcount *sizeof(float),
00942 patchMap->node(m2m_recv_grid[idx].patchID));
00943 offset += fcount * sizeof(float);
00944 }
00945
00946 int zBlocks = initdata.zBlocks;
00947 int block3 = initdata.grid.block3;
00948 int dim3 = initdata.grid.dim3;
00949
00950
00951 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YF, NULL, NULL, (char *)many_to_many_data, zBlocks, thisIndex.y);
00952
00953
00954 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeZPencil::many_to_manyRecvUntrans(NULL), aidx, thisProxy.ckGetArrayID());
00955 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00956 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_ZB, call_ck_cb, &cbw_recvuntrans,
00957 (char *)many_to_many_data, initdata.zBlocks, -1);
00958
00959 for ( int kb=0; kb<zBlocks; ++kb ) {
00960 int nz = block3;
00961 if ( (kb+1)*block3 > dim3/2 ) nz = dim3/2 - kb*block3;
00962
00963
00964 CkArrayIndex3D index (thisIndex.x,0,kb);
00965 CProxy_OptPmePencilMapY yproxy (global_map_y);
00966 int pe = yproxy.ckLocalBranch()->procNum(0, index);
00967 CmiDirect_manytomany_initialize_send (handle, PHASE_YF, kb, kb*2*nx*ny*block3*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
00968
00969
00970 CmiDirect_manytomany_initialize_recv (handle, PHASE_ZB, kb, kb*nx*ny*block3*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
00971 many_to_many_nb [kb] = nz;
00972 }
00973 #endif
00974 }
00975
00976 void OptPmeYPencil::initialize_manytomany () {
00977 #if CHARM_VERSION > 60000
00978 int yBlocks = initdata.yBlocks;
00979 int block2 = initdata.grid.block2;
00980 int K2 = initdata.grid.K2;
00981
00982
00983 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_XF, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.x);
00984
00985 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_ZB, NULL, NULL, (char *)many_to_many_data, yBlocks, thisIndex.z);
00986
00987
00988 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
00989 cbw_recvtrans.cb = CkCallback (CkIndex_OptPmeYPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
00990 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00991 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YF, call_ck_cb, &cbw_recvtrans,
00992 (char *)many_to_many_data, initdata.yBlocks, -1);
00993
00994
00995 cbw_recvuntrans.cb = CkCallback(CkIndex_OptPmeYPencil::many_to_manyRecvUntrans(NULL), idx, thisProxy.ckGetArrayID());
00996 cbw_recvuntrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
00997 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_YB, call_ck_cb,
00998 &cbw_recvuntrans, (char *)many_to_many_data, initdata.yBlocks, -1);
00999
01000 for ( int jb=0; jb<yBlocks; ++jb ) {
01001 int ny = block2;
01002 if ( (jb+1)*block2 > K2 ) ny = K2 - jb*block2;
01003
01004
01005 CkArrayIndex3D index (thisIndex.x,jb,0);
01006 CProxy_OptPmePencilMapZ zproxy (global_map_z);
01007 int pe = zproxy.ckLocalBranch()->procNum(0, index);
01008 CmiDirect_manytomany_initialize_send (handle, PHASE_ZB, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01009
01010
01011 CmiDirect_manytomany_initialize_recv (handle, PHASE_YF, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01012
01013
01014 index = CkArrayIndex3D (0,jb,thisIndex.z);
01015 CProxy_OptPmePencilMapX xproxy (global_map_x);
01016 pe = xproxy.ckLocalBranch()->procNum(0, index);
01017 CmiDirect_manytomany_initialize_send (handle, PHASE_XF, jb, 2*nx*block2*nz*jb*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01018
01019
01020 CmiDirect_manytomany_initialize_recv (handle, PHASE_YB, jb, jb*nx*block2*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01021
01022 many_to_many_nb [jb] = ny;
01023 }
01024 #endif
01025 }
01026
01027 void OptPmeXPencil::initialize_manytomany () {
01028 #if CHARM_VERSION > 60000
01029 int xBlocks = initdata.xBlocks;
01030 int block1 = initdata.grid.block1;
01031 int K1 = initdata.grid.K1;
01032
01033 CmiDirect_manytomany_initialize_sendbase (handle, PHASE_YB, NULL, NULL, (char *)many_to_many_data, xBlocks, thisIndex.y);
01034 CkArrayIndex3D idx (thisIndex.x, thisIndex.y, thisIndex.z);
01035 cbw_recvtrans.cb = CkCallback(CkIndex_OptPmeXPencil::many_to_manyRecvTrans(NULL), idx, thisProxy.ckGetArrayID());
01036 cbw_recvtrans.msg = new (PRIORITY_SIZE) OptPmeDummyMsg;
01037 CmiDirect_manytomany_initialize_recvbase (handle, PHASE_XF, call_ck_cb,
01038 &cbw_recvtrans, (char *)many_to_many_data,
01039 initdata.xBlocks, -1);
01040
01041 for ( int ib=0; ib<xBlocks; ++ib ) {
01042 int nx = block1;
01043 if ( (ib+1)*block1 > K1 ) nx = K1 - ib*block1;
01044
01045 CkArrayIndex3D index (ib,0,thisIndex.z);
01046 CProxy_OptPmePencilMapY yproxy (global_map_y);
01047 int pe = yproxy.ckLocalBranch()->procNum(0, index);
01048 CmiDirect_manytomany_initialize_send (handle, PHASE_YB, ib, 2*block1*ny*nz*ib*sizeof(float), 2*nx*ny*nz*sizeof(float), pe);
01049
01050 CmiDirect_manytomany_initialize_recv (handle, PHASE_XF, ib, ib*block1*ny*nz*2*sizeof(float), nx*ny*nz*2*sizeof(float), pe);
01051 many_to_many_nb [ib] = nx;
01052 }
01053 #endif
01054 }
01055
01056
01060
01061
01062 #include "PmeFFTLib.def.h"