msm_longrng.c

Go to the documentation of this file.
00001 
00041 #include "msm_defn.h"
00042 
00043 static int anterpolation(NL_Msm *);
00044 static int interpolation(NL_Msm *);
00045 static int restriction(NL_Msm *, int level);
00046 static int prolongation(NL_Msm *, int level);
00047 static int restriction_factored(NL_Msm *, int level);
00048 static int prolongation_factored(NL_Msm *, int level);
00049 static int gridcutoff(NL_Msm *, int level);
00050 
00051 
00052 int NL_msm_compute_long_range(NL_Msm *msm) {
00053   double time_delta;
00054   int rc = 0;
00055   int level;
00056   int nlevels = msm->nlevels;
00057   int use_cuda_gridcutoff = (msm->msmflags & NL_MSM_COMPUTE_CUDA_GRID_CUTOFF);
00058   int fallback_cpu = (msm->msmflags & NL_MSM_COMPUTE_CUDA_FALL_BACK);
00059   int use_nonfactored = (msm->msmflags & NL_MSM_COMPUTE_NONFACTORED);
00060 
00061   wkf_timer_start(msm->timer_longrng);
00062   rc = anterpolation(msm);
00063   if (rc) return rc;
00064   wkf_timer_stop(msm->timer_longrng);
00065   time_delta = wkf_timer_time(msm->timer_longrng);
00066   if (msm->report_timings) {
00067     printf("MSM anterpolation:  %6.3f sec\n", time_delta);
00068   }
00069 
00070   wkf_timer_start(msm->timer_longrng);
00071   if (use_nonfactored) {
00072     for (level = 0;  level < nlevels - 1;  level++) {
00073       rc = restriction(msm, level);
00074       if (rc) return rc;
00075     }
00076   }
00077   else {
00078     for (level = 0;  level < nlevels - 1;  level++) {
00079       rc = restriction_factored(msm, level);
00080       if (rc) return rc;
00081     }
00082   }
00083   wkf_timer_stop(msm->timer_longrng);
00084   time_delta = wkf_timer_time(msm->timer_longrng);
00085   if (msm->report_timings) {
00086     printf("MSM restriction:    %6.3f sec\n", time_delta);
00087   }
00088 
00089   wkf_timer_start(msm->timer_longrng);
00090   if (use_cuda_gridcutoff && nlevels > 1) {
00091 #ifdef NL_USE_CUDA
00092     if ((rc = NL_msm_cuda_condense_qgrids(msm)) != NL_MSM_SUCCESS ||
00093         (rc = NL_msm_cuda_compute_gridcutoff(msm)) != NL_MSM_SUCCESS ||
00094         (rc = NL_msm_cuda_expand_egrids(msm)) != NL_MSM_SUCCESS) {
00095       if (fallback_cpu) {
00096         printf("Falling back on CPU for grid cutoff computation\n");
00097         use_cuda_gridcutoff = 0;
00098       }
00099       else return rc;
00100     }
00101 #else
00102     if (fallback_cpu) {
00103       printf("Falling back on CPU for grid cutoff computation\n");
00104       use_cuda_gridcutoff = 0;
00105     }
00106     else return NL_MSM_ERROR_SUPPORT;
00107 #endif
00108   }
00109 
00110   if ( ! use_cuda_gridcutoff ) {
00111     for (level = 0;  level < nlevels - 1;  level++) {
00112       rc = gridcutoff(msm, level);
00113       if (rc) return rc;
00114     }
00115   }
00116 
00117   rc = gridcutoff(msm, level);  /* top level */
00118   if (rc) return rc;
00119 
00120   wkf_timer_stop(msm->timer_longrng);
00121   time_delta = wkf_timer_time(msm->timer_longrng);
00122   if (msm->report_timings) {
00123     printf("MSM grid cutoff:    %6.3f sec\n", time_delta);
00124   }
00125 
00126   wkf_timer_start(msm->timer_longrng);
00127   if (use_nonfactored) {
00128     for (level--;  level >= 0;  level--) {
00129       rc = prolongation(msm, level);
00130       if (rc) return rc;
00131     }
00132   }
00133   else {
00134     for (level--;  level >= 0;  level--) {
00135       rc = prolongation_factored(msm, level);
00136       if (rc) return rc;
00137     }
00138   }
00139   wkf_timer_stop(msm->timer_longrng);
00140   time_delta = wkf_timer_time(msm->timer_longrng);
00141   if (msm->report_timings) {
00142     printf("MSM prolongation:   %6.3f sec\n", time_delta);
00143   }
00144 
00145   wkf_timer_start(msm->timer_longrng);
00146   rc = interpolation(msm);
00147   if (rc) return rc;
00148   wkf_timer_stop(msm->timer_longrng);
00149   time_delta = wkf_timer_time(msm->timer_longrng);
00150   if (msm->report_timings) {
00151     printf("MSM interpolation:  %6.3f sec\n", time_delta);
00152   }
00153 
00154   return 0;
00155 }
00156 
00157 
00159 enum { MAX_POLY_DEGREE = 9 };
00160 
00163 static const int PolyDegree[NL_MSM_APPROX_END] = {
00164   3, 5, 5, 7, 7, 9, 9, 3,
00165 };
00166 
00169 static const double
00170 PhiStencilFactored[NL_MSM_APPROX_END][2*MAX_POLY_DEGREE+1] = {
00171   /* cubic */
00172   {-1./16, 0, 9./16, 1, 9./16, 0, -1./16},
00173 
00174   /* quintic C1 */
00175   {3./256, 0, -25./256, 0, 75./128, 1, 75./128, 0, -25./256, 0, 3./256},
00176 
00177   /* quintic C2  (same as quintic C1) */
00178   {3./256, 0, -25./256, 0, 75./128, 1, 75./128, 0, -25./256, 0, 3./256},
00179 
00180   /* septic C1 */
00181   { -5./2048, 0, 49./2048, 0, -245./2048, 0, 1225./2048, 1, 1225./2048,
00182     0, -245./2048, 0, 49./2048, 0, -5./2048 },
00183 
00184   /* septic C3  (same as septic C3) */
00185   { -5./2048, 0, 49./2048, 0, -245./2048, 0, 1225./2048, 1, 1225./2048,
00186     0, -245./2048, 0, 49./2048, 0, -5./2048 },
00187 
00188   /* nonic C1 */
00189   { 35./65536, 0, -405./65536, 0, 567./16384, 0, -2205./16384, 0,
00190     19845./32768, 1, 19845./32768, 0, -2205./16384, 0, 567./16384, 0,
00191     -405./65536, 0, 35./65536 },
00192 
00193   /* nonic C4  (same as nonic C1) */
00194   { 35./65536, 0, -405./65536, 0, 567./16384, 0, -2205./16384, 0,
00195     19845./32768, 1, 19845./32768, 0, -2205./16384, 0, 567./16384, 0,
00196     -405./65536, 0, 35./65536 },
00197 
00198   /* bspline */
00199   { 1./48, 1./6, 23./48, 2./3, 23./48, 1./6, 1./48 },
00200 };
00201 
00202 
00205 enum { MAX_NSTENCIL = 11 };
00206 
00208 static const int Nstencil[NL_MSM_APPROX_END] = {
00209   5, 7, 7, 9, 9, 11, 11, 7
00210 };
00211 
00214 static const int IndexOffset[NL_MSM_APPROX_END][MAX_NSTENCIL] = {
00215   /* cubic */
00216   {-3, -1, 0, 1, 3},
00217 
00218   /* quintic C1 */
00219   {-5, -3, -1, 0, 1, 3, 5},
00220 
00221   /* quintic C2  (same as quintic C1) */
00222   {-5, -3, -1, 0, 1, 3, 5},
00223 
00224   /* septic C1 */
00225   {-7, -5, -3, -1, 0, 1, 3, 5, 7},
00226 
00227   /* septic C3  (same as septic C3) */
00228   {-7, -5, -3, -1, 0, 1, 3, 5, 7},
00229 
00230   /* nonic C1 */
00231   {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
00232 
00233   /* nonic C4  (same as nonic C1) */
00234   {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
00235 
00236   /* bspline */
00237   {-3, -2, -1, 0, 1, 2, 3},
00238 };
00239 
00242 static const double PhiStencil[NL_MSM_APPROX_END][MAX_NSTENCIL] = {
00243   /* cubic */
00244   {-1./16, 9./16, 1, 9./16, -1./16},
00245 
00246   /* quintic C1 */
00247   {3./256, -25./256, 75./128, 1, 75./128, -25./256, 3./256},
00248 
00249   /* quintic C2  (same as quintic C1) */
00250   {3./256, -25./256, 75./128, 1, 75./128, -25./256, 3./256},
00251 
00252   /* septic C1 */
00253   { -5./2048, 49./2048, -245./2048, 1225./2048, 1, 1225./2048,
00254     -245./2048, 49./2048, -5./2048 },
00255 
00256   /* septic C3  (same as septic C3) */
00257   { -5./2048, 49./2048, -245./2048, 1225./2048, 1, 1225./2048,
00258     -245./2048, 49./2048, -5./2048 },
00259 
00260   /* nonic C1 */
00261   { 35./65536, -405./65536, 567./16384, -2205./16384, 
00262     19845./32768, 1, 19845./32768, -2205./16384, 567./16384, 
00263     -405./65536, 35./65536 },
00264 
00265   /* nonic C4  (same as nonic C1) */
00266   { 35./65536, -405./65536, 567./16384, -2205./16384, 
00267     19845./32768, 1, 19845./32768, -2205./16384, 567./16384, 
00268     -405./65536, 35./65536 },
00269 
00270   /* bspline */
00271   { 1./48, 1./6, 23./48, 2./3, 23./48, 1./6, 1./48 },
00272 };
00273 
00274 
00280 #define STENCIL_1D(_phi, _delta, _approx) \
00281   do { \
00282     double *phi = _phi; \
00283     double t = _delta; \
00284     switch (_approx) { \
00285       case NL_MSM_APPROX_CUBIC: \
00286         phi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
00287         t--; \
00288         phi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
00289         t--; \
00290         phi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
00291         t--; \
00292         phi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
00293         break; \
00294       case NL_MSM_APPROX_QUINTIC: \
00295         phi[0] = (1./24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t); \
00296         t--; \
00297         phi[1] = (1-t) * (2-t) * (3-t) * ((1./6) + t * (0.375 - (5./24)*t)); \
00298         t--; \
00299         phi[2] = (1-t*t) * (2-t) * (0.5 + t * (0.25 - (5./12)*t)); \
00300         t--; \
00301         phi[3] = (1-t*t) * (2+t) * (0.5 - t * (0.25 + (5./12)*t)); \
00302         t--; \
00303         phi[4] = (1+t) * (2+t) * (3+t) * ((1./6) - t * (0.375 + (5./24)*t)); \
00304         t--; \
00305         phi[5] = (1./24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t); \
00306         break; \
00307       case NL_MSM_APPROX_QUINTIC2: \
00308         phi[0] = (1./24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8); \
00309         t--; \
00310         phi[1] = (-1./24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25))); \
00311         t--; \
00312         phi[2] = (1./12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25)))); \
00313         t--; \
00314         phi[3] = (1./12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25)))); \
00315         t--; \
00316         phi[4] = (-1./24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25))); \
00317         t--; \
00318         phi[5] = (1./24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8); \
00319         break; \
00320       case NL_MSM_APPROX_SEPTIC: \
00321         phi[0] = (-1./720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6); \
00322         t--; \
00323         phi[1] = (1./720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t)); \
00324         t--; \
00325         phi[2] = (-1./240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t)); \
00326         t--; \
00327         phi[3] = (1./144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t)); \
00328         t--; \
00329         phi[4] = (-1./144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t)); \
00330         t--; \
00331         phi[5] = (1./240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t)); \
00332         t--; \
00333         phi[6] = (-1./720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t)); \
00334         t--; \
00335         phi[7] = (1./720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6); \
00336         break; \
00337       case NL_MSM_APPROX_SEPTIC3: \
00338         phi[0] = (3632./5) + t*((-7456./5) + t*((58786./45) + t*(-633 \
00339                 + t*((26383./144) + t*((-22807./720) + t*((727./240) \
00340                       + t*(-89./720))))))); \
00341         t--; \
00342         phi[1] = -440 + t*((25949./20) + t*((-117131./72) + t*((2247./2) \
00343                 + t*((-66437./144) + t*((81109./720) + t*((-727./48) \
00344                       + t*(623./720))))))); \
00345         t--; \
00346         phi[2] = (138./5) + t*((-8617./60) + t*((12873./40) + t*((-791./2) \
00347                 + t*((4557./16) + t*((-9583./80) + t*((2181./80) \
00348                       + t*(-623./240))))))); \
00349         t--; \
00350         phi[3] = 1 + t*t*((-49./36) + t*t*((-959./144) + t*((2569./144) \
00351                 + t*((-727./48) + t*(623./144))))); \
00352         t--; \
00353         phi[4] = 1 + t*t*((-49./36) + t*t*((-959./144) + t*((-2569./144) \
00354                 + t*((-727./48) + t*(-623./144))))); \
00355         t--; \
00356         phi[5] = (138./5) + t*((8617./60) + t*((12873./40) + t*((791./2) \
00357                 + t*((4557./16) + t*((9583./80) + t*((2181./80) \
00358                       + t*(623./240))))))); \
00359         t--; \
00360         phi[6] = -440 + t*((-25949./20) + t*((-117131./72) + t*((-2247./2) \
00361                 + t*((-66437./144) + t*((-81109./720) + t*((-727./48) \
00362                       + t*(-623./720))))))); \
00363         t--; \
00364         phi[7] = (3632./5) + t*((7456./5) + t*((58786./45) + t*(633 \
00365                 + t*((26383./144) + t*((22807./720) + t*((727./240) \
00366                       + t*(89./720))))))); \
00367         break; \
00368       case NL_MSM_APPROX_NONIC: \
00369         phi[0] = (-1./40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)* \
00370           (t-2)*(t-1); \
00371         t--; \
00372         phi[1] = (1./40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)* \
00373           (-8+t*(-35+9*t)); \
00374         t--; \
00375         phi[2] = (-1./10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)* \
00376           (-14+t*(-25+9*t)); \
00377         t--; \
00378         phi[3] = (1./1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)* \
00379           (-6+t*(-5+3*t)); \
00380         t--; \
00381         phi[4] = (-1./2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)* \
00382           (-20+t*(-5+9*t)); \
00383         t--; \
00384         phi[5] = (1./2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)* \
00385           (-20+t*(5+9*t)); \
00386         t--; \
00387         phi[6] = (-1./1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)* \
00388           (-6+t*(5+3*t)); \
00389         t--; \
00390         phi[7] = (1./10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)* \
00391           (-14+t*(25+9*t)); \
00392         t--; \
00393         phi[8] = (-1./40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)* \
00394           (-8+t*(35+9*t)); \
00395         t--; \
00396         phi[9] = (1./40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)* \
00397           (t+7)*(t+8); \
00398         break; \
00399       case NL_MSM_APPROX_NONIC4: \
00400         phi[0] = 439375./7+t*(-64188125./504+t*(231125375./2016 \
00401               +t*(-17306975./288+t*(7761805./384+t*(-2895587./640 \
00402                     +t*(129391./192+t*(-259715./4032+t*(28909./8064 \
00403                           +t*(-3569./40320))))))))); \
00404         t--; \
00405         phi[1] = -56375+t*(8314091./56+t*(-49901303./288+t*(3763529./32 \
00406                 +t*(-19648027./384+t*(9469163./640+t*(-545977./192 \
00407                       +t*(156927./448+t*(-28909./1152 \
00408                           +t*(3569./4480))))))))); \
00409         t--; \
00410         phi[2] = 68776./7+t*(-1038011./28+t*(31157515./504+t*(-956669./16 \
00411                 +t*(3548009./96+t*(-2422263./160+t*(197255./48 \
00412                       +t*(-19959./28+t*(144545./2016 \
00413                           +t*(-3569./1120))))))))); \
00414         t--; \
00415         phi[3] = -154+t*(12757./12+t*(-230123./72+t*(264481./48 \
00416                 +t*(-576499./96+t*(686147./160+t*(-96277./48 \
00417                       +t*(14221./24+t*(-28909./288+t*(3569./480))))))))); \
00418         t--; \
00419         phi[4] = 1+t*t*(-205./144+t*t*(91./192+t*(-6181./320 \
00420                 +t*(6337./96+t*(-2745./32+t*(28909./576 \
00421                       +t*(-3569./320))))))); \
00422         t--; \
00423         phi[5] = 1+t*t*(-205./144+t*t*(91./192+t*(6181./320 \
00424                 +t*(6337./96+t*(2745./32+t*(28909./576 \
00425                       +t*(3569./320))))))); \
00426         t--; \
00427         phi[6] = -154+t*(-12757./12+t*(-230123./72+t*(-264481./48 \
00428                 +t*(-576499./96+t*(-686147./160+t*(-96277./48 \
00429                       +t*(-14221./24+t*(-28909./288+t*(-3569./480))))))))); \
00430         t--; \
00431         phi[7] = 68776./7+t*(1038011./28+t*(31157515./504+t*(956669./16 \
00432                 +t*(3548009./96+t*(2422263./160+t*(197255./48 \
00433                       +t*(19959./28+t*(144545./2016+t*(3569./1120))))))))); \
00434         t--; \
00435         phi[8] = -56375+t*(-8314091./56+t*(-49901303./288+t*(-3763529./32 \
00436                 +t*(-19648027./384+t*(-9469163./640+t*(-545977./192 \
00437                       +t*(-156927./448+t*(-28909./1152 \
00438                           +t*(-3569./4480))))))))); \
00439         t--; \
00440         phi[9] = 439375./7+t*(64188125./504+t*(231125375./2016 \
00441               +t*(17306975./288+t*(7761805./384+t*(2895587./640 \
00442                     +t*(129391./192+t*(259715./4032+t*(28909./8064 \
00443                           +t*(3569./40320))))))))); \
00444         break; \
00445       case NL_MSM_APPROX_BSPLINE: \
00446         phi[0] = (1./6) * (2-t) * (2-t) * (2-t); \
00447         t--; \
00448         phi[1] = (2./3) + t*t*(-1 + 0.5*t); \
00449         t--; \
00450         phi[2] = (2./3) + t*t*(-1 - 0.5*t); \
00451         t--; \
00452         phi[3] = (1./6) * (2+t) * (2+t) * (2+t); \
00453         break; \
00454       default: \
00455         return NL_MSM_ERROR_SUPPORT; \
00456     } \
00457   } while (0)
00458   /* closing ';' from use as function call */
00459 
00460 
00468 #define D_STENCIL_1D(_dphi, _phi, _h_1, _delta, _approx) \
00469   do { \
00470     double *dphi = _dphi; \
00471     double *phi = _phi; \
00472     double h_1 = _h_1; \
00473     double t = _delta; \
00474     switch (_approx) { \
00475       case NL_MSM_APPROX_CUBIC: \
00476         phi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
00477         dphi[0] = (1.5 * t - 2) * (2 - t) * h_1; \
00478         t--; \
00479         phi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
00480         dphi[1] = (-5 + 4.5 * t) * t * h_1; \
00481         t--; \
00482         phi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
00483         dphi[2] = (-5 - 4.5 * t) * t * h_1; \
00484         t--; \
00485         phi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
00486         dphi[3] = (1.5 * t + 2) * (2 + t) * h_1; \
00487         break; \
00488       case NL_MSM_APPROX_QUINTIC: \
00489         phi[0] = (1./24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t); \
00490         dphi[0] = ((-1./24) * ((3-t) * (3-t) * (14 + t * (-14 + 3*t)) \
00491               + 2 * (1-t) * (2-t) * (3-t) * (4-t))) * h_1; \
00492         t--; \
00493         phi[1] = (1-t) * (2-t) * (3-t) * ((1./6) + t * (0.375 - (5./24)*t)); \
00494         dphi[1] = (-((1./6) + t * (0.375 - (5./24)*t)) * (11 + t * (-12 + 3*t))\
00495             + (1-t) * (2-t) * (3-t) * (0.375 - (5./12)*t)) * h_1; \
00496         t--; \
00497         phi[2] = (1-t*t) * (2-t) * (0.5 + t * (0.25 - (5./12)*t)); \
00498         dphi[2] = (-(0.5 + t * (0.25 - (5./12)*t)) * (1 + t * (4 - 3*t)) \
00499             + (1-t*t) * (2-t) * (0.25 - (5./6)*t)) * h_1; \
00500         t--; \
00501         phi[3] = (1-t*t) * (2+t) * (0.5 - t * (0.25 + (5./12)*t)); \
00502         dphi[3] = ((0.5 + t * (-0.25 - (5./12)*t)) * (1 + t * (-4 - 3*t)) \
00503             - (1-t*t) * (2+t) * (0.25 + (5./6)*t)) * h_1; \
00504         t--; \
00505         phi[4] = (1+t) * (2+t) * (3+t) * ((1./6) - t * (0.375 + (5./24)*t)); \
00506         dphi[4] = (((1./6) + t * (-0.375 - (5./24)*t)) * (11 + t * (12 + 3*t)) \
00507             - (1+t) * (2+t) * (3+t) * (0.375 + (5./12)*t)) * h_1; \
00508         t--; \
00509         phi[5] = (1./24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t); \
00510         dphi[5] = ((1./24) * ((3+t) * (3+t) * (14 + t * (14 + 3*t)) \
00511               + 2 * (1+t) * (2+t) * (3+t) * (4+t))) * h_1; \
00512         break; \
00513       case NL_MSM_APPROX_QUINTIC2: \
00514         phi[0] = (1./24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8); \
00515         dphi[0] = ((1./24) * (3-t) * (3-t) * ((3-t)*(5*t-8) - 3*(t-2)*(5*t-8) \
00516               + 5*(t-2)*(3-t))) * h_1; \
00517         t--; \
00518         phi[1] = (-1./24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25))); \
00519         dphi[1] = ((-1./24) * ((2-t)*(-48+t*(153+t*(-114+t*25))) - (t-1)* \
00520               (-48+t*(153+t*(-114+t*25))) \
00521               + (2-t)*(t-1)*(153+t*(-228+t*75)))) * h_1; \
00522         t--; \
00523         phi[2] = (1./12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25)))); \
00524         dphi[2] = ((1./12) * (-(12+t*(12+t*(-3+t*(-38+t*25)))) \
00525               + (1-t)*(12+t*(-6+t*(-114+t*100))))) * h_1; \
00526         t--; \
00527         phi[3] = (1./12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25)))); \
00528         dphi[3] = ((1./12) * ((12+t*(-12+t*(-3+t*(38+t*25)))) \
00529               + (1+t)*(-12+t*(-6+t*(114+t*100))))) * h_1; \
00530         t--; \
00531         phi[4] = (-1./24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25))); \
00532         dphi[4] = ((-1./24) * ((2+t)*(48+t*(153+t*(114+t*25))) + (t+1)* \
00533               (48+t*(153+t*(114+t*25))) \
00534               + (2+t)*(t+1)*(153+t*(228+t*75)))) * h_1; \
00535         t--; \
00536         phi[5] = (1./24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8); \
00537         dphi[5] = ((1./24) * (3+t) * (3+t) * ((3+t)*(5*t+8) + 3*(t+2)*(5*t+8) \
00538               + 5*(t+2)*(3+t))) * h_1; \
00539         break; \
00540       case NL_MSM_APPROX_SEPTIC: \
00541         phi[0] = (-1./720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6); \
00542         dphi[0] = (-1./720)*(t-4)*(-1944+t*(3644+t*(-2512+t*(807 \
00543                   +t*(-122+t*7))))) * h_1; \
00544         t--; \
00545         phi[1] = (1./720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t)); \
00546         dphi[1] = (1./720)*(756+t*(-9940+t*(17724+t*(-12740+t*(4445 \
00547                     +t*(-750+t*49)))))) * h_1; \
00548         t--; \
00549         phi[2] = (-1./240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t)); \
00550         dphi[2] = (-1./240)*(-28+t*(1260+t*(-756+t*(-1260+t*(1365 \
00551                     +t*(-450+t*49)))))) * h_1; \
00552         t--; \
00553         phi[3] = (1./144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t)); \
00554         dphi[3] = (1./144)*t*(-560+t*(84+t*(644+t*(-175+t*(-150+t*49))))) * \
00555           h_1; \
00556         t--; \
00557         phi[4] = (-1./144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t)); \
00558         dphi[4] = (-1./144)*t*(560+t*(84+t*(-644+t*(-175+t*(150+t*49))))) * \
00559           h_1; \
00560         t--; \
00561         phi[5] = (1./240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t)); \
00562         dphi[5] = (1./240)*(-28+t*(-1260+t*(-756+t*(1260+t*(1365 \
00563                     +t*(450+t*49)))))) * h_1; \
00564         t--; \
00565         phi[6] = (-1./720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t)); \
00566         dphi[6] = (-1./720)*(756+t*(9940+t*(17724+t*(12740+t*(4445 \
00567                     +t*(750+t*49)))))) * h_1; \
00568         t--; \
00569         phi[7] = (1./720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6); \
00570         dphi[7] = (1./720)*(t+4)*(1944+t*(3644+t*(2512+t*(807 \
00571                   +t*(122+t*7))))) * h_1; \
00572         break; \
00573       case NL_MSM_APPROX_SEPTIC3: \
00574         phi[0] = (3632./5) + t*((-7456./5) + t*((58786./45) + t*(-633 \
00575                 + t*((26383./144) + t*((-22807./720) + t*((727./240) \
00576                       + t*(-89./720))))))); \
00577         dphi[0] = ((-7456./5) + t*((117572./45) + t*(-1899 + t*((26383./36) \
00578                   + t*((-22807./144) + t*((727./40) + t*(-623./720)))))))*h_1; \
00579         t--; \
00580         phi[1] = -440 + t*((25949./20) + t*((-117131./72) + t*((2247./2) \
00581                 + t*((-66437./144) + t*((81109./720) + t*((-727./48) \
00582                       + t*(623./720))))))); \
00583         dphi[1] = ((25949./20) + t*((-117131./36) + t*((6741./2) \
00584                 + t*((-66437./36) + t*((81109./144) + t*((-727./8) \
00585                       + t*(4361./720))))))) * h_1; \
00586         t--; \
00587         phi[2] = (138./5) + t*((-8617./60) + t*((12873./40) + t*((-791./2) \
00588                 + t*((4557./16) + t*((-9583./80) + t*((2181./80) \
00589                       + t*(-623./240))))))); \
00590         dphi[2] = ((-8617./60) + t*((12873./20) + t*((-2373./2) + t*((4557./4) \
00591                   + t*((-9583./16) + t*((6543./40) + t*(-4361./240)))))))*h_1; \
00592         t--; \
00593         phi[3] = 1 + t*t*((-49./36) + t*t*((-959./144) + t*((2569./144) \
00594                 + t*((-727./48) + t*(623./144))))); \
00595         dphi[3] = (t*((-49./18) + t*t*((-959./36) + t*((12845./144) \
00596                   + t*((-727./8) + t*(4361./144)))))) * h_1; \
00597         t--; \
00598         phi[4] = 1 + t*t*((-49./36) + t*t*((-959./144) + t*((-2569./144) \
00599                 + t*((-727./48) + t*(-623./144))))); \
00600         dphi[4] = (t*((-49./18) + t*t*((-959./36) + t*((-12845./144) \
00601                   + t*((-727./8) + t*(-4361./144)))))) * h_1; \
00602         t--; \
00603         phi[5] = (138./5) + t*((8617./60) + t*((12873./40) + t*((791./2) \
00604                 + t*((4557./16) + t*((9583./80) + t*((2181./80) \
00605                       + t*(623./240))))))); \
00606         dphi[5] = ((8617./60) + t*((12873./20) + t*((2373./2) + t*((4557./4) \
00607                   + t*((9583./16) + t*((6543./40) + t*(4361./240)))))))*h_1; \
00608         t--; \
00609         phi[6] = -440 + t*((-25949./20) + t*((-117131./72) + t*((-2247./2) \
00610                 + t*((-66437./144) + t*((-81109./720) + t*((-727./48) \
00611                       + t*(-623./720))))))); \
00612         dphi[6] = ((-25949./20) + t*((-117131./36) + t*((-6741./2) \
00613                 + t*((-66437./36) + t*((-81109./144) + t*((-727./8) \
00614                       + t*(-4361./720))))))) * h_1; \
00615         t--; \
00616         phi[7] = (3632./5) + t*((7456./5) + t*((58786./45) + t*(633 \
00617                 + t*((26383./144) + t*((22807./720) + t*((727./240) \
00618                       + t*(89./720))))))); \
00619         dphi[7] = ((7456./5) + t*((117572./45) + t*(1899 + t*((26383./36) \
00620                   + t*((22807./144) + t*((727./40) + t*(623./720)))))))*h_1; \
00621         break; \
00622       case NL_MSM_APPROX_NONIC: \
00623         phi[0] = (-1./40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)* \
00624           (t-2)*(t-1); \
00625         dphi[0] = (-1./40320)*(t-5)*(-117648+t*(256552+t*(-221416 \
00626                 +t*(99340+t*(-25261+t*(3667+t*(-283+t*9)))))))*h_1; \
00627         t--; \
00628         phi[1] = (1./40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)* \
00629           (-8+t*(-35+9*t)); \
00630         dphi[1] = (1./40320)*(71856+t*(-795368+t*(1569240+t*(-1357692 \
00631                   +t*(634725+t*(-172116+t*(27090+t*(-2296+t*81))))))))*h_1; \
00632         t--; \
00633         phi[2] = (-1./10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)* \
00634           (-14+t*(-25+9*t)); \
00635         dphi[2] = (1./10080)*(3384+t*(-69080+t*(55026 \
00636                 +t*(62580+t*(-99225+t*(51660+t*(-13104+t*(1640 \
00637                           +t*(-81)))))))))*h_1; \
00638         t--; \
00639         phi[3] = (1./1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)* \
00640           (-6+t*(-5+3*t)); \
00641         dphi[3] = (1./1440)*(72+t*(-6344+t*(2070 \
00642                 +t*(7644+t*(-4725+t*(-828+t*(1260+t*(-328+t*27))))))))*h_1; \
00643         t--; \
00644         phi[4] = (-1./2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)* \
00645           (-20+t*(-5+9*t)); \
00646         dphi[4] = (-1./2880)*t*(10792+t*(-972+t*(-12516 \
00647                 +t*(2205+t*(3924+t*(-882+t*(-328+t*81)))))))*h_1; \
00648         t--; \
00649         phi[5] = (1./2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)* \
00650           (-20+t*(5+9*t)); \
00651         dphi[5] = (1./2880)*t*(-10792+t*(-972+t*(12516 \
00652                 +t*(2205+t*(-3924+t*(-882+t*(328+t*81)))))))*h_1; \
00653         t--; \
00654         phi[6] = (-1./1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)* \
00655           (-6+t*(5+3*t)); \
00656         dphi[6] = (1./1440)*(-72+t*(-6344+t*(-2070 \
00657                 +t*(7644+t*(4725+t*(-828+t*(-1260+t*(-328+t*(-27)))))))))*h_1; \
00658         t--; \
00659         phi[7] = (1./10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)* \
00660           (-14+t*(25+9*t)); \
00661         dphi[7] = (1./10080)*(-3384+t*(-69080+t*(-55026 \
00662                 +t*(62580+t*(99225+t*(51660+t*(13104+t*(1640+t*81))))))))*h_1; \
00663         t--; \
00664         phi[8] = (-1./40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)* \
00665           (-8+t*(35+9*t)); \
00666         dphi[8] = (-1./40320)*(71856+t*(795368+t*(1569240 \
00667                 +t*(1357692+t*(634725+t*(172116+t*(27090+t*(2296 \
00668                           +t*81))))))))*h_1; \
00669         t--; \
00670         phi[9] = (1./40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)* \
00671           (t+7)*(t+8); \
00672         dphi[9] = (1./40320)*(t+5)*(117648+t*(256552+t*(221416 \
00673                 +t*(99340+t*(25261+t*(3667+t*(283+t*9)))))))*h_1; \
00674         break; \
00675       case NL_MSM_APPROX_NONIC4: \
00676         phi[0] = 439375./7+t*(-64188125./504+t*(231125375./2016 \
00677               +t*(-17306975./288+t*(7761805./384+t*(-2895587./640 \
00678                     +t*(129391./192+t*(-259715./4032+t*(28909./8064 \
00679                           +t*(-3569./40320))))))))); \
00680         dphi[0] = (-64188125./504+t*(231125375./1008 \
00681               +t*(-17306975./96+t*(7761805./96+t*(-2895587./128 \
00682                     +t*(129391./32+t*(-259715./576+t*(28909./1008 \
00683                           +t*(-3569./4480)))))))))*h_1; \
00684         t--; \
00685         phi[1] = -56375+t*(8314091./56+t*(-49901303./288+t*(3763529./32 \
00686                 +t*(-19648027./384+t*(9469163./640+t*(-545977./192 \
00687                       +t*(156927./448+t*(-28909./1152 \
00688                           +t*(3569./4480))))))))); \
00689         dphi[1] = (8314091./56+t*(-49901303./144+t*(11290587./32 \
00690                 +t*(-19648027./96+t*(9469163./128+t*(-545977./32 \
00691                       +t*(156927./64+t*(-28909./144 \
00692                           +t*(32121./4480)))))))))*h_1; \
00693         t--; \
00694         phi[2] = 68776./7+t*(-1038011./28+t*(31157515./504+t*(-956669./16 \
00695                 +t*(3548009./96+t*(-2422263./160+t*(197255./48 \
00696                       +t*(-19959./28+t*(144545./2016 \
00697                           +t*(-3569./1120))))))))); \
00698         dphi[2] = (-1038011./28+t*(31157515./252+t*(-2870007./16 \
00699                 +t*(3548009./24+t*(-2422263./32+t*(197255./8 \
00700                       +t*(-19959./4+t*(144545./252 \
00701                           +t*(-32121./1120)))))))))*h_1; \
00702         t--; \
00703         phi[3] = -154+t*(12757./12+t*(-230123./72+t*(264481./48 \
00704                 +t*(-576499./96+t*(686147./160+t*(-96277./48 \
00705                       +t*(14221./24+t*(-28909./288+t*(3569./480))))))))); \
00706         dphi[3] = (12757./12+t*(-230123./36+t*(264481./16 \
00707                 +t*(-576499./24+t*(686147./32+t*(-96277./8 \
00708                       +t*(99547./24+t*(-28909./36 \
00709                           +t*(10707./160)))))))))*h_1; \
00710         t--; \
00711         phi[4] = 1+t*t*(-205./144+t*t*(91./192+t*(-6181./320 \
00712                 +t*(6337./96+t*(-2745./32+t*(28909./576 \
00713                       +t*(-3569./320))))))); \
00714         dphi[4] = t*(-205./72+t*t*(91./48+t*(-6181./64 \
00715                 +t*(6337./16+t*(-19215./32+t*(28909./72 \
00716                       +t*(-32121./320)))))))*h_1; \
00717         t--; \
00718         phi[5] = 1+t*t*(-205./144+t*t*(91./192+t*(6181./320 \
00719                 +t*(6337./96+t*(2745./32+t*(28909./576 \
00720                       +t*(3569./320))))))); \
00721         dphi[5] = t*(-205./72+t*t*(91./48+t*(6181./64 \
00722                 +t*(6337./16+t*(19215./32+t*(28909./72 \
00723                       +t*(32121./320)))))))*h_1; \
00724         t--; \
00725         phi[6] = -154+t*(-12757./12+t*(-230123./72+t*(-264481./48 \
00726                 +t*(-576499./96+t*(-686147./160+t*(-96277./48 \
00727                       +t*(-14221./24+t*(-28909./288+t*(-3569./480))))))))); \
00728         dphi[6] = (-12757./12+t*(-230123./36+t*(-264481./16 \
00729                 +t*(-576499./24+t*(-686147./32+t*(-96277./8 \
00730                       +t*(-99547./24+t*(-28909./36 \
00731                           +t*(-10707./160)))))))))*h_1; \
00732         t--; \
00733         phi[7] = 68776./7+t*(1038011./28+t*(31157515./504+t*(956669./16 \
00734                 +t*(3548009./96+t*(2422263./160+t*(197255./48 \
00735                       +t*(19959./28+t*(144545./2016+t*(3569./1120))))))))); \
00736         dphi[7] = (1038011./28+t*(31157515./252+t*(2870007./16 \
00737                 +t*(3548009./24+t*(2422263./32+t*(197255./8 \
00738                       +t*(19959./4+t*(144545./252 \
00739                           +t*(32121./1120)))))))))*h_1; \
00740         t--; \
00741         phi[8] = -56375+t*(-8314091./56+t*(-49901303./288+t*(-3763529./32 \
00742                 +t*(-19648027./384+t*(-9469163./640+t*(-545977./192 \
00743                       +t*(-156927./448+t*(-28909./1152 \
00744                           +t*(-3569./4480))))))))); \
00745         dphi[8] = (-8314091./56+t*(-49901303./144+t*(-11290587./32 \
00746                 +t*(-19648027./96+t*(-9469163./128+t*(-545977./32 \
00747                       +t*(-156927./64+t*(-28909./144 \
00748                           +t*(-32121./4480)))))))))*h_1; \
00749         t--; \
00750         phi[9] = 439375./7+t*(64188125./504+t*(231125375./2016 \
00751               +t*(17306975./288+t*(7761805./384+t*(2895587./640 \
00752                     +t*(129391./192+t*(259715./4032+t*(28909./8064 \
00753                           +t*(3569./40320))))))))); \
00754         dphi[9] = (64188125./504+t*(231125375./1008 \
00755               +t*(17306975./96+t*(7761805./96+t*(2895587./128 \
00756                     +t*(129391./32+t*(259715./576+t*(28909./1008 \
00757                           +t*(3569./4480)))))))))*h_1; \
00758         break; \
00759       case NL_MSM_APPROX_BSPLINE: \
00760         phi[0] = (1./6) * (2-t) * (2-t) * (2-t); \
00761         dphi[0] = -0.5 * (2-t) * (2-t) * h_1; \
00762         t--; \
00763         phi[1] = (2./3) + t*t*(-1 + 0.5*t); \
00764         dphi[1] = t*(-2 + 1.5*t) * h_1; \
00765         t--; \
00766         phi[2] = (2./3) + t*t*(-1 - 0.5*t); \
00767         dphi[2] = t*(-2 - 1.5*t) * h_1; \
00768         t--; \
00769         phi[3] = (1./6) * (2+t) * (2+t) * (2+t); \
00770         dphi[3] = 0.5 * (2+t) * (2+t) * h_1; \
00771         break; \
00772       default: \
00773         return NL_MSM_ERROR_SUPPORT; \
00774     } \
00775   } while (0)
00776   /* closing ';' from use as function call */
00777 
00778 
00779 int anterpolation(NL_Msm *msm)
00780 {
00781   const double *atom = msm->atom;
00782   const int natoms = msm->numatoms;
00783 
00784   double xphi[MAX_POLY_DEGREE+1];  /* Phi stencil along x-dimension */
00785   double yphi[MAX_POLY_DEGREE+1];  /* Phi stencil along y-dimension */
00786   double zphi[MAX_POLY_DEGREE+1];  /* Phi stencil along z-dimension */
00787   double rx_hx, ry_hy, rz_hz;      /* normalized distance from atom to origin */
00788 #ifndef TEST_INLINING
00789   double delta;                    /* normalized distance to stencil point */
00790 #else
00791   double t;                    /* normalized distance to stencil point */
00792 #endif
00793   double ck, cjk;
00794   const double hx_1 = 1/msm->hx;
00795   const double hy_1 = 1/msm->hy;
00796   const double hz_1 = 1/msm->hz;
00797   const double xm0 = msm->gx;      /* grid origin x */
00798   const double ym0 = msm->gy;      /* grid origin y */
00799   const double zm0 = msm->gz;      /* grid origin z */
00800   double q;
00801 
00802   NL_Msmgrid_double *qhgrid = &(msm->qh[0]);
00803   double *qh = qhgrid->data;
00804   const int ni = qhgrid->ni;
00805   const int nj = qhgrid->nj;
00806   const int nk = qhgrid->nk;
00807   const int ia = qhgrid->i0;
00808   const int ib = ia + ni - 1;
00809   const int ja = qhgrid->j0;
00810   const int jb = ja + nj - 1;
00811   const int ka = qhgrid->k0;
00812   const int kb = ka + nk - 1;
00813 
00814   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
00815   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
00816   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
00817   int iswithin;
00818 
00819   int n, i, j, k, ilo, jlo, klo, koff;
00820   int jkoff, index;
00821 
00822   const int approx = msm->approx;
00823   const int s_size = PolyDegree[approx] + 1;         /* stencil size */
00824   const int s_edge = (PolyDegree[approx] - 1) >> 1;  /* stencil "edge" size */
00825 
00826   GRID_ZERO(qhgrid);
00827 
00828   for (n = 0;  n < natoms;  n++) {
00829 
00830     /* atomic charge */
00831     q = atom[4*n + 3];
00832     if (0==q) continue;
00833 
00834     /* distance between atom and origin measured in grid points */
00835     rx_hx = (atom[4*n    ] - xm0) * hx_1;
00836     ry_hy = (atom[4*n + 1] - ym0) * hy_1;
00837     rz_hz = (atom[4*n + 2] - zm0) * hz_1;
00838 
00839     /* find smallest numbered grid point in stencil */
00840     ilo = (int) floor(rx_hx) - s_edge;
00841     jlo = (int) floor(ry_hy) - s_edge;
00842     klo = (int) floor(rz_hz) - s_edge;
00843 
00844     /* calculate Phi stencils along each dimension */
00845 #ifndef TEST_INLINING
00846     delta = rx_hx - (double) ilo;
00847     STENCIL_1D(xphi, delta, approx);
00848     delta = ry_hy - (double) jlo;
00849     STENCIL_1D(yphi, delta, approx);
00850     delta = rz_hz - (double) klo;
00851     STENCIL_1D(zphi, delta, approx);
00852 #else
00853     t = rx_hx - (double) ilo;
00854         xphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
00855         t--; \
00856         xphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
00857         t--; \
00858         xphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
00859         t--; \
00860         xphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
00861 
00862     t = ry_hy - (double) jlo;
00863         yphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
00864         t--; \
00865         yphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
00866         t--; \
00867         yphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
00868         t--; \
00869         yphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
00870 
00871     t = rz_hz - (double) klo;
00872         zphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
00873         t--; \
00874         zphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
00875         t--; \
00876         zphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
00877         t--; \
00878         zphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
00879 
00880 #endif
00881 
00882     /* test to see if stencil is within edges of grid */
00883     iswithin = ( ia <= ilo && (ilo+(s_size-1)) <= ib &&
00884                  ja <= jlo && (jlo+(s_size-1)) <= jb &&
00885                  ka <= klo && (klo+(s_size-1)) <= kb );
00886 
00887     if ( iswithin ) {  /* no wrapping needed */
00888 
00889       /* determine charge on cube of grid points around atom */
00890       for (k = 0;  k < s_size;  k++) {
00891         koff = (k + klo) * nj;
00892         ck = zphi[k] * q;
00893         for (j = 0;  j < s_size;  j++) {
00894           jkoff = (koff + (j + jlo)) * ni;
00895           cjk = yphi[j] * ck;
00896           for (i = 0;  i < s_size;  i++) {
00897             index = jkoff + (i + ilo);
00898             GRID_INDEX_CHECK(qhgrid, i+ilo, j+jlo, k+klo);
00899             ASSERT(GRID_INDEX(qhgrid, i+ilo, j+jlo, k+klo) == index);
00900             qh[index] += xphi[i] * cjk;
00901           }
00902         }
00903       }
00904     } /* if */
00905 
00906     else {  /* requires wrapping around grid */
00907       int ip, jp, kp;
00908 
00909       /* adjust ilo, jlo, klo so they are within grid indexing */
00910       if (ispx) {
00911         if      (ilo < ia) do { ilo += ni; } while (ilo < ia);
00912         else if (ilo > ib) do { ilo -= ni; } while (ilo > ib);
00913       }
00914       else if (ilo < ia || (ilo+(s_size-1)) > ib) {
00915         return NL_MSM_ERROR_RANGE;
00916       }
00917 
00918       if (ispy) {
00919         if      (jlo < ja) do { jlo += nj; } while (jlo < ja);
00920         else if (jlo > jb) do { jlo -= nj; } while (jlo > jb);
00921       }
00922       else if (jlo < ja || (jlo+(s_size-1)) > jb) {
00923         return NL_MSM_ERROR_RANGE;
00924       }
00925 
00926       if (ispz) {
00927         if      (klo < ka) do { klo += nk; } while (klo < ka);
00928         else if (klo > kb) do { klo -= nk; } while (klo > kb);
00929       }
00930       else if (klo < ka || (klo+(s_size-1)) > kb) {
00931         return NL_MSM_ERROR_RANGE;
00932       }
00933 
00934       /* determine charge on cube of grid points around atom, with wrapping */
00935       for (k = 0, kp = klo;  k < s_size;  k++, kp++) {
00936         if (kp > kb) kp = ka;  /* wrap stencil around grid */
00937         koff = kp * nj;
00938         ck = zphi[k] * q;
00939         for (j = 0, jp = jlo;  j < s_size;  j++, jp++) {
00940           if (jp > jb) jp = ja;  /* wrap stencil around grid */
00941           jkoff = (koff + jp) * ni;
00942           cjk = yphi[j] * ck;
00943           for (i = 0, ip = ilo;  i < s_size;  i++, ip++) {
00944             if (ip > ib) ip = ia;  /* wrap stencil around grid */
00945             index = jkoff + ip;
00946             GRID_INDEX_CHECK(qhgrid, ip, jp, kp);
00947             ASSERT(GRID_INDEX(qhgrid, ip, jp, kp) == index);
00948             qh[index] += xphi[i] * cjk;
00949           }
00950         }
00951       }
00952     } /* else */
00953 
00954   } /* end loop over atoms */
00955 
00956   return NL_MSM_SUCCESS;
00957 } /* anterpolation */
00958 
00959 
00960 int interpolation(NL_Msm *msm)
00961 {
00962   double *f = msm->felec;
00963   const double *atom = msm->atom;
00964   const int natoms = msm->numatoms;
00965 
00966   double xphi[MAX_POLY_DEGREE+1];  /* Phi stencil along x-dimension */
00967   double yphi[MAX_POLY_DEGREE+1];  /* Phi stencil along y-dimension */
00968   double zphi[MAX_POLY_DEGREE+1];  /* Phi stencil along z-dimension */
00969   double dxphi[MAX_POLY_DEGREE+1]; /* derivative of Phi along x-dimension */
00970   double dyphi[MAX_POLY_DEGREE+1]; /* derivative of Phi along y-dimension */
00971   double dzphi[MAX_POLY_DEGREE+1]; /* derivative of Phi along z-dimension */
00972   double rx_hx, ry_hy, rz_hz;      /* normalized distance from atom to origin */
00973 #ifndef TEST_INLINING
00974   double delta;                    /* normalized distance to stencil point */
00975 #else
00976   double t;                    /* normalized distance to stencil point */
00977 #endif
00978   const double hx_1 = 1/msm->hx;
00979   const double hy_1 = 1/msm->hy;
00980   const double hz_1 = 1/msm->hz;
00981   const double xm0 = msm->gx;      /* grid origin x */
00982   const double ym0 = msm->gy;      /* grid origin y */
00983   const double zm0 = msm->gz;      /* grid origin z */
00984   double q;
00985   double u = 0;
00986 
00987   const NL_Msmgrid_double *ehgrid = &(msm->eh[0]);
00988   const double *eh = ehgrid->data;
00989   const double *ebuf = ehgrid->buffer;
00990   const NL_Msmgrid_double *qhgrid = &(msm->qh[0]);
00991   const double *qbuf = qhgrid->buffer;
00992   const int ni = ehgrid->ni;
00993   const int nj = ehgrid->nj;
00994   const int nk = ehgrid->nk;
00995   const int ia = ehgrid->i0;
00996   const int ib = ia + ni - 1;
00997   const int ja = ehgrid->j0;
00998   const int jb = ja + nj - 1;
00999   const int ka = ehgrid->k0;
01000   const int kb = ka + nk - 1;
01001 
01002   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01003   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01004   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01005   int iswithin;
01006 
01007   double fx, fy, fz, cx, cy, cz;
01008 
01009   int n, i, j, k, ilo, jlo, klo, koff;
01010   long jkoff, index;
01011   const int nn = (ni*nj) * nk;
01012 
01013   const int approx = msm->approx;
01014   const int s_size = PolyDegree[approx] + 1;         /* stencil size */
01015   const int s_edge = (PolyDegree[approx] - 1) >> 1;  /* stencil "edge" size */
01016 
01017   for (n = 0;  n < natoms;  n++) {
01018 
01019     /* atomic charge */
01020     q = atom[4*n + 3];
01021     if (0==q) continue;
01022 
01023     /* distance between atom and origin measured in grid points */
01024     rx_hx = (atom[4*n    ] - xm0) * hx_1;
01025     ry_hy = (atom[4*n + 1] - ym0) * hy_1;
01026     rz_hz = (atom[4*n + 2] - zm0) * hz_1;
01027 
01028     /* find smallest numbered grid point in stencil */
01029     ilo = (int) floor(rx_hx) - s_edge;
01030     jlo = (int) floor(ry_hy) - s_edge;
01031     klo = (int) floor(rz_hz) - s_edge;
01032 
01033     /* calculate Phi stencils and derivatives along each dimension */
01034 #ifndef TEST_INLINING
01035     delta = rx_hx - (double) ilo;
01036     //STENCIL_1D(xphi, delta, approx);
01037     D_STENCIL_1D(dxphi, xphi, hx_1, delta, approx);
01038     delta = ry_hy - (double) jlo;
01039     //STENCIL_1D(yphi, delta, approx);
01040     D_STENCIL_1D(dyphi, yphi, hy_1, delta, approx);
01041     delta = rz_hz - (double) klo;
01042     //STENCIL_1D(zphi, delta, approx);
01043     D_STENCIL_1D(dzphi, zphi, hz_1, delta, approx);
01044 #else
01045     t = rx_hx - (double) ilo;
01046         xphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
01047         dxphi[0] = (1.5 * t - 2) * (2 - t) * hx_1; \
01048         t--; \
01049         xphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
01050         dxphi[1] = (-5 + 4.5 * t) * t * hx_1; \
01051         t--; \
01052         xphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
01053         dxphi[2] = (-5 - 4.5 * t) * t * hx_1; \
01054         t--; \
01055         xphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
01056         dxphi[3] = (1.5 * t + 2) * (2 + t) * hx_1; \
01057 
01058     t = ry_hy - (double) jlo;
01059         yphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
01060         dyphi[0] = (1.5 * t - 2) * (2 - t) * hy_1; \
01061         t--; \
01062         yphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
01063         dyphi[1] = (-5 + 4.5 * t) * t * hy_1; \
01064         t--; \
01065         yphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
01066         dyphi[2] = (-5 - 4.5 * t) * t * hy_1; \
01067         t--; \
01068         yphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
01069         dyphi[3] = (1.5 * t + 2) * (2 + t) * hy_1; \
01070 
01071     t = rz_hz - (double) klo;
01072         zphi[0] = 0.5 * (1 - t) * (2 - t) * (2 - t); \
01073         dzphi[0] = (1.5 * t - 2) * (2 - t) * hz_1; \
01074         t--; \
01075         zphi[1] = (1 - t) * (1 + t - 1.5 * t * t); \
01076         dzphi[1] = (-5 + 4.5 * t) * t * hz_1; \
01077         t--; \
01078         zphi[2] = (1 + t) * (1 - t - 1.5 * t * t); \
01079         dzphi[2] = (-5 - 4.5 * t) * t * hz_1; \
01080         t--; \
01081         zphi[3] = 0.5 * (1 + t) * (2 + t) * (2 + t); \
01082         dzphi[3] = (1.5 * t + 2) * (2 + t) * hz_1; \
01083 
01084 #endif
01085 
01086     /* test to see if stencil is within edges of grid */
01087     iswithin = ( ia <= ilo && (ilo+(s_size-1)) <= ib &&
01088                  ja <= jlo && (jlo+(s_size-1)) <= jb &&
01089                  ka <= klo && (klo+(s_size-1)) <= kb );
01090 
01091     if ( iswithin ) {  /* no wrapping needed */
01092 
01093       /* determine force on atom from cube of grid point potentials */
01094       fx = fy = fz = 0;
01095       for (k = 0;  k < s_size;  k++) {
01096         koff = (k + klo) * nj;
01097         for (j = 0;  j < s_size;  j++) {
01098           jkoff = (koff + (j + jlo)) * ni;
01099           cx = yphi[j] * zphi[k];
01100           cy = dyphi[j] * zphi[k];
01101           cz = yphi[j] * dzphi[k];
01102           for (i = 0;  i < s_size;  i++) {
01103             index = jkoff + (i + ilo);
01104             GRID_INDEX_CHECK(ehgrid, i+ilo, j+jlo, k+klo);
01105             ASSERT(GRID_INDEX(ehgrid, i+ilo, j+jlo, k+klo) == index);
01106             fx += eh[index] * dxphi[i] * cx;
01107             fy += eh[index] * xphi[i] * cy;
01108             fz += eh[index] * xphi[i] * cz;
01109           }
01110         }
01111       }
01112     } /* if */
01113 
01114     else {  /* requires wrapping around grid */
01115       int ip, jp, kp;
01116 
01117       /* adjust ilo, jlo, klo so they are within grid indexing */
01118       if (ispx) {
01119         if      (ilo < ia) do { ilo += ni; } while (ilo < ia);
01120         else if (ilo > ib) do { ilo -= ni; } while (ilo > ib);
01121       }
01122       else if (ilo < ia || (ilo+(s_size-1)) > ib) {
01123         return NL_MSM_ERROR_RANGE;
01124       }
01125 
01126       if (ispy) {
01127         if      (jlo < ja) do { jlo += nj; } while (jlo < ja);
01128         else if (jlo > jb) do { jlo -= nj; } while (jlo > jb);
01129       }
01130       else if (jlo < ja || (jlo+(s_size-1)) > jb) {
01131         return NL_MSM_ERROR_RANGE;
01132       }
01133 
01134       if (ispz) {
01135         if      (klo < ka) do { klo += nk; } while (klo < ka);
01136         else if (klo > kb) do { klo -= nk; } while (klo > kb);
01137       }
01138       else if (klo < ka || (klo+(s_size-1)) > kb) {
01139         return NL_MSM_ERROR_RANGE;
01140       }
01141 
01142       /* determine force on atom from cube of grid point potentials, wrapping */
01143       fx = fy = fz = 0;
01144       for (k = 0, kp = klo;  k < s_size;  k++, kp++) {
01145         if (kp > kb) kp = ka;  /* wrap stencil around grid */
01146         koff = kp * nj;
01147         for (j = 0, jp = jlo;  j < s_size;  j++, jp++) {
01148           if (jp > jb) jp = ja;  /* wrap stencil around grid */
01149           jkoff = (koff + jp) * ni;
01150           cx = yphi[j] * zphi[k];
01151           cy = dyphi[j] * zphi[k];
01152           cz = yphi[j] * dzphi[k];
01153           for (i = 0, ip = ilo;  i < s_size;  i++, ip++) {
01154             if (ip > ib) ip = ia;  /* wrap stencil around grid */
01155             index = jkoff + ip;
01156             GRID_INDEX_CHECK(ehgrid, ip, jp, kp);
01157             ASSERT(GRID_INDEX(ehgrid, ip, jp, kp) == index);
01158             fx += eh[index] * dxphi[i] * cx;
01159             fy += eh[index] * xphi[i] * cy;
01160             fz += eh[index] * xphi[i] * cz;
01161           }
01162         }
01163       }
01164     } /* else */
01165 
01166     /* update force */
01167     f[3*n  ] -= q * fx;
01168     f[3*n+1] -= q * fy;
01169     f[3*n+2] -= q * fz;
01170 //    printf("force[%d] = %g %g %g\n", n, -q*fx, -q*fy, -q*fz);
01171 
01172   } /* end loop over atoms */
01173 
01174   /* compute potential, subtract self potential */
01175   u = 0;
01176   if (1) {
01177     for (n = 0;  n < natoms;  n++) {
01178       double q = atom[4*n + 3];
01179       u += q * q;
01180     }
01181     u *= -msm->gzero;
01182   }
01183   for (index = 0;  index < nn;  index++) {
01184     u += qbuf[index] * ebuf[index];
01185   }
01186   msm->uelec += 0.5 * u;
01187 //  printf("MSM long-range potential:  %g\n", 0.5 * u);
01188 
01189   return NL_MSM_SUCCESS;
01190 } /* interpolation() */
01191 
01192 
01193 int restriction_factored(NL_Msm *msm, int level) {
01194   /* grids of charge, finer grid and coarser grid */
01195   const NL_Msmgrid_double *qhgrid = &(msm->qh[level]);
01196   const double *qh = qhgrid->data;
01197   NL_Msmgrid_double *q2hgrid = &(msm->qh[level+1]);
01198   double *q2h = q2hgrid->data;
01199 
01200   /* finer grid index ranges and dimensions */
01201   const int ia1 = qhgrid->i0;             /* lowest x-index */
01202   const int ib1 = ia1 + qhgrid->ni - 1;   /* highest x-index */
01203   const int ja1 = qhgrid->j0;             /* lowest y-index */
01204   const int jb1 = ja1 + qhgrid->nj - 1;   /* highest y-index */
01205   const int ka1 = qhgrid->k0;             /* lowest z-index */
01206   const int kb1 = ka1 + qhgrid->nk - 1;   /* highest z-index */
01207   const int ni1 = qhgrid->ni;             /* length along x-dim */
01208   const int nj1 = qhgrid->nj;             /* length along y-dim */
01209   const int nk1 = qhgrid->nk;             /* length along z-dim */
01210 
01211   /* coarser grid index ranges and dimensions */
01212   const int ia2 = q2hgrid->i0;            /* lowest x-index */
01213   const int ib2 = ia2 + q2hgrid->ni - 1;  /* highest x-index */
01214   const int ja2 = q2hgrid->j0;            /* lowest y-index */
01215   const int jb2 = ja2 + q2hgrid->nj - 1;  /* highest y-index */
01216   const int ka2 = q2hgrid->k0;            /* lowest z-index */
01217   const int kb2 = ka2 + q2hgrid->nk - 1;  /* highest z-index */
01218   const int nrow_q2 = q2hgrid->ni;
01219   const int nstride_q2 = nrow_q2 * q2hgrid->nj;
01220 
01221   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01222   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01223   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01224 
01225   /* set buffer using indexing offset, so that indexing matches qh grid */
01226   double *qzd = msm->lzd + (-ka1);
01227   double *qyzd = msm->lyzd + (-ka1*nj1 + -ja1);
01228   double qsum;
01229 
01230   const double *phi = NULL;
01231 
01232   int i2, j2, k2;
01233   int im, jm, km;
01234   int i, j, k;
01235   int index_plane_q2, index_q2;
01236   int index_jk, offset_k;
01237   int offset;
01238 
01239   const double *phi_factored = PhiStencilFactored[msm->approx];
01240   const int r_stencil = PolyDegree[msm->approx];  /* "radius" of stencil */
01241   const int diam_stencil = 2*r_stencil + 1;       /* "diameter" of stencil */
01242 
01243   for (i2 = ia2;  i2 <= ib2;  i2++) {
01244 
01245     for (k = ka1;  k <= kb1;  k++) {
01246       offset_k = k * nj1;
01247 
01248       for (j = ja1;  j <= jb1;  j++) {
01249         index_jk = offset_k + j;
01250         offset = index_jk * ni1;
01251         im = (i2 << 1);  /* = 2*i2 */
01252         qsum = 0;
01253         if ( ! ispx ) {  /* nonperiodic */
01254           int lower = im - r_stencil;
01255           int upper = im + r_stencil;
01256           if (lower < ia1) lower = ia1;
01257           if (upper > ib1) upper = ib1;  /* clip edges */
01258           phi = phi_factored + r_stencil;  /* center of stencil */
01259           for (i = lower;  i <= upper;  i++) {
01260             qsum += phi[i-im] * qh[offset + i];
01261           }
01262         }
01263         else {  /* periodic */
01264           int ip = im - r_stencil;  /* index at left end of stencil */
01265           if (ip < ia1) do { ip += ni1; } while (ip < ia1);  /* start inside */
01266           phi = phi_factored;  /* left end of stencil */
01267           for (i = 0;  i < diam_stencil;  i++, ip++) {
01268             if (ip > ib1) ip = ia1;  /* wrap around edge of grid */
01269             qsum += phi[i] * qh[offset + ip];
01270           }
01271         }
01272         qyzd[index_jk] = qsum;
01273       } /* for j */
01274 
01275     } /* for k */
01276 
01277     for (j2 = ja2;  j2 <= jb2;  j2++) {
01278       index_plane_q2 = j2 * nrow_q2 + i2;
01279 
01280       for (k = ka1;  k <= kb1;  k++) {
01281         offset = k * nj1;
01282         jm = (j2 << 1);  /* = 2*j2 */
01283         qsum = 0;
01284         if ( ! ispy ) {  /* nonperiodic */
01285           int lower = jm - r_stencil;
01286           int upper = jm + r_stencil;
01287           if (lower < ja1) lower = ja1;
01288           if (upper > jb1) upper = jb1;  /* clip edges */
01289           phi = phi_factored + r_stencil;  /* center of stencil */
01290           for (j = lower;  j <= upper;  j++) {
01291             qsum += phi[j-jm] * qyzd[offset + j];
01292           }
01293         }
01294         else {  /* periodic */
01295           int jp = jm - r_stencil;  /* index at left end of stencil */
01296           if (jp < ja1) do { jp += nj1; } while (jp < ja1);  /* start inside */
01297           phi = phi_factored;  /* left end of stencil */
01298           for (j = 0;  j < diam_stencil;  j++, jp++) {
01299             if (jp > jb1) jp = ja1;  /* wrap around edge of grid */
01300             qsum += phi[j] * qyzd[offset + jp];
01301           }
01302         }
01303         qzd[k] = qsum;
01304       } /* for k */
01305 
01306       for (k2 = ka2;  k2 <= kb2;  k2++) {
01307         index_q2 = k2 * nstride_q2 + index_plane_q2;
01308         km = (k2 << 1);  /* = 2*k2 */
01309         qsum = 0;
01310         if ( ! ispz ) {  /* nonperiodic */
01311           int lower = km - r_stencil;
01312           int upper = km + r_stencil;
01313           if (lower < ka1) lower = ka1;
01314           if (upper > kb1) upper = kb1;  /* clip edges */
01315           phi = phi_factored + r_stencil;  /* center of stencil */
01316           for (k = lower;  k <= upper;  k++) {
01317             qsum += phi[k-km] * qzd[k];
01318           }
01319         }
01320         else {  /* periodic */
01321           int kp = km - r_stencil;  /* index at left end of stencil */
01322           if (kp < ka1) do { kp += nk1; } while (kp < ka1);  /* start inside */
01323           phi = phi_factored;  /* left end of stencil */
01324           for (k = 0;  k < diam_stencil;  k++, kp++) {
01325             if (kp > kb1) kp = ka1;  /* wrap around edge of grid */
01326             qsum += phi[k] * qzd[kp];
01327           }
01328         }
01329         q2h[index_q2] = qsum;
01330       } /* for k2 */
01331 
01332     } /* for j2 */
01333 
01334   } /* for i2 */
01335 
01336   return NL_MSM_SUCCESS;
01337 } /* restriction_factored */
01338 
01339 
01340 int prolongation_factored(NL_Msm *msm, int level) {
01341   /* grids of potential, finer grid and coarser grid */
01342   NL_Msmgrid_double *ehgrid = &(msm->eh[level]);
01343   double *eh = ehgrid->data;
01344   const NL_Msmgrid_double *e2hgrid = &(msm->eh[level+1]);
01345   const double *e2h = e2hgrid->data;
01346 
01347   /* finer grid index ranges and dimensions */
01348   const int ia1 = ehgrid->i0;             /* lowest x-index */
01349   const int ib1 = ia1 + ehgrid->ni - 1;   /* highest x-index */
01350   const int ja1 = ehgrid->j0;             /* lowest y-index */
01351   const int jb1 = ja1 + ehgrid->nj - 1;   /* highest y-index */
01352   const int ka1 = ehgrid->k0;             /* lowest z-index */
01353   const int kb1 = ka1 + ehgrid->nk - 1;   /* highest z-index */
01354   const int ni1 = ehgrid->ni;             /* length along x-dim */
01355   const int nj1 = ehgrid->nj;             /* length along y-dim */
01356   const int nk1 = ehgrid->nk;             /* length along z-dim */
01357 
01358   /* coarser grid index ranges and dimensions */
01359   const int ia2 = e2hgrid->i0;            /* lowest x-index */
01360   const int ib2 = ia2 + e2hgrid->ni - 1;  /* highest x-index */
01361   const int ja2 = e2hgrid->j0;            /* lowest y-index */
01362   const int jb2 = ja2 + e2hgrid->nj - 1;  /* highest y-index */
01363   const int ka2 = e2hgrid->k0;            /* lowest z-index */
01364   const int kb2 = ka2 + e2hgrid->nk - 1;  /* highest z-index */
01365   const int nrow_e2 = e2hgrid->ni;
01366   const int nstride_e2 = nrow_e2 * e2hgrid->nj;
01367 
01368   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01369   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01370   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01371 
01372   /* set buffer using indexing offset, so that indexing matches eh grid */
01373   double *ezd = msm->lzd + (-ka1);
01374   double *eyzd = msm->lyzd + (-ka1*nj1 + -ja1);
01375 
01376   const size_t size_lzd = nk1 * sizeof(double);
01377   const size_t size_lyzd = nj1 * nk1 * sizeof(double);
01378 
01379   const double *phi = NULL;
01380 
01381   int i2, j2, k2;
01382   int im, jm, km;
01383   int i, j, k;
01384   int index_plane_e2, index_e2;
01385   int index_jk, offset_k;
01386   int offset;
01387 
01388   const double *phi_factored = PhiStencilFactored[msm->approx];
01389   const int r_stencil = PolyDegree[msm->approx];  /* "radius" of stencil */
01390   const int diam_stencil = 2*r_stencil + 1;       /* "diameter" of stencil */
01391 
01392   for (i2 = ia2;  i2 <= ib2;  i2++) {
01393     memset(msm->lyzd, 0, size_lyzd);
01394 
01395     for (j2 = ja2;  j2 <= jb2;  j2++) {
01396       memset(msm->lzd, 0, size_lzd);
01397       index_plane_e2 = j2 * nrow_e2 + i2;
01398 
01399       for (k2 = ka2;  k2 <= kb2;  k2++) {
01400         index_e2 = k2 * nstride_e2 + index_plane_e2;
01401         km = (k2 << 1);  /* = 2*k2 */
01402         if ( ! ispz ) {  /* nonperiodic */
01403           int lower = km - r_stencil;
01404           int upper = km + r_stencil;
01405           if (lower < ka1) lower = ka1;
01406           if (upper > kb1) upper = kb1;  /* clip edges */
01407           phi = phi_factored + r_stencil;  /* center of stencil */
01408           for (k = lower;  k <= upper;  k++) {
01409             ezd[k] += phi[k-km] * e2h[index_e2];
01410           }
01411         }
01412         else {  /* periodic */
01413           int kp = km - r_stencil;  /* index at left end of stencil */
01414           if (kp < ka1) do { kp += nk1; } while (kp < ka1);  /* start inside */
01415           phi = phi_factored;  /* left end of stencil */
01416           for (k = 0;  k < diam_stencil;  k++, kp++) {
01417             if (kp > kb1) kp = ka1;  /* wrap around edge of grid */
01418             ezd[kp] += phi[k] * e2h[index_e2];
01419           }
01420         }
01421       } /* for k2 */
01422 
01423       for (k = ka1;  k <= kb1;  k++) {
01424         offset = k * nj1;
01425         jm = (j2 << 1);  /* = 2*j2 */
01426         if ( ! ispy ) {  /* nonperiodic */
01427           int lower = jm - r_stencil;
01428           int upper = jm + r_stencil;
01429           if (lower < ja1) lower = ja1;
01430           if (upper > jb1) upper = jb1;  /* clip edges */
01431           phi = phi_factored + r_stencil;  /* center of stencil */
01432           for (j = lower;  j <= upper;  j++) {
01433             eyzd[offset + j] += phi[j-jm] * ezd[k];
01434           }
01435         }
01436         else {  /* periodic */
01437           int jp = jm - r_stencil;  /* index at left end of stencil */
01438           if (jp < ja1) do { jp += nj1; } while (jp < ja1);  /* start inside */
01439           phi = phi_factored;  /* left end of stencil */
01440           for (j = 0;  j < diam_stencil;  j++, jp++) {
01441             if (jp > jb1) jp = ja1;  /* wrap around edge of grid */
01442             eyzd[offset + jp] += phi[j] * ezd[k];
01443           }
01444         }
01445       } /* for k */
01446 
01447     } /* for j2 */
01448 
01449     for (k = ka1;  k <= kb1;  k++) {
01450       offset_k = k * nj1;
01451 
01452       for (j = ja1;  j <= jb1;  j++) {
01453         index_jk = offset_k + j;
01454         offset = index_jk * ni1;
01455         im = (i2 << 1);  /* = 2*i2 */
01456         if ( ! ispx ) {  /* nonperiodic */
01457           int lower = im - r_stencil;
01458           int upper = im + r_stencil;
01459           if (lower < ia1) lower = ia1;
01460           if (upper > ib1) upper = ib1;  /* clip edges */
01461           phi = phi_factored + r_stencil;  /* center of stencil */
01462           for (i = lower;  i <= upper;  i++) {
01463             eh[offset + i] += phi[i-im] * eyzd[index_jk];
01464           }
01465         }
01466         else {  /* periodic */
01467           int ip = im - r_stencil;  /* index at left end of stencil */
01468           if (ip < ia1) do { ip += ni1; } while (ip < ia1);  /* start inside */
01469           phi = phi_factored;  /* left end of stencil */
01470           for (i = 0;  i < diam_stencil;  i++, ip++) {
01471             if (ip > ib1) ip = ia1;  /* wrap around edge of grid */
01472             eh[offset + ip] += phi[i] * eyzd[index_jk];
01473           }
01474         }
01475       } /* for j */
01476 
01477     } /* for k */
01478 
01479   } /* for i2 */
01480 
01481   return NL_MSM_SUCCESS;
01482 } /* prolongation_factored */
01483 
01484 
01485 int restriction(NL_Msm *msm, int level) {
01486   /* grids of charge, finer grid and coarser grid */
01487   const NL_Msmgrid_double *qhgrid = &(msm->qh[level]);
01488   const double *qh = qhgrid->data;        /* index the offset data buffer */
01489   NL_Msmgrid_double *q2hgrid = &(msm->qh[level+1]);
01490   double *q2h_buffer = q2hgrid->buffer;   /* index the raw buffer */
01491 
01492   /* finer grid index ranges and dimensions */
01493   const int ia1 = qhgrid->i0;             /* lowest x-index */
01494   const int ib1 = ia1 + qhgrid->ni - 1;   /* highest x-index */
01495   const int ja1 = qhgrid->j0;             /* lowest y-index */
01496   const int jb1 = ja1 + qhgrid->nj - 1;   /* highest y-index */
01497   const int ka1 = qhgrid->k0;             /* lowest z-index */
01498   const int kb1 = ka1 + qhgrid->nk - 1;   /* highest z-index */
01499   const int ni1 = qhgrid->ni;             /* length along x-dim */
01500   const int nj1 = qhgrid->nj;             /* length along y-dim */
01501   const int nk1 = qhgrid->nk;             /* length along z-dim */
01502 
01503   /* coarser grid index ranges and dimensions */
01504   const int ia2 = q2hgrid->i0;            /* lowest x-index */
01505   const int ib2 = ia2 + q2hgrid->ni - 1;  /* highest x-index */
01506   const int ja2 = q2hgrid->j0;            /* lowest y-index */
01507   const int jb2 = ja2 + q2hgrid->nj - 1;  /* highest y-index */
01508   const int ka2 = q2hgrid->k0;            /* lowest z-index */
01509   const int kb2 = ka2 + q2hgrid->nk - 1;  /* highest z-index */
01510 
01511   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01512   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01513   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01514 
01515   const int nstencil = Nstencil[msm->approx];
01516   const int *offset = IndexOffset[msm->approx];
01517   const double *phi = PhiStencil[msm->approx];
01518 
01519   double q2h_sum, cjk;
01520 
01521   int i1, j1, k1, index1, jk1off, k1off;
01522   int i2, j2, k2;
01523   int index2;
01524   int i, j, k;
01525 
01526   for (index2 = 0, k2 = ka2;  k2 <= kb2;  k2++) {
01527     k1 = k2 * 2;
01528     for (j2 = ja2;  j2 <= jb2;  j2++) {
01529       j1 = j2 * 2;
01530       for (i2 = ia2;  i2 <= ib2;  i2++, index2++) {
01531         i1 = i2 * 2;
01532 
01533         q2h_sum = 0;
01534         for (k = 0;  k < nstencil;  k++) {
01535           k1off = k1 + offset[k];
01536           if (k1off < ka1) {
01537             if (ispz) do { k1off += nk1; } while (k1off < ka1);
01538             else continue;
01539           }
01540           else if (k1off > kb1) {
01541             if (ispz) do { k1off -= nk1; } while (k1off > kb1);
01542             else break;
01543           }
01544           k1off *= nj1;
01545           for (j = 0;  j < nstencil;  j++) {
01546             jk1off = j1 + offset[j];
01547             if (jk1off < ja1) {
01548               if (ispy) do { jk1off += nj1; } while (jk1off < ja1);
01549               else continue;
01550             }
01551             else if (jk1off > jb1) {
01552               if (ispy) do { jk1off -= nj1; } while (jk1off > jb1);
01553               else break;
01554             }
01555             jk1off = (k1off + jk1off) * ni1;
01556             cjk = phi[j] * phi[k];
01557             for (i = 0;  i < nstencil;  i++) {
01558               index1 = i1 + offset[i];
01559               if (index1 < ia1) {
01560                 if (ispx) do { index1 += ni1; } while (index1 < ia1);
01561                 else continue;
01562               }
01563               else if (index1 > ib1) {
01564                 if (ispx) do { index1 -= ni1; } while (index1 > ib1);
01565                 else break;
01566               }
01567               index1 += jk1off;
01568               q2h_sum += qh[index1] * phi[i] * cjk;
01569             }
01570           }
01571         } /* end loop over finer grid stencil */
01572 
01573         q2h_buffer[index2] = q2h_sum;  /* store charge to coarser grid */
01574       }
01575     }
01576   } /* end loop over each coarser grid point */
01577 
01578   return NL_MSM_SUCCESS;
01579 }
01580 
01581 
01582 int prolongation(NL_Msm *msm, int level) {
01583   /* grids of charge, finer grid and coarser grid */
01584   NL_Msmgrid_double *ehgrid = &(msm->eh[level]);
01585   double *eh = ehgrid->data;        /* index the offset data buffer */
01586   const NL_Msmgrid_double *e2hgrid = &(msm->eh[level+1]);
01587   const double *e2h_buffer = e2hgrid->buffer;   /* index the raw buffer */
01588 
01589   /* finer grid index ranges and dimensions */
01590   const int ia1 = ehgrid->i0;             /* lowest x-index */
01591   const int ib1 = ia1 + ehgrid->ni - 1;   /* highest x-index */
01592   const int ja1 = ehgrid->j0;             /* lowest y-index */
01593   const int jb1 = ja1 + ehgrid->nj - 1;   /* highest y-index */
01594   const int ka1 = ehgrid->k0;             /* lowest z-index */
01595   const int kb1 = ka1 + ehgrid->nk - 1;   /* highest z-index */
01596   const int ni1 = ehgrid->ni;             /* length along x-dim */
01597   const int nj1 = ehgrid->nj;             /* length along y-dim */
01598   const int nk1 = ehgrid->nk;             /* length along z-dim */
01599 
01600   /* coarser grid index ranges and dimensions */
01601   const int ia2 = e2hgrid->i0;            /* lowest x-index */
01602   const int ib2 = ia2 + e2hgrid->ni - 1;  /* highest x-index */
01603   const int ja2 = e2hgrid->j0;            /* lowest y-index */
01604   const int jb2 = ja2 + e2hgrid->nj - 1;  /* highest y-index */
01605   const int ka2 = e2hgrid->k0;            /* lowest z-index */
01606   const int kb2 = ka2 + e2hgrid->nk - 1;  /* highest z-index */
01607 
01608   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01609   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01610   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01611 
01612   const int nstencil = Nstencil[msm->approx];
01613   const int *offset = IndexOffset[msm->approx];
01614   const double *phi = PhiStencil[msm->approx];
01615 
01616   double cjk;
01617 
01618   int i1, j1, k1, index1, jk1off, k1off;
01619   int i2, j2, k2;
01620   int index2;
01621   int i, j, k;
01622 
01623   for (index2 = 0, k2 = ka2;  k2 <= kb2;  k2++) {
01624     k1 = k2 * 2;
01625     for (j2 = ja2;  j2 <= jb2;  j2++) {
01626       j1 = j2 * 2;
01627       for (i2 = ia2;  i2 <= ib2;  i2++, index2++) {
01628         i1 = i2 * 2;
01629 
01630         for (k = 0;  k < nstencil;  k++) {
01631           k1off = k1 + offset[k];
01632           if (k1off < ka1) {
01633             if (ispz) do { k1off += nk1; } while (k1off < ka1);
01634             else continue;
01635           }
01636           else if (k1off > kb1) {
01637             if (ispz) do { k1off -= nk1; } while (k1off > kb1);
01638             else break;
01639           }
01640           k1off *= nj1;
01641           for (j = 0;  j < nstencil;  j++) {
01642             jk1off = j1 + offset[j];
01643             if (jk1off < ja1) {
01644               if (ispy) do { jk1off += nj1; } while (jk1off < ja1);
01645               else continue;
01646             }
01647             else if (jk1off > jb1) {
01648               if (ispy) do { jk1off -= nj1; } while (jk1off > jb1);
01649               else break;
01650             }
01651             jk1off = (k1off + jk1off) * ni1;
01652             cjk = phi[j] * phi[k];
01653             for (i = 0;  i < nstencil;  i++) {
01654               index1 = i1 + offset[i];
01655               if (index1 < ia1) {
01656                 if (ispx) do { index1 += ni1; } while (index1 < ia1);
01657                 else continue;
01658               }
01659               else if (index1 > ib1) {
01660                 if (ispx) do { index1 -= ni1; } while (index1 > ib1);
01661                 else break;
01662               }
01663               index1 += jk1off;
01664               eh[index1] += e2h_buffer[index2] * phi[i] * cjk;
01665             }
01666           }
01667         } /* end loop over finer grid stencil */
01668 
01669       }
01670     }
01671   } /* end loop over each coarser grid point */
01672 
01673   return NL_MSM_SUCCESS;
01674 }
01675 
01676 
01677 int gridcutoff(NL_Msm *msm, int level)
01678 {
01679   double eh_sum;
01680 
01681   /* grids of charge and potential */
01682   const NL_Msmgrid_double *qhgrid = &(msm->qh[level]);
01683   const double *qh = qhgrid->data;
01684   NL_Msmgrid_double *ehgrid = &(msm->eh[level]);
01685   double *eh = ehgrid->data;
01686   const int ia = qhgrid->i0;            /* lowest x-index */
01687   const int ib = ia + qhgrid->ni - 1;   /* highest x-index */
01688   const int ja = qhgrid->j0;            /* lowest y-index */
01689   const int jb = ja + qhgrid->nj - 1;   /* highest y-index */
01690   const int ka = qhgrid->k0;            /* lowest z-index */
01691   const int kb = ka + qhgrid->nk - 1;   /* highest z-index */
01692   const int ni = qhgrid->ni;            /* length along x-dim */
01693   const int nj = qhgrid->nj;            /* length along y-dim */
01694   const int nk = qhgrid->nk;            /* length along z-dim */
01695 
01696   /* grid of weights for pairwise grid point interactions within cutoff */
01697   const NL_Msmgrid_double *gcgrid = &(msm->gc[level]);
01698   const double *gc = gcgrid->data;
01699   const int gia = gcgrid->i0;            /* lowest x-index */
01700   const int gib = gia + gcgrid->ni - 1;  /* highest x-index */
01701   const int gja = gcgrid->j0;            /* lowest y-index */
01702   const int gjb = gja + gcgrid->nj - 1;  /* highest y-index */
01703   const int gka = gcgrid->k0;            /* lowest z-index */
01704   const int gkb = gka + gcgrid->nk - 1;  /* highest z-index */
01705   const int gni = gcgrid->ni;            /* length along x-dim */
01706   const int gnj = gcgrid->nj;            /* length along y-dim */
01707 
01708   const int ispx = (msm->msmflags & NL_MSM_PERIODIC_VEC1);
01709   const int ispy = (msm->msmflags & NL_MSM_PERIODIC_VEC2);
01710   const int ispz = (msm->msmflags & NL_MSM_PERIODIC_VEC3);
01711 
01712   const int ispnone = !(ispx || ispy || ispz);
01713 
01714   int i, j, k;
01715   int gia_clip, gib_clip;
01716   int gja_clip, gjb_clip;
01717   int gka_clip, gkb_clip;
01718   int koff;
01719   long jkoff, index;
01720   int id, jd, kd;
01721   int knoff;
01722   long jknoff, nindex;
01723   int kgoff, jkgoff, ngindex;
01724 
01725   if ( ispnone ) {  /* non-periodic boundaries */
01726 
01727     /* loop over all grid points */
01728     for (k = ka;  k <= kb;  k++) {
01729 
01730       /* clip gc ranges to keep offset for k index within grid */
01731       gka_clip = (k + gka < ka ? ka - k : gka);
01732       gkb_clip = (k + gkb > kb ? kb - k : gkb);
01733 
01734       koff = k * nj;  /* find eh flat index */
01735 
01736       for (j = ja;  j <= jb;  j++) {
01737 
01738         /* clip gc ranges to keep offset for j index within grid */
01739         gja_clip = (j + gja < ja ? ja - j : gja);
01740         gjb_clip = (j + gjb > jb ? jb - j : gjb);
01741 
01742         jkoff = (koff + j) * ni;  /* find eh flat index */
01743 
01744         for (i = ia;  i <= ib;  i++) {
01745 
01746           /* clip gc ranges to keep offset for i index within grid */
01747           gia_clip = (i + gia < ia ? ia - i : gia);
01748           gib_clip = (i + gib > ib ? ib - i : gib);
01749 
01750           index = jkoff + i;  /* eh flat index */
01751 
01752           /* sum over "sphere" of weighted charge */
01753           eh_sum = 0;
01754           for (kd = gka_clip;  kd <= gkb_clip;  kd++) {
01755             knoff = (k + kd) * nj;  /* find qh flat index */
01756             kgoff = kd * gnj;       /* find gc flat index */
01757 
01758             for (jd = gja_clip;  jd <= gjb_clip;  jd++) {
01759               jknoff = (knoff + (j + jd)) * ni;  /* find qh flat index */
01760               jkgoff = (kgoff + jd) * gni;       /* find gc flat index */
01761 
01762               for (id = gia_clip;  id <= gib_clip;  id++) {
01763                 nindex = jknoff + (i + id);  /* qh flat index */
01764                 ngindex = jkgoff + id;       /* gc flat index */
01765 
01766                 GRID_INDEX_CHECK(qhgrid, i+id, j+jd, k+kd);
01767                 ASSERT(GRID_INDEX(qhgrid, i+id, j+jd, k+kd) == nindex);
01768 
01769                 GRID_INDEX_CHECK(gcgrid, id, jd, kd);
01770                 ASSERT(GRID_INDEX(gcgrid, id, jd, kd) == ngindex);
01771 
01772                 eh_sum += qh[nindex] * gc[ngindex];  /* sum weighted charge */
01773               }
01774             }
01775           } /* end loop over "sphere" of charge */
01776 
01777           GRID_INDEX_CHECK(ehgrid, i, j, k);
01778           ASSERT(GRID_INDEX(ehgrid, i, j, k) == index);
01779           eh[index] = eh_sum;  /* store potential */
01780         }
01781       }
01782     } /* end loop over all grid points */
01783 
01784   } /* if nonperiodic boundaries */
01785   else {
01786     /* some boundary is periodic */
01787     int ilo, jlo, klo;
01788     int ip, jp, kp;
01789 
01790     /* loop over all grid points */
01791     for (k = ka;  k <= kb;  k++) {
01792       klo = k + gka;
01793       if ( ! ispz ) {  /* nonperiodic z */
01794         /* clip gc ranges to keep offset for k index within grid */
01795         gka_clip = (k + gka < ka ? ka - k : gka);
01796         gkb_clip = (k + gkb > kb ? kb - k : gkb);
01797         if (klo < ka) klo = ka;  /* keep lowest qh index within grid */
01798       }
01799       else {  /* periodic z */
01800         gka_clip = gka;
01801         gkb_clip = gkb;
01802         if (klo < ka) do { klo += nk; } while (klo < ka);
01803       }
01804       ASSERT(klo <= kb);
01805 
01806       koff = k * nj;  /* find eh flat index */
01807 
01808       for (j = ja;  j <= jb;  j++) {
01809         jlo = j + gja;
01810         if ( ! ispy ) {  /* nonperiodic y */
01811           /* clip gc ranges to keep offset for j index within grid */
01812           gja_clip = (j + gja < ja ? ja - j : gja);
01813           gjb_clip = (j + gjb > jb ? jb - j : gjb);
01814           if (jlo < ja) jlo = ja;  /* keep lowest qh index within grid */
01815         }
01816         else {  /* periodic y */
01817           gja_clip = gja;
01818           gjb_clip = gjb;
01819           if (jlo < ja) do { jlo += nj; } while (jlo < ja);
01820         }
01821         ASSERT(jlo <= jb);
01822 
01823         jkoff = (koff + j) * ni;  /* find eh flat index */
01824 
01825         for (i = ia;  i <= ib;  i++) {
01826           ilo = i + gia;
01827           if ( ! ispx ) {  /* nonperiodic x */
01828             /* clip gc ranges to keep offset for i index within grid */
01829             gia_clip = (i + gia < ia ? ia - i : gia);
01830             gib_clip = (i + gib > ib ? ib - i : gib);
01831             if (ilo < ia) ilo = ia;  /* keep lowest qh index within grid */
01832           }
01833           else {  /* periodic x */
01834             gia_clip = gia;
01835             gib_clip = gib;
01836             if (ilo < ia) do { ilo += ni; } while (ilo < ia);
01837           }
01838           ASSERT(ilo <= ib);
01839 
01840           index = jkoff + i;  /* eh flat index */
01841 
01842           /* sum over "sphere" of weighted charge */
01843           eh_sum = 0;
01844           for (kd = gka_clip, kp = klo;  kd <= gkb_clip;  kd++, kp++) {
01845             /* clipping makes conditional always fail for nonperiodic */
01846             if (kp > kb) kp = ka;  /* wrap z direction */
01847             knoff = kp * nj;       /* find qh flat index */
01848             kgoff = kd * gnj;      /* find gc flat index */
01849 
01850             for (jd = gja_clip, jp = jlo;  jd <= gjb_clip;  jd++, jp++) {
01851               /* clipping makes conditional always fail for nonperiodic */
01852               if (jp > jb) jp = ja;              /* wrap y direction */
01853               jknoff = (knoff + jp) * ni;  /* find qh flat index */
01854               jkgoff = (kgoff + jd) * gni;       /* find gc flat index */
01855 
01856               for (id = gia_clip, ip = ilo;  id <= gib_clip;  id++, ip++) {
01857                 /* clipping makes conditional always fail for nonperiodic */
01858                 if (ip > ib) ip = ia;   /* wrap x direction */
01859                 nindex = jknoff +  ip;  /* qh flat index */
01860                 ngindex = jkgoff + id;  /* gc flat index */
01861 
01862                 GRID_INDEX_CHECK(qhgrid, ip, jp, kp);
01863                 ASSERT(GRID_INDEX(qhgrid, ip, jp, kp) == nindex);
01864 
01865                 GRID_INDEX_CHECK(gcgrid, id, jd, kd);
01866                 ASSERT(GRID_INDEX(gcgrid, id, jd, kd) == ngindex);
01867 
01868                 eh_sum += qh[nindex] * gc[ngindex];  /* sum weighted charge */
01869 
01870               }
01871             }
01872           } /* end loop over "sphere" of charge */
01873 
01874           GRID_INDEX_CHECK(ehgrid, i, j, k);
01875           ASSERT(GRID_INDEX(ehgrid, i, j, k) == index);
01876           eh[index] = eh_sum;  /* store potential */
01877         }
01878       }
01879     } /* end loop over all grid points */
01880 
01881   } /* else some boundary is periodic */
01882 
01883   return NL_MSM_SUCCESS;
01884 }

Generated on Tue Sep 26 01:17:14 2017 for NAMD by  doxygen 1.4.7