67 printf(
"MSM anterpolation: %6.3f sec\n", time_delta);
71 if (use_nonfactored) {
72 for (level = 0; level < nlevels - 1; level++) {
78 for (level = 0; level < nlevels - 1; level++) {
86 printf(
"MSM restriction: %6.3f sec\n", time_delta);
90 if (use_cuda_gridcutoff && nlevels > 1) {
96 printf(
"Falling back on CPU for grid cutoff computation\n");
97 use_cuda_gridcutoff = 0;
103 printf(
"Falling back on CPU for grid cutoff computation\n");
104 use_cuda_gridcutoff = 0;
110 if ( ! use_cuda_gridcutoff ) {
111 for (level = 0; level < nlevels - 1; level++) {
123 printf(
"MSM grid cutoff: %6.3f sec\n", time_delta);
127 if (use_nonfactored) {
128 for (level--; level >= 0; level--) {
134 for (level--; level >= 0; level--) {
142 printf(
"MSM prolongation: %6.3f sec\n", time_delta);
151 printf(
"MSM interpolation: %6.3f sec\n", time_delta);
164 3, 5, 5, 7, 7, 9, 9, 3,
172 {-1.f/16, 0, 9.f/16, 1, 9.f/16, 0, -1.f/16},
175 {3.f/256, 0, -25.f/256, 0, 75.f/128, 1, 75.f/128, 0, -25.f/256, 0, 3.f/256},
178 {3.f/256, 0, -25.f/256, 0, 75.f/128, 1, 75.f/128, 0, -25.f/256, 0, 3.f/256},
181 { -5.f/2048, 0, 49.f/2048, 0, -245.f/2048, 0, 1225.f/2048, 1, 1225.f/2048,
182 0, -245.f/2048, 0, 49.f/2048, 0, -5.f/2048 },
185 { -5.f/2048, 0, 49.f/2048, 0, -245.f/2048, 0, 1225.f/2048, 1, 1225.f/2048,
186 0, -245.f/2048, 0, 49.f/2048, 0, -5.f/2048 },
189 { 35.f/65536, 0, -405.f/65536, 0, 567.f/16384, 0, -2205.f/16384, 0,
190 19845.f/32768, 1, 19845.f/32768, 0, -2205.f/16384, 0, 567.f/16384, 0,
191 -405.f/65536, 0, 35.f/65536 },
194 { 35.f/65536, 0, -405.f/65536, 0, 567.f/16384, 0, -2205.f/16384, 0,
195 19845.f/32768, 1, 19845.f/32768, 0, -2205.f/16384, 0, 567.f/16384, 0,
196 -405.f/65536, 0, 35.f/65536 },
199 { 1.f/48, 1.f/6, 23.f/48, 2.f/3, 23.f/48, 1.f/6, 1.f/48 },
209 5, 7, 7, 9, 9, 11, 11, 7
219 {-5, -3, -1, 0, 1, 3, 5},
222 {-5, -3, -1, 0, 1, 3, 5},
225 {-7, -5, -3, -1, 0, 1, 3, 5, 7},
228 {-7, -5, -3, -1, 0, 1, 3, 5, 7},
231 {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
234 {-9, -7, -5, -3, -1, 0, 1, 3, 5, 7, 9},
237 {-3, -2, -1, 0, 1, 2, 3},
244 {-1.f/16, 9.f/16, 1, 9.f/16, -1.f/16},
247 {3.f/256, -25.f/256, 75.f/128, 1, 75.f/128, -25.f/256, 3.f/256},
250 {3.f/256, -25.f/256, 75.f/128, 1, 75.f/128, -25.f/256, 3.f/256},
253 { -5.f/2048, 49.f/2048, -245.f/2048, 1225.f/2048, 1, 1225.f/2048,
254 -245.f/2048, 49.f/2048, -5.f/2048 },
257 { -5.f/2048, 49.f/2048, -245.f/2048, 1225.f/2048, 1, 1225.f/2048,
258 -245.f/2048, 49.f/2048, -5.f/2048 },
261 { 35.f/65536, -405.f/65536, 567.f/16384, -2205.f/16384,
262 19845.f/32768, 1, 19845.f/32768, -2205.f/16384, 567.f/16384,
263 -405.f/65536, 35.f/65536 },
266 { 35.f/65536, -405.f/65536, 567.f/16384, -2205.f/16384,
267 19845.f/32768, 1, 19845.f/32768, -2205.f/16384, 567.f/16384,
268 -405.f/65536, 35.f/65536 },
271 { 1.f/48, 1.f/6, 23.f/48, 2.f/3, 23.f/48, 1.f/6, 1.f/48 },
280 #define STENCIL_1D(_phi, _delta, _approx) \
285 case NL_MSM_APPROX_CUBIC: \
286 phi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
288 phi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
290 phi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
292 phi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
294 case NL_MSM_APPROX_QUINTIC: \
295 phi[0] = (1.f/24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t); \
297 phi[1] = (1-t) * (2-t) * (3-t) * ((1.f/6) + t * (0.375f - (5.f/24)*t));\
299 phi[2] = (1-t*t) * (2-t) * (0.5f + t * (0.25f - (5.f/12)*t)); \
301 phi[3] = (1-t*t) * (2+t) * (0.5f - t * (0.25f + (5.f/12)*t)); \
303 phi[4] = (1+t) * (2+t) * (3+t) * ((1.f/6) - t * (0.375f + (5.f/24)*t));\
305 phi[5] = (1.f/24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t); \
307 case NL_MSM_APPROX_QUINTIC2: \
308 phi[0] = (1.f/24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8); \
310 phi[1] = (-1.f/24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25))); \
312 phi[2] = (1.f/12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25)))); \
314 phi[3] = (1.f/12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25)))); \
316 phi[4] = (-1.f/24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25))); \
318 phi[5] = (1.f/24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8); \
320 case NL_MSM_APPROX_SEPTIC: \
321 phi[0] = (-1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6); \
323 phi[1] = (1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t)); \
325 phi[2] = (-1.f/240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t)); \
327 phi[3] = (1.f/144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t)); \
329 phi[4] = (-1.f/144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t)); \
331 phi[5] = (1.f/240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t)); \
333 phi[6] = (-1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t)); \
335 phi[7] = (1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6); \
337 case NL_MSM_APPROX_SEPTIC3: \
338 phi[0] = (3632.f/5) + t*((-7456.f/5) + t*((58786.f/45) + t*(-633 \
339 + t*((26383.f/144) + t*((-22807.f/720) + t*((727.f/240) \
340 + t*(-89.f/720))))))); \
342 phi[1] = -440 + t*((25949.f/20) + t*((-117131.f/72) + t*((2247.f/2) \
343 + t*((-66437.f/144) + t*((81109.f/720) + t*((-727.f/48) \
344 + t*(623.f/720))))))); \
346 phi[2] = (138.f/5) + t*((-8617.f/60) + t*((12873.f/40) + t*((-791.f/2) \
347 + t*((4557.f/16) + t*((-9583.f/80) + t*((2181.f/80) \
348 + t*(-623.f/240))))))); \
350 phi[3] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((2569.f/144) \
351 + t*((-727.f/48) + t*(623.f/144))))); \
353 phi[4] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((-2569.f/144) \
354 + t*((-727.f/48) + t*(-623.f/144))))); \
356 phi[5] = (138.f/5) + t*((8617.f/60) + t*((12873.f/40) + t*((791.f/2) \
357 + t*((4557.f/16) + t*((9583.f/80) + t*((2181.f/80) \
358 + t*(623.f/240))))))); \
360 phi[6] = -440 + t*((-25949.f/20) + t*((-117131.f/72) + t*((-2247.f/2) \
361 + t*((-66437.f/144) + t*((-81109.f/720) + t*((-727.f/48) \
362 + t*(-623.f/720))))))); \
364 phi[7] = (3632.f/5) + t*((7456.f/5) + t*((58786.f/45) + t*(633 \
365 + t*((26383.f/144) + t*((22807.f/720) + t*((727.f/240) \
366 + t*(89.f/720))))))); \
368 case NL_MSM_APPROX_NONIC: \
369 phi[0] = (-1.f/40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)* \
372 phi[1] = (1.f/40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)* \
375 phi[2] = (-1.f/10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)* \
378 phi[3] = (1.f/1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)* \
381 phi[4] = (-1.f/2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)* \
384 phi[5] = (1.f/2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)* \
387 phi[6] = (-1.f/1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)* \
390 phi[7] = (1.f/10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)* \
393 phi[8] = (-1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)* \
396 phi[9] = (1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)* \
399 case NL_MSM_APPROX_NONIC4: \
400 phi[0] = 439375.f/7+t*(-64188125.f/504+t*(231125375.f/2016 \
401 +t*(-17306975.f/288+t*(7761805.f/384+t*(-2895587.f/640 \
402 +t*(129391.f/192+t*(-259715.f/4032+t*(28909.f/8064 \
403 +t*(-3569.f/40320))))))))); \
405 phi[1] = -56375+t*(8314091.f/56+t*(-49901303.f/288+t*(3763529.f/32 \
406 +t*(-19648027.f/384+t*(9469163.f/640+t*(-545977.f/192 \
407 +t*(156927.f/448+t*(-28909.f/1152 \
408 +t*(3569.f/4480))))))))); \
410 phi[2] = 68776.f/7+t*(-1038011.f/28+t*(31157515.f/504+t*(-956669.f/16 \
411 +t*(3548009.f/96+t*(-2422263.f/160+t*(197255.f/48 \
412 +t*(-19959.f/28+t*(144545.f/2016 \
413 +t*(-3569.f/1120))))))))); \
415 phi[3] = -154+t*(12757.f/12+t*(-230123.f/72+t*(264481.f/48 \
416 +t*(-576499.f/96+t*(686147.f/160+t*(-96277.f/48 \
417 +t*(14221.f/24+t*(-28909.f/288+t*(3569.f/480))))))))); \
419 phi[4] = 1+t*t*(-205.f/144+t*t*(91.f/192+t*(-6181.f/320 \
420 +t*(6337.f/96+t*(-2745.f/32+t*(28909.f/576 \
421 +t*(-3569.f/320))))))); \
423 phi[5] = 1+t*t*(-205.f/144+t*t*(91.f/192+t*(6181.f/320 \
424 +t*(6337.f/96+t*(2745.f/32+t*(28909.f/576 \
425 +t*(3569.f/320))))))); \
427 phi[6] = -154+t*(-12757.f/12+t*(-230123.f/72+t*(-264481.f/48 \
428 +t*(-576499.f/96+t*(-686147.f/160+t*(-96277.f/48 \
429 +t*(-14221.f/24+t*(-28909.f/288+t*(-3569.f/480))))))))); \
431 phi[7] = 68776.f/7+t*(1038011.f/28+t*(31157515.f/504+t*(956669.f/16 \
432 +t*(3548009.f/96+t*(2422263.f/160+t*(197255.f/48 \
433 +t*(19959.f/28+t*(144545.f/2016+t*(3569.f/1120))))))))); \
435 phi[8] = -56375+t*(-8314091.f/56+t*(-49901303.f/288+t*(-3763529.f/32 \
436 +t*(-19648027.f/384+t*(-9469163.f/640+t*(-545977.f/192 \
437 +t*(-156927.f/448+t*(-28909.f/1152 \
438 +t*(-3569.f/4480))))))))); \
440 phi[9] = 439375.f/7+t*(64188125.f/504+t*(231125375.f/2016 \
441 +t*(17306975.f/288+t*(7761805.f/384+t*(2895587.f/640 \
442 +t*(129391.f/192+t*(259715.f/4032+t*(28909.f/8064 \
443 +t*(3569.f/40320))))))))); \
445 case NL_MSM_APPROX_BSPLINE: \
446 phi[0] = (1.f/6) * (2-t) * (2-t) * (2-t); \
448 phi[1] = (2.f/3) + t*t*(-1 + 0.5f*t); \
450 phi[2] = (2.f/3) + t*t*(-1 - 0.5f*t); \
452 phi[3] = (1.f/6) * (2+t) * (2+t) * (2+t); \
455 return NL_MSM_ERROR_SUPPORT; \
468 #define D_STENCIL_1D(_dphi, _phi, _h_1, _delta, _approx) \
470 float *dphi = _dphi; \
475 case NL_MSM_APPROX_CUBIC: \
476 phi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
477 dphi[0] = (1.5f * t - 2) * (2 - t) * h_1; \
479 phi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
480 dphi[1] = (-5 + 4.5f * t) * t * h_1; \
482 phi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
483 dphi[2] = (-5 - 4.5f * t) * t * h_1; \
485 phi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
486 dphi[3] = (1.5f * t + 2) * (2 + t) * h_1; \
488 case NL_MSM_APPROX_QUINTIC: \
489 phi[0] = (1.f/24) * (1-t) * (2-t) * (3-t) * (3-t) * (4-t); \
490 dphi[0] = ((-1.f/24) * ((3-t) * (3-t) * (14 + t * (-14 + 3*t)) \
491 + 2 * (1-t) * (2-t) * (3-t) * (4-t))) * h_1; \
493 phi[1] = (1-t) * (2-t) * (3-t) * ((1.f/6) + t * (0.375f - (5.f/24)*t));\
494 dphi[1] = (-((1.f/6) + t * (0.375f - (5.f/24)*t)) * (11 + t * (-12 + 3*t))\
495 + (1-t) * (2-t) * (3-t) * (0.375f - (5.f/12)*t)) * h_1; \
497 phi[2] = (1-t*t) * (2-t) * (0.5f + t * (0.25f - (5.f/12)*t)); \
498 dphi[2] = (-(0.5f + t * (0.25f - (5.f/12)*t)) * (1 + t * (4 - 3*t)) \
499 + (1-t*t) * (2-t) * (0.25f - (5.f/6)*t)) * h_1; \
501 phi[3] = (1-t*t) * (2+t) * (0.5f - t * (0.25f + (5.f/12)*t)); \
502 dphi[3] = ((0.5f + t * (-0.25f - (5.f/12)*t)) * (1 + t * (-4 - 3*t)) \
503 - (1-t*t) * (2+t) * (0.25f + (5.f/6)*t)) * h_1; \
505 phi[4] = (1+t) * (2+t) * (3+t) * ((1.f/6) - t * (0.375f + (5.f/24)*t));\
506 dphi[4] = (((1.f/6) + t * (-0.375f - (5.f/24)*t)) * (11 + t * (12 + 3*t)) \
507 - (1+t) * (2+t) * (3+t) * (0.375f + (5.f/12)*t)) * h_1; \
509 phi[5] = (1.f/24) * (1+t) * (2+t) * (3+t) * (3+t) * (4+t); \
510 dphi[5] = ((1.f/24) * ((3+t) * (3+t) * (14 + t * (14 + 3*t)) \
511 + 2 * (1+t) * (2+t) * (3+t) * (4+t))) * h_1; \
513 case NL_MSM_APPROX_QUINTIC2: \
514 phi[0] = (1.f/24) * (3-t) * (3-t) * (3-t) * (t-2) * (5*t-8); \
515 dphi[0] = ((1.f/24) * (3-t) * (3-t) * ((3-t)*(5*t-8) - 3*(t-2)*(5*t-8) \
516 + 5*(t-2)*(3-t))) * h_1; \
518 phi[1] = (-1.f/24) * (2-t) * (t-1) * (-48+t*(153+t*(-114+t*25))); \
519 dphi[1] = ((-1.f/24) * ((2-t)*(-48+t*(153+t*(-114+t*25))) - (t-1)* \
520 (-48+t*(153+t*(-114+t*25))) \
521 + (2-t)*(t-1)*(153+t*(-228+t*75)))) * h_1; \
523 phi[2] = (1.f/12) * (1-t) * (12+t*(12+t*(-3+t*(-38+t*25)))); \
524 dphi[2] = ((1.f/12) * (-(12+t*(12+t*(-3+t*(-38+t*25)))) \
525 + (1-t)*(12+t*(-6+t*(-114+t*100))))) * h_1; \
527 phi[3] = (1.f/12) * (1+t) * (12+t*(-12+t*(-3+t*(38+t*25)))); \
528 dphi[3] = ((1.f/12) * ((12+t*(-12+t*(-3+t*(38+t*25)))) \
529 + (1+t)*(-12+t*(-6+t*(114+t*100))))) * h_1; \
531 phi[4] = (-1.f/24) * (2+t) * (t+1) * (48+t*(153+t*(114+t*25))); \
532 dphi[4] = ((-1.f/24) * ((2+t)*(48+t*(153+t*(114+t*25))) + (t+1)* \
533 (48+t*(153+t*(114+t*25))) \
534 + (2+t)*(t+1)*(153+t*(228+t*75)))) * h_1; \
536 phi[5] = (1.f/24) * (3+t) * (3+t) * (3+t) * (t+2) * (5*t+8); \
537 dphi[5] = ((1.f/24) * (3+t) * (3+t) * ((3+t)*(5*t+8) + 3*(t+2)*(5*t+8) \
538 + 5*(t+2)*(3+t))) * h_1; \
540 case NL_MSM_APPROX_SEPTIC: \
541 phi[0] = (-1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-4)*(t-5)*(t-6); \
542 dphi[0] = (-1.f/720)*(t-4)*(-1944+t*(3644+t*(-2512+t*(807 \
543 +t*(-122+t*7))))) * h_1; \
545 phi[1] = (1.f/720)*(t-1)*(t-2)*(t-3)*(t-4)*(t-5)*(-6+t*(-20+7*t)); \
546 dphi[1] = (1.f/720)*(756+t*(-9940+t*(17724+t*(-12740+t*(4445 \
547 +t*(-750+t*49)))))) * h_1; \
549 phi[2] = (-1.f/240)*(t*t-1)*(t-2)*(t-3)*(t-4)*(-10+t*(-12+7*t)); \
550 dphi[2] = (-1.f/240)*(-28+t*(1260+t*(-756+t*(-1260+t*(1365 \
551 +t*(-450+t*49)))))) * h_1; \
553 phi[3] = (1.f/144)*(t*t-1)*(t*t-4)*(t-3)*(-12+t*(-4+7*t)); \
554 dphi[3] = (1.f/144)*t*(-560+t*(84+t*(644+t*(-175+t*(-150+t*49))))) * \
557 phi[4] = (-1.f/144)*(t*t-1)*(t*t-4)*(t+3)*(-12+t*(4+7*t)); \
558 dphi[4] = (-1.f/144)*t*(560+t*(84+t*(-644+t*(-175+t*(150+t*49))))) * \
561 phi[5] = (1.f/240)*(t*t-1)*(t+2)*(t+3)*(t+4)*(-10+t*(12+7*t)); \
562 dphi[5] = (1.f/240)*(-28+t*(-1260+t*(-756+t*(1260+t*(1365 \
563 +t*(450+t*49)))))) * h_1; \
565 phi[6] = (-1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(-6+t*(20+7*t)); \
566 dphi[6] = (-1.f/720)*(756+t*(9940+t*(17724+t*(12740+t*(4445 \
567 +t*(750+t*49)))))) * h_1; \
569 phi[7] = (1.f/720)*(t+1)*(t+2)*(t+3)*(t+4)*(t+4)*(t+5)*(t+6); \
570 dphi[7] = (1.f/720)*(t+4)*(1944+t*(3644+t*(2512+t*(807 \
571 +t*(122+t*7))))) * h_1; \
573 case NL_MSM_APPROX_SEPTIC3: \
574 phi[0] = (3632.f/5) + t*((-7456.f/5) + t*((58786.f/45) + t*(-633 \
575 + t*((26383.f/144) + t*((-22807.f/720) + t*((727.f/240) \
576 + t*(-89.f/720))))))); \
577 dphi[0] = ((-7456.f/5) + t*((117572.f/45) + t*(-1899 + t*((26383.f/36) \
578 + t*((-22807.f/144) + t*((727.f/40) + t*(-623.f/720)))))))*h_1; \
580 phi[1] = -440 + t*((25949.f/20) + t*((-117131.f/72) + t*((2247.f/2) \
581 + t*((-66437.f/144) + t*((81109.f/720) + t*((-727.f/48) \
582 + t*(623.f/720))))))); \
583 dphi[1] = ((25949.f/20) + t*((-117131.f/36) + t*((6741.f/2) \
584 + t*((-66437.f/36) + t*((81109.f/144) + t*((-727.f/8) \
585 + t*(4361.f/720))))))) * h_1; \
587 phi[2] = (138.f/5) + t*((-8617.f/60) + t*((12873.f/40) + t*((-791.f/2) \
588 + t*((4557.f/16) + t*((-9583.f/80) + t*((2181.f/80) \
589 + t*(-623.f/240))))))); \
590 dphi[2] = ((-8617.f/60) + t*((12873.f/20) + t*((-2373.f/2) + t*((4557.f/4) \
591 + t*((-9583.f/16) + t*((6543.f/40) + t*(-4361.f/240)))))))*h_1; \
593 phi[3] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((2569.f/144) \
594 + t*((-727.f/48) + t*(623.f/144))))); \
595 dphi[3] = (t*((-49.f/18) + t*t*((-959.f/36) + t*((12845.f/144) \
596 + t*((-727.f/8) + t*(4361.f/144)))))) * h_1; \
598 phi[4] = 1 + t*t*((-49.f/36) + t*t*((-959.f/144) + t*((-2569.f/144) \
599 + t*((-727.f/48) + t*(-623.f/144))))); \
600 dphi[4] = (t*((-49.f/18) + t*t*((-959.f/36) + t*((-12845.f/144) \
601 + t*((-727.f/8) + t*(-4361.f/144)))))) * h_1; \
603 phi[5] = (138.f/5) + t*((8617.f/60) + t*((12873.f/40) + t*((791.f/2) \
604 + t*((4557.f/16) + t*((9583.f/80) + t*((2181.f/80) \
605 + t*(623.f/240))))))); \
606 dphi[5] = ((8617.f/60) + t*((12873.f/20) + t*((2373.f/2) + t*((4557.f/4) \
607 + t*((9583.f/16) + t*((6543.f/40) + t*(4361.f/240)))))))*h_1; \
609 phi[6] = -440 + t*((-25949.f/20) + t*((-117131.f/72) + t*((-2247.f/2) \
610 + t*((-66437.f/144) + t*((-81109.f/720) + t*((-727.f/48) \
611 + t*(-623.f/720))))))); \
612 dphi[6] = ((-25949.f/20) + t*((-117131.f/36) + t*((-6741.f/2) \
613 + t*((-66437.f/36) + t*((-81109.f/144) + t*((-727.f/8) \
614 + t*(-4361.f/720))))))) * h_1; \
616 phi[7] = (3632.f/5) + t*((7456.f/5) + t*((58786.f/45) + t*(633 \
617 + t*((26383.f/144) + t*((22807.f/720) + t*((727.f/240) \
618 + t*(89.f/720))))))); \
619 dphi[7] = ((7456.f/5) + t*((117572.f/45) + t*(1899 + t*((26383.f/36) \
620 + t*((22807.f/144) + t*((727.f/40) + t*(623.f/720)))))))*h_1; \
622 case NL_MSM_APPROX_NONIC: \
623 phi[0] = (-1.f/40320)*(t-8)*(t-7)*(t-6)*(t-5)*(t-5)*(t-4)*(t-3)* \
625 dphi[0] = (-1.f/40320)*(t-5)*(-117648+t*(256552+t*(-221416 \
626 +t*(99340+t*(-25261+t*(3667+t*(-283+t*9)))))))*h_1; \
628 phi[1] = (1.f/40320)*(t-7)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)* \
630 dphi[1] = (1.f/40320)*(71856+t*(-795368+t*(1569240+t*(-1357692 \
631 +t*(634725+t*(-172116+t*(27090+t*(-2296+t*81))))))))*h_1; \
633 phi[2] = (-1.f/10080)*(t-6)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)* \
635 dphi[2] = (1.f/10080)*(3384+t*(-69080+t*(55026 \
636 +t*(62580+t*(-99225+t*(51660+t*(-13104+t*(1640 \
637 +t*(-81)))))))))*h_1; \
639 phi[3] = (1.f/1440)*(t-5)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)* \
641 dphi[3] = (1.f/1440)*(72+t*(-6344+t*(2070 \
642 +t*(7644+t*(-4725+t*(-828+t*(1260+t*(-328+t*27))))))))*h_1; \
644 phi[4] = (-1.f/2880)*(t-4)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)* \
646 dphi[4] = (-1.f/2880)*t*(10792+t*(-972+t*(-12516 \
647 +t*(2205+t*(3924+t*(-882+t*(-328+t*81)))))))*h_1; \
649 phi[5] = (1.f/2880)*(t-3)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)* \
651 dphi[5] = (1.f/2880)*t*(-10792+t*(-972+t*(12516 \
652 +t*(2205+t*(-3924+t*(-882+t*(328+t*81)))))))*h_1; \
654 phi[6] = (-1.f/1440)*(t-2)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)* \
656 dphi[6] = (1.f/1440)*(-72+t*(-6344+t*(-2070 \
657 +t*(7644+t*(4725+t*(-828+t*(-1260+t*(-328+t*(-27)))))))))*h_1; \
659 phi[7] = (1.f/10080)*(t-1)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)* \
661 dphi[7] = (1.f/10080)*(-3384+t*(-69080+t*(-55026 \
662 +t*(62580+t*(99225+t*(51660+t*(13104+t*(1640+t*81))))))))*h_1; \
664 phi[8] = (-1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+6)*(t+7)* \
666 dphi[8] = (-1.f/40320)*(71856+t*(795368+t*(1569240 \
667 +t*(1357692+t*(634725+t*(172116+t*(27090+t*(2296 \
670 phi[9] = (1.f/40320)*(t+1)*(t+2)*(t+3)*(t+4)*(t+5)*(t+5)*(t+6)* \
672 dphi[9] = (1.f/40320)*(t+5)*(117648+t*(256552+t*(221416 \
673 +t*(99340+t*(25261+t*(3667+t*(283+t*9)))))))*h_1; \
675 case NL_MSM_APPROX_NONIC4: \
676 phi[0] = 439375.f/7+t*(-64188125.f/504+t*(231125375.f/2016 \
677 +t*(-17306975.f/288+t*(7761805.f/384+t*(-2895587.f/640 \
678 +t*(129391.f/192+t*(-259715.f/4032+t*(28909.f/8064 \
679 +t*(-3569.f/40320))))))))); \
680 dphi[0] = (-64188125.f/504+t*(231125375.f/1008 \
681 +t*(-17306975.f/96+t*(7761805.f/96+t*(-2895587.f/128 \
682 +t*(129391.f/32+t*(-259715.f/576+t*(28909.f/1008 \
683 +t*(-3569.f/4480)))))))))*h_1; \
685 phi[1] = -56375+t*(8314091.f/56+t*(-49901303.f/288+t*(3763529.f/32 \
686 +t*(-19648027.f/384+t*(9469163.f/640+t*(-545977.f/192 \
687 +t*(156927.f/448+t*(-28909.f/1152 \
688 +t*(3569.f/4480))))))))); \
689 dphi[1] = (8314091.f/56+t*(-49901303.f/144+t*(11290587.f/32 \
690 +t*(-19648027.f/96+t*(9469163.f/128+t*(-545977.f/32 \
691 +t*(156927.f/64+t*(-28909.f/144 \
692 +t*(32121.f/4480)))))))))*h_1; \
694 phi[2] = 68776.f/7+t*(-1038011.f/28+t*(31157515.f/504+t*(-956669.f/16 \
695 +t*(3548009.f/96+t*(-2422263.f/160+t*(197255.f/48 \
696 +t*(-19959.f/28+t*(144545.f/2016 \
697 +t*(-3569.f/1120))))))))); \
698 dphi[2] = (-1038011.f/28+t*(31157515.f/252+t*(-2870007.f/16 \
699 +t*(3548009.f/24+t*(-2422263.f/32+t*(197255.f/8 \
700 +t*(-19959.f/4+t*(144545.f/252 \
701 +t*(-32121.f/1120)))))))))*h_1; \
703 phi[3] = -154+t*(12757.f/12+t*(-230123.f/72+t*(264481.f/48 \
704 +t*(-576499.f/96+t*(686147.f/160+t*(-96277.f/48 \
705 +t*(14221.f/24+t*(-28909.f/288+t*(3569.f/480))))))))); \
706 dphi[3] = (12757.f/12+t*(-230123.f/36+t*(264481.f/16 \
707 +t*(-576499.f/24+t*(686147.f/32+t*(-96277.f/8 \
708 +t*(99547.f/24+t*(-28909.f/36 \
709 +t*(10707.f/160)))))))))*h_1; \
711 phi[4] = 1+t*t*(-205.f/144+t*t*(91.f/192+t*(-6181.f/320 \
712 +t*(6337.f/96+t*(-2745.f/32+t*(28909.f/576 \
713 +t*(-3569.f/320))))))); \
714 dphi[4] = t*(-205.f/72+t*t*(91.f/48+t*(-6181.f/64 \
715 +t*(6337.f/16+t*(-19215.f/32+t*(28909.f/72 \
716 +t*(-32121.f/320)))))))*h_1; \
718 phi[5] = 1+t*t*(-205.f/144+t*t*(91.f/192+t*(6181.f/320 \
719 +t*(6337.f/96+t*(2745.f/32+t*(28909.f/576 \
720 +t*(3569.f/320))))))); \
721 dphi[5] = t*(-205.f/72+t*t*(91.f/48+t*(6181.f/64 \
722 +t*(6337.f/16+t*(19215.f/32+t*(28909.f/72 \
723 +t*(32121.f/320)))))))*h_1; \
725 phi[6] = -154+t*(-12757.f/12+t*(-230123.f/72+t*(-264481.f/48 \
726 +t*(-576499.f/96+t*(-686147.f/160+t*(-96277.f/48 \
727 +t*(-14221.f/24+t*(-28909.f/288+t*(-3569.f/480))))))))); \
728 dphi[6] = (-12757.f/12+t*(-230123.f/36+t*(-264481.f/16 \
729 +t*(-576499.f/24+t*(-686147.f/32+t*(-96277.f/8 \
730 +t*(-99547.f/24+t*(-28909.f/36 \
731 +t*(-10707.f/160)))))))))*h_1; \
733 phi[7] = 68776.f/7+t*(1038011.f/28+t*(31157515.f/504+t*(956669.f/16 \
734 +t*(3548009.f/96+t*(2422263.f/160+t*(197255.f/48 \
735 +t*(19959.f/28+t*(144545.f/2016+t*(3569.f/1120))))))))); \
736 dphi[7] = (1038011.f/28+t*(31157515.f/252+t*(2870007.f/16 \
737 +t*(3548009.f/24+t*(2422263.f/32+t*(197255.f/8 \
738 +t*(19959.f/4+t*(144545.f/252 \
739 +t*(32121.f/1120)))))))))*h_1; \
741 phi[8] = -56375+t*(-8314091.f/56+t*(-49901303.f/288+t*(-3763529.f/32 \
742 +t*(-19648027.f/384+t*(-9469163.f/640+t*(-545977.f/192 \
743 +t*(-156927.f/448+t*(-28909.f/1152 \
744 +t*(-3569.f/4480))))))))); \
745 dphi[8] = (-8314091.f/56+t*(-49901303.f/144+t*(-11290587.f/32 \
746 +t*(-19648027.f/96+t*(-9469163.f/128+t*(-545977.f/32 \
747 +t*(-156927.f/64+t*(-28909.f/144 \
748 +t*(-32121.f/4480)))))))))*h_1; \
750 phi[9] = 439375.f/7+t*(64188125.f/504+t*(231125375.f/2016 \
751 +t*(17306975.f/288+t*(7761805.f/384+t*(2895587.f/640 \
752 +t*(129391.f/192+t*(259715.f/4032+t*(28909.f/8064 \
753 +t*(3569.f/40320))))))))); \
754 dphi[9] = (64188125.f/504+t*(231125375.f/1008 \
755 +t*(17306975.f/96+t*(7761805.f/96+t*(2895587.f/128 \
756 +t*(129391.f/32+t*(259715.f/576+t*(28909.f/1008 \
757 +t*(3569.f/4480)))))))))*h_1; \
759 case NL_MSM_APPROX_BSPLINE: \
760 phi[0] = (1.f/6) * (2-t) * (2-t) * (2-t); \
761 dphi[0] = -0.5f * (2-t) * (2-t) * h_1; \
763 phi[1] = (2.f/3) + t*t*(-1 + 0.5f*t); \
764 dphi[1] = t*(-2 + 1.5f*t) * h_1; \
766 phi[2] = (2.f/3) + t*t*(-1 - 0.5f*t); \
767 dphi[2] = t*(-2 - 1.5f*t) * h_1; \
769 phi[3] = (1.f/6) * (2+t) * (2+t) * (2+t); \
770 dphi[3] = 0.5f * (2+t) * (2+t) * h_1; \
773 return NL_MSM_ERROR_SUPPORT; \
781 const float *atom = msm->
atom_f;
787 float rx_hx, ry_hy, rz_hz;
788 #ifndef TEST_INLINING
794 const float hx_1 = 1/msm->
hx_f;
795 const float hy_1 = 1/msm->
hy_f;
796 const float hz_1 = 1/msm->
hz_f;
797 const float xm0 = msm->
gx_f;
798 const float ym0 = msm->
gy_f;
799 const float zm0 = msm->
gz_f;
802 NL_Msmgrid_float *qhgrid = &(msm->
qh_f[0]);
803 float *qh = qhgrid->data;
804 const int ni = qhgrid->ni;
805 const int nj = qhgrid->nj;
806 const int nk = qhgrid->nk;
807 const int ia = qhgrid->i0;
808 const int ib = ia + ni - 1;
809 const int ja = qhgrid->j0;
810 const int jb = ja + nj - 1;
811 const int ka = qhgrid->k0;
812 const int kb = ka + nk - 1;
819 int n, i, j, k, ilo, jlo, klo, koff;
822 const int approx = msm->
approx;
824 const int s_edge = (
PolyDegree[approx] - 1) >> 1;
828 for (n = 0; n < natoms; n++) {
835 rx_hx = (atom[4*n ] - xm0) * hx_1;
836 ry_hy = (atom[4*n + 1] - ym0) * hy_1;
837 rz_hz = (atom[4*n + 2] - zm0) * hz_1;
840 ilo = (int) floorf(rx_hx) - s_edge;
841 jlo = (int) floorf(ry_hy) - s_edge;
842 klo = (int) floorf(rz_hz) - s_edge;
845 #ifndef TEST_INLINING
846 delta = rx_hx - (float) ilo;
848 delta = ry_hy - (float) jlo;
850 delta = rz_hz - (float) klo;
853 t = rx_hx - (float) ilo;
854 xphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
856 xphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
858 xphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
860 xphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
862 t = ry_hy - (double) jlo;
863 yphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
865 yphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
867 yphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
869 yphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
871 t = rz_hz - (double) klo;
872 zphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
874 zphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
876 zphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
878 zphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
883 iswithin = ( ia <= ilo && (ilo+(s_size-1)) <= ib &&
884 ja <= jlo && (jlo+(s_size-1)) <= jb &&
885 ka <= klo && (klo+(s_size-1)) <= kb );
890 for (k = 0; k < s_size; k++) {
891 koff = (k + klo) * nj;
893 for (j = 0; j < s_size; j++) {
894 jkoff = (koff + (j + jlo)) * ni;
896 for (i = 0; i < s_size; i++) {
897 index = jkoff + (i + ilo);
900 qh[index] += xphi[i] * cjk;
911 if (ilo < ia)
do { ilo += ni; }
while (ilo < ia);
912 else if (ilo > ib)
do { ilo -= ni; }
while (ilo > ib);
914 else if (ilo < ia || (ilo+(s_size-1)) > ib) {
919 if (jlo < ja)
do { jlo += nj; }
while (jlo < ja);
920 else if (jlo > jb)
do { jlo -= nj; }
while (jlo > jb);
922 else if (jlo < ja || (jlo+(s_size-1)) > jb) {
927 if (klo < ka)
do { klo += nk; }
while (klo < ka);
928 else if (klo > kb)
do { klo -= nk; }
while (klo > kb);
930 else if (klo < ka || (klo+(s_size-1)) > kb) {
935 for (k = 0, kp = klo; k < s_size; k++, kp++) {
936 if (kp > kb) kp = ka;
939 for (j = 0, jp = jlo; j < s_size; j++, jp++) {
940 if (jp > jb) jp = ja;
941 jkoff = (koff + jp) * ni;
943 for (i = 0, ip = ilo; i < s_size; i++, ip++) {
944 if (ip > ib) ip = ia;
948 qh[index] += xphi[i] * cjk;
963 const float *atom = msm->
atom_f;
972 float rx_hx, ry_hy, rz_hz;
973 #ifndef TEST_INLINING
978 const float hx_1 = 1/msm->
hx_f;
979 const float hy_1 = 1/msm->
hy_f;
980 const float hz_1 = 1/msm->
hz_f;
981 const float xm0 = msm->
gx_f;
982 const float ym0 = msm->
gy_f;
983 const float zm0 = msm->
gz_f;
987 const NL_Msmgrid_float *ehgrid = &(msm->
eh_f[0]);
988 const float *eh = ehgrid->data;
989 const float *ebuf = ehgrid->buffer;
990 const NL_Msmgrid_float *qhgrid = &(msm->
qh_f[0]);
991 const float *qbuf = qhgrid->buffer;
992 const int ni = ehgrid->ni;
993 const int nj = ehgrid->nj;
994 const int nk = ehgrid->nk;
995 const int ia = ehgrid->i0;
996 const int ib = ia + ni - 1;
997 const int ja = ehgrid->j0;
998 const int jb = ja + nj - 1;
999 const int ka = ehgrid->k0;
1000 const int kb = ka + nk - 1;
1007 float fx, fy, fz, cx, cy, cz;
1009 int n, i, j, k, ilo, jlo, klo, koff;
1011 const int nn = (ni*nj) * nk;
1013 const int approx = msm->
approx;
1015 const int s_edge = (
PolyDegree[approx] - 1) >> 1;
1019 printf(
"+++ eh[0,0,0] = %g\n", eh[0]);
1022 for (k = ka; k <= kb; k++) {
1023 for (j = ja; j <= jb; j++) {
1024 for (i = ia; i <= ib; i++, n++) {
1025 printf(
"=== eh[%d,%d,%d] = %g\n", i, j, k, ebuf[n]);
1032 for (n = 0; n < natoms; n++) {
1039 rx_hx = (atom[4*n ] - xm0) * hx_1;
1040 ry_hy = (atom[4*n + 1] - ym0) * hy_1;
1041 rz_hz = (atom[4*n + 2] - zm0) * hz_1;
1044 ilo = (int) floorf(rx_hx) - s_edge;
1045 jlo = (int) floorf(ry_hy) - s_edge;
1046 klo = (int) floorf(rz_hz) - s_edge;
1049 #ifndef TEST_INLINING
1050 delta = rx_hx - (float) ilo;
1053 delta = ry_hy - (float) jlo;
1056 delta = rz_hz - (float) klo;
1060 t = rx_hx - (float) ilo;
1061 xphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
1062 dxphi[0] = (1.5f * t - 2) * (2 - t) * hx_1; \
1064 xphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
1065 dxphi[1] = (-5 + 4.5f * t) * t * hx_1; \
1067 xphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
1068 dxphi[2] = (-5 - 4.5f * t) * t * hx_1; \
1070 xphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
1071 dxphi[3] = (1.5f * t + 2) * (2 + t) * hx_1; \
1073 t = ry_hy - (double) jlo;
1074 yphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
1075 dyphi[0] = (1.5f * t - 2) * (2 - t) * hy_1; \
1077 yphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
1078 dyphi[1] = (-5 + 4.5f * t) * t * hy_1; \
1080 yphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
1081 dyphi[2] = (-5 - 4.5f * t) * t * hy_1; \
1083 yphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
1084 dyphi[3] = (1.5f * t + 2) * (2 + t) * hy_1; \
1086 t = rz_hz - (double) klo;
1087 zphi[0] = 0.5f * (1 - t) * (2 - t) * (2 - t); \
1088 dzphi[0] = (1.5f * t - 2) * (2 - t) * hz_1; \
1090 zphi[1] = (1 - t) * (1 + t - 1.5f * t * t); \
1091 dzphi[1] = (-5 + 4.5f * t) * t * hz_1; \
1093 zphi[2] = (1 + t) * (1 - t - 1.5f * t * t); \
1094 dzphi[2] = (-5 - 4.5f * t) * t * hz_1; \
1096 zphi[3] = 0.5f * (1 + t) * (2 + t) * (2 + t); \
1097 dzphi[3] = (1.5f * t + 2) * (2 + t) * hz_1; \
1102 iswithin = ( ia <= ilo && (ilo+(s_size-1)) <= ib &&
1103 ja <= jlo && (jlo+(s_size-1)) <= jb &&
1104 ka <= klo && (klo+(s_size-1)) <= kb );
1110 for (k = 0; k < s_size; k++) {
1111 koff = (k + klo) * nj;
1112 for (j = 0; j < s_size; j++) {
1113 jkoff = (koff + (j + jlo)) * ni;
1114 cx = yphi[j] * zphi[k];
1115 cy = dyphi[j] * zphi[k];
1116 cz = yphi[j] * dzphi[k];
1117 for (i = 0; i < s_size; i++) {
1118 index = jkoff + (i + ilo);
1121 fx += eh[index] * dxphi[i] * cx;
1122 fy += eh[index] * xphi[i] * cy;
1123 fz += eh[index] * xphi[i] * cz;
1134 if (ilo < ia)
do { ilo += ni; }
while (ilo < ia);
1135 else if (ilo > ib)
do { ilo -= ni; }
while (ilo > ib);
1137 else if (ilo < ia || (ilo+(s_size-1)) > ib) {
1142 if (jlo < ja)
do { jlo += nj; }
while (jlo < ja);
1143 else if (jlo > jb)
do { jlo -= nj; }
while (jlo > jb);
1145 else if (jlo < ja || (jlo+(s_size-1)) > jb) {
1150 if (klo < ka)
do { klo += nk; }
while (klo < ka);
1151 else if (klo > kb)
do { klo -= nk; }
while (klo > kb);
1153 else if (klo < ka || (klo+(s_size-1)) > kb) {
1159 for (k = 0, kp = klo; k < s_size; k++, kp++) {
1160 if (kp > kb) kp = ka;
1162 for (j = 0, jp = jlo; j < s_size; j++, jp++) {
1163 if (jp > jb) jp = ja;
1164 jkoff = (koff + jp) * ni;
1165 cx = yphi[j] * zphi[k];
1166 cy = dyphi[j] * zphi[k];
1167 cz = yphi[j] * dzphi[k];
1168 for (i = 0, ip = ilo; i < s_size; i++, ip++) {
1169 if (ip > ib) ip = ia;
1173 fx += eh[index] * dxphi[i] * cx;
1174 fy += eh[index] * xphi[i] * cy;
1175 fz += eh[index] * xphi[i] * cz;
1191 for (n = 0; n < natoms; n++) {
1192 float q = atom[4*n + 3];
1197 for (index = 0; index < nn; index++) {
1198 u += qbuf[index] * ebuf[index];
1200 msm->
uelec += 0.5 * u;
1208 const NL_Msmgrid_float *qhgrid = &(msm->
qh_f[level]);
1209 const float *qh = qhgrid->data;
1210 NL_Msmgrid_float *q2hgrid = &(msm->
qh_f[level+1]);
1211 float *q2h = q2hgrid->data;
1214 const int ia1 = qhgrid->i0;
1215 const int ib1 = ia1 + qhgrid->ni - 1;
1216 const int ja1 = qhgrid->j0;
1217 const int jb1 = ja1 + qhgrid->nj - 1;
1218 const int ka1 = qhgrid->k0;
1219 const int kb1 = ka1 + qhgrid->nk - 1;
1220 const int ni1 = qhgrid->ni;
1221 const int nj1 = qhgrid->nj;
1222 const int nk1 = qhgrid->nk;
1225 const int ia2 = q2hgrid->i0;
1226 const int ib2 = ia2 + q2hgrid->ni - 1;
1227 const int ja2 = q2hgrid->j0;
1228 const int jb2 = ja2 + q2hgrid->nj - 1;
1229 const int ka2 = q2hgrid->k0;
1230 const int kb2 = ka2 + q2hgrid->nk - 1;
1231 const int nrow_q2 = q2hgrid->ni;
1232 const int nstride_q2 = nrow_q2 * q2hgrid->nj;
1239 float *qzd = msm->
lzd_f + (-ka1);
1240 float *qyzd = msm->
lyzd_f + (-ka1*nj1 + -ja1);
1243 const float *phi = NULL;
1248 int index_plane_q2, index_q2;
1249 int index_jk, offset_k;
1254 const int diam_stencil = 2*r_stencil + 1;
1256 for (i2 = ia2; i2 <= ib2; i2++) {
1258 for (k = ka1; k <= kb1; k++) {
1261 for (j = ja1; j <= jb1; j++) {
1262 index_jk = offset_k + j;
1263 offset = index_jk * ni1;
1267 int lower = im - r_stencil;
1268 int upper = im + r_stencil;
1269 if (lower < ia1) lower = ia1;
1270 if (upper > ib1) upper = ib1;
1271 phi = phi_factored + r_stencil;
1272 for (i = lower; i <= upper; i++) {
1273 qsum += phi[i-im] * qh[offset + i];
1277 int ip = im - r_stencil;
1278 if (ip < ia1)
do { ip += ni1; }
while (ip < ia1);
1280 for (i = 0; i < diam_stencil; i++, ip++) {
1281 if (ip > ib1) ip = ia1;
1282 qsum += phi[i] * qh[offset + ip];
1285 qyzd[index_jk] = qsum;
1290 for (j2 = ja2; j2 <= jb2; j2++) {
1291 index_plane_q2 = j2 * nrow_q2 + i2;
1293 for (k = ka1; k <= kb1; k++) {
1298 int lower = jm - r_stencil;
1299 int upper = jm + r_stencil;
1300 if (lower < ja1) lower = ja1;
1301 if (upper > jb1) upper = jb1;
1302 phi = phi_factored + r_stencil;
1303 for (j = lower; j <= upper; j++) {
1304 qsum += phi[j-jm] * qyzd[offset + j];
1308 int jp = jm - r_stencil;
1309 if (jp < ja1)
do { jp += nj1; }
while (jp < ja1);
1311 for (j = 0; j < diam_stencil; j++, jp++) {
1312 if (jp > jb1) jp = ja1;
1313 qsum += phi[j] * qyzd[offset + jp];
1319 for (k2 = ka2; k2 <= kb2; k2++) {
1320 index_q2 = k2 * nstride_q2 + index_plane_q2;
1324 int lower = km - r_stencil;
1325 int upper = km + r_stencil;
1326 if (lower < ka1) lower = ka1;
1327 if (upper > kb1) upper = kb1;
1328 phi = phi_factored + r_stencil;
1329 for (k = lower; k <= upper; k++) {
1330 qsum += phi[k-km] * qzd[k];
1334 int kp = km - r_stencil;
1335 if (kp < ka1)
do { kp += nk1; }
while (kp < ka1);
1337 for (k = 0; k < diam_stencil; k++, kp++) {
1338 if (kp > kb1) kp = ka1;
1339 qsum += phi[k] * qzd[kp];
1342 q2h[index_q2] = qsum;
1355 NL_Msmgrid_float *ehgrid = &(msm->
eh_f[level]);
1356 float *eh = ehgrid->data;
1357 const NL_Msmgrid_float *e2hgrid = &(msm->
eh_f[level+1]);
1358 const float *e2h = e2hgrid->data;
1361 const int ia1 = ehgrid->i0;
1362 const int ib1 = ia1 + ehgrid->ni - 1;
1363 const int ja1 = ehgrid->j0;
1364 const int jb1 = ja1 + ehgrid->nj - 1;
1365 const int ka1 = ehgrid->k0;
1366 const int kb1 = ka1 + ehgrid->nk - 1;
1367 const int ni1 = ehgrid->ni;
1368 const int nj1 = ehgrid->nj;
1369 const int nk1 = ehgrid->nk;
1372 const int ia2 = e2hgrid->i0;
1373 const int ib2 = ia2 + e2hgrid->ni - 1;
1374 const int ja2 = e2hgrid->j0;
1375 const int jb2 = ja2 + e2hgrid->nj - 1;
1376 const int ka2 = e2hgrid->k0;
1377 const int kb2 = ka2 + e2hgrid->nk - 1;
1378 const int nrow_e2 = e2hgrid->ni;
1379 const int nstride_e2 = nrow_e2 * e2hgrid->nj;
1386 float *ezd = msm->
lzd_f + (-ka1);
1387 float *eyzd = msm->
lyzd_f + (-ka1*nj1 + -ja1);
1389 const size_t size_lzd = nk1 *
sizeof(float);
1390 const size_t size_lyzd = nj1 * nk1 *
sizeof(float);
1392 const float *phi = NULL;
1397 int index_plane_e2, index_e2;
1398 int index_jk, offset_k;
1403 const int diam_stencil = 2*r_stencil + 1;
1405 for (i2 = ia2; i2 <= ib2; i2++) {
1406 memset(msm->
lyzd_f, 0, size_lyzd);
1408 for (j2 = ja2; j2 <= jb2; j2++) {
1409 memset(msm->
lzd_f, 0, size_lzd);
1410 index_plane_e2 = j2 * nrow_e2 + i2;
1412 for (k2 = ka2; k2 <= kb2; k2++) {
1413 index_e2 = k2 * nstride_e2 + index_plane_e2;
1416 int lower = km - r_stencil;
1417 int upper = km + r_stencil;
1418 if (lower < ka1) lower = ka1;
1419 if (upper > kb1) upper = kb1;
1420 phi = phi_factored + r_stencil;
1421 for (k = lower; k <= upper; k++) {
1422 ezd[k] += phi[k-km] * e2h[index_e2];
1426 int kp = km - r_stencil;
1427 if (kp < ka1)
do { kp += nk1; }
while (kp < ka1);
1429 for (k = 0; k < diam_stencil; k++, kp++) {
1430 if (kp > kb1) kp = ka1;
1431 ezd[kp] += phi[k] * e2h[index_e2];
1436 for (k = ka1; k <= kb1; k++) {
1440 int lower = jm - r_stencil;
1441 int upper = jm + r_stencil;
1442 if (lower < ja1) lower = ja1;
1443 if (upper > jb1) upper = jb1;
1444 phi = phi_factored + r_stencil;
1445 for (j = lower; j <= upper; j++) {
1446 eyzd[offset + j] += phi[j-jm] * ezd[k];
1450 int jp = jm - r_stencil;
1451 if (jp < ja1)
do { jp += nj1; }
while (jp < ja1);
1453 for (j = 0; j < diam_stencil; j++, jp++) {
1454 if (jp > jb1) jp = ja1;
1455 eyzd[offset + jp] += phi[j] * ezd[k];
1462 for (k = ka1; k <= kb1; k++) {
1465 for (j = ja1; j <= jb1; j++) {
1466 index_jk = offset_k + j;
1467 offset = index_jk * ni1;
1470 int lower = im - r_stencil;
1471 int upper = im + r_stencil;
1472 if (lower < ia1) lower = ia1;
1473 if (upper > ib1) upper = ib1;
1474 phi = phi_factored + r_stencil;
1475 for (i = lower; i <= upper; i++) {
1476 eh[offset + i] += phi[i-im] * eyzd[index_jk];
1480 int ip = im - r_stencil;
1481 if (ip < ia1)
do { ip += ni1; }
while (ip < ia1);
1483 for (i = 0; i < diam_stencil; i++, ip++) {
1484 if (ip > ib1) ip = ia1;
1485 eh[offset + ip] += phi[i] * eyzd[index_jk];
1500 const NL_Msmgrid_float *qhgrid = &(msm->
qh_f[level]);
1501 const float *qh = qhgrid->data;
1502 NL_Msmgrid_float *q2hgrid = &(msm->
qh_f[level+1]);
1503 float *q2h_buffer = q2hgrid->buffer;
1506 const int ia1 = qhgrid->i0;
1507 const int ib1 = ia1 + qhgrid->ni - 1;
1508 const int ja1 = qhgrid->j0;
1509 const int jb1 = ja1 + qhgrid->nj - 1;
1510 const int ka1 = qhgrid->k0;
1511 const int kb1 = ka1 + qhgrid->nk - 1;
1512 const int ni1 = qhgrid->ni;
1513 const int nj1 = qhgrid->nj;
1514 const int nk1 = qhgrid->nk;
1517 const int ia2 = q2hgrid->i0;
1518 const int ib2 = ia2 + q2hgrid->ni - 1;
1519 const int ja2 = q2hgrid->j0;
1520 const int jb2 = ja2 + q2hgrid->nj - 1;
1521 const int ka2 = q2hgrid->k0;
1522 const int kb2 = ka2 + q2hgrid->nk - 1;
1534 int i1, j1, k1, index1, jk1off, k1off;
1539 for (index2 = 0, k2 = ka2; k2 <= kb2; k2++) {
1541 for (j2 = ja2; j2 <= jb2; j2++) {
1543 for (i2 = ia2; i2 <= ib2; i2++, index2++) {
1547 for (k = 0; k < nstencil; k++) {
1548 k1off = k1 + offset[k];
1550 if (ispz)
do { k1off += nk1; }
while (k1off < ka1);
1553 else if (k1off > kb1) {
1554 if (ispz)
do { k1off -= nk1; }
while (k1off > kb1);
1558 for (j = 0; j < nstencil; j++) {
1559 jk1off = j1 + offset[j];
1561 if (ispy)
do { jk1off += nj1; }
while (jk1off < ja1);
1564 else if (jk1off > jb1) {
1565 if (ispy)
do { jk1off -= nj1; }
while (jk1off > jb1);
1568 jk1off = (k1off + jk1off) * ni1;
1569 cjk = phi[j] * phi[k];
1570 for (i = 0; i < nstencil; i++) {
1571 index1 = i1 + offset[i];
1573 if (ispx)
do { index1 += ni1; }
while (index1 < ia1);
1576 else if (index1 > ib1) {
1577 if (ispx)
do { index1 -= ni1; }
while (index1 > ib1);
1581 q2h_sum += qh[index1] * phi[i] * cjk;
1586 q2h_buffer[index2] = q2h_sum;
1597 NL_Msmgrid_float *ehgrid = &(msm->
eh_f[level]);
1598 float *eh = ehgrid->data;
1599 const NL_Msmgrid_float *e2hgrid = &(msm->
eh_f[level+1]);
1600 const float *e2h_buffer = e2hgrid->buffer;
1603 const int ia1 = ehgrid->i0;
1604 const int ib1 = ia1 + ehgrid->ni - 1;
1605 const int ja1 = ehgrid->j0;
1606 const int jb1 = ja1 + ehgrid->nj - 1;
1607 const int ka1 = ehgrid->k0;
1608 const int kb1 = ka1 + ehgrid->nk - 1;
1609 const int ni1 = ehgrid->ni;
1610 const int nj1 = ehgrid->nj;
1611 const int nk1 = ehgrid->nk;
1614 const int ia2 = e2hgrid->i0;
1615 const int ib2 = ia2 + e2hgrid->ni - 1;
1616 const int ja2 = e2hgrid->j0;
1617 const int jb2 = ja2 + e2hgrid->nj - 1;
1618 const int ka2 = e2hgrid->k0;
1619 const int kb2 = ka2 + e2hgrid->nk - 1;
1631 int i1, j1, k1, index1, jk1off, k1off;
1636 for (index2 = 0, k2 = ka2; k2 <= kb2; k2++) {
1638 for (j2 = ja2; j2 <= jb2; j2++) {
1640 for (i2 = ia2; i2 <= ib2; i2++, index2++) {
1643 for (k = 0; k < nstencil; k++) {
1644 k1off = k1 + offset[k];
1646 if (ispz)
do { k1off += nk1; }
while (k1off < ka1);
1649 else if (k1off > kb1) {
1650 if (ispz)
do { k1off -= nk1; }
while (k1off > kb1);
1654 for (j = 0; j < nstencil; j++) {
1655 jk1off = j1 + offset[j];
1657 if (ispy)
do { jk1off += nj1; }
while (jk1off < ja1);
1660 else if (jk1off > jb1) {
1661 if (ispy)
do { jk1off -= nj1; }
while (jk1off > jb1);
1664 jk1off = (k1off + jk1off) * ni1;
1665 cjk = phi[j] * phi[k];
1666 for (i = 0; i < nstencil; i++) {
1667 index1 = i1 + offset[i];
1669 if (ispx)
do { index1 += ni1; }
while (index1 < ia1);
1672 else if (index1 > ib1) {
1673 if (ispx)
do { index1 -= ni1; }
while (index1 > ib1);
1677 eh[index1] += e2h_buffer[index2] * phi[i] * cjk;
1695 const NL_Msmgrid_float *qhgrid = &(msm->
qh_f[level]);
1696 const float *qh = qhgrid->data;
1697 NL_Msmgrid_float *ehgrid = &(msm->
eh_f[level]);
1698 float *eh = ehgrid->data;
1699 const int ia = qhgrid->i0;
1700 const int ib = ia + qhgrid->ni - 1;
1701 const int ja = qhgrid->j0;
1702 const int jb = ja + qhgrid->nj - 1;
1703 const int ka = qhgrid->k0;
1704 const int kb = ka + qhgrid->nk - 1;
1705 const int ni = qhgrid->ni;
1706 const int nj = qhgrid->nj;
1707 const int nk = qhgrid->nk;
1710 const NL_Msmgrid_float *gcgrid = &(msm->
gc_f[level]);
1711 const float *gc = gcgrid->data;
1712 const int gia = gcgrid->i0;
1713 const int gib = gia + gcgrid->ni - 1;
1714 const int gja = gcgrid->j0;
1715 const int gjb = gja + gcgrid->nj - 1;
1716 const int gka = gcgrid->k0;
1717 const int gkb = gka + gcgrid->nk - 1;
1718 const int gni = gcgrid->ni;
1719 const int gnj = gcgrid->nj;
1725 const int ispnone = !(ispx || ispy || ispz);
1728 int gia_clip, gib_clip;
1729 int gja_clip, gjb_clip;
1730 int gka_clip, gkb_clip;
1735 long jknoff, nindex;
1736 int kgoff, jkgoff, ngindex;
1740 const float *gcbuf = gcgrid->buffer;
1742 printf(
"+++ qh[0,0,0] = %g\n", qh[0]);
1744 for (k = gka; k <= gkb; k++) {
1745 for (j = gja; j <= gjb; j++) {
1746 for (i = gia; i <= gib; i++, index++) {
1747 printf(
"=== gc[%d,%d,%d] = %g\n", i, j, k, gcbuf[index]);
1758 for (k = ka; k <= kb; k++) {
1761 gka_clip = (k + gka < ka ? ka - k : gka);
1762 gkb_clip = (k + gkb > kb ? kb - k : gkb);
1766 for (j = ja; j <= jb; j++) {
1769 gja_clip = (j + gja < ja ? ja - j : gja);
1770 gjb_clip = (j + gjb > jb ? jb - j : gjb);
1772 jkoff = (koff + j) * ni;
1774 for (i = ia; i <= ib; i++) {
1777 gia_clip = (i + gia < ia ? ia - i : gia);
1778 gib_clip = (i + gib > ib ? ib - i : gib);
1784 for (kd = gka_clip; kd <= gkb_clip; kd++) {
1785 knoff = (k + kd) * nj;
1788 for (jd = gja_clip; jd <= gjb_clip; jd++) {
1789 jknoff = (knoff + (j + jd)) * ni;
1790 jkgoff = (kgoff + jd) * gni;
1792 for (
id = gia_clip;
id <= gib_clip;
id++) {
1793 nindex = jknoff + (i + id);
1794 ngindex = jkgoff + id;
1802 eh_sum += qh[nindex] * gc[ngindex];
1821 for (k = ka; k <= kb; k++) {
1825 gka_clip = (k + gka < ka ? ka - k : gka);
1826 gkb_clip = (k + gkb > kb ? kb - k : gkb);
1827 if (klo < ka) klo = ka;
1832 if (klo < ka)
do { klo += nk; }
while (klo < ka);
1838 for (j = ja; j <= jb; j++) {
1842 gja_clip = (j + gja < ja ? ja - j : gja);
1843 gjb_clip = (j + gjb > jb ? jb - j : gjb);
1844 if (jlo < ja) jlo = ja;
1849 if (jlo < ja)
do { jlo += nj; }
while (jlo < ja);
1853 jkoff = (koff + j) * ni;
1855 for (i = ia; i <= ib; i++) {
1859 gia_clip = (i + gia < ia ? ia - i : gia);
1860 gib_clip = (i + gib > ib ? ib - i : gib);
1861 if (ilo < ia) ilo = ia;
1866 if (ilo < ia)
do { ilo += ni; }
while (ilo < ia);
1874 for (kd = gka_clip, kp = klo; kd <= gkb_clip; kd++, kp++) {
1876 if (kp > kb) kp = ka;
1880 for (jd = gja_clip, jp = jlo; jd <= gjb_clip; jd++, jp++) {
1882 if (jp > jb) jp = ja;
1883 jknoff = (knoff + jp) * ni;
1884 jkgoff = (kgoff + jd) * gni;
1886 for (
id = gia_clip, ip = ilo;
id <= gib_clip;
id++, ip++) {
1888 if (ip > ib) ip = ia;
1889 nindex = jknoff + ip;
1890 ngindex = jkgoff + id;
1898 eh_sum += qh[nindex] * gc[ngindex];
double wkf_timer_time(wkf_timerhandle v)
static int interpolation(NL_Msm *)
static const int PolyDegree[NL_MSM_APPROX_END]
static int prolongation_factored(NL_Msm *, int level)
#define GRID_INDEX_CHECK(a, _i, _j, _k)
static int gridcutoff(NL_Msm *, int level)
static const int Nstencil[NL_MSM_APPROX_END]
static const float PhiStencilFactored[NL_MSM_APPROX_END][2 *MAX_POLY_DEGREE+1]
static int prolongation(NL_Msm *, int level)
void wkf_timer_stop(wkf_timerhandle v)
int NL_msm_compute_long_range_sprec(NL_Msm *pm)
#define D_STENCIL_1D(_dphi, _phi, _h_1, _delta, _approx)
#define GRID_INDEX(_p, _i, _j, _k)
wkf_timerhandle timer_longrng
static int anterpolation(NL_Msm *)
static const float PhiStencil[NL_MSM_APPROX_END][MAX_NSTENCIL]
static const int IndexOffset[NL_MSM_APPROX_END][MAX_NSTENCIL]
#define STENCIL_1D(_phi, _delta, _approx)
static int restriction(NL_Msm *, int level)
void wkf_timer_start(wkf_timerhandle v)
static int restriction_factored(NL_Msm *, int level)