11 NL_msm_cuda_cleanup_gridcutoff(pm);
84 if (cellvec1[1] != 0 || cellvec1[2] != 0 ||
85 cellvec2[0] != 0 || cellvec2[2] != 0 ||
86 cellvec3[0] != 0 || cellvec3[1] != 0 ||
87 cellvec1[0] <= 0 || cellvec2[1] <= 0 || cellvec3[2] <= 0) {
172 rc = NL_msm_cuda_setup_gridcutoff(pm);
174 printf(
"Using CUDA for grid cutoff computation\n");
177 printf(
"Unable to set up CUDA for grid cutoff computation\n");
179 NL_msm_cuda_cleanup_gridcutoff(pm);
180 printf(
"Falling back on CPU\n");
181 pm->
msmflags &= ~NL_MSM_COMPUTE_CUDA_GRID_CUTOFF;
187 if (msmflags & NL_MSM_COMPUTE_CUDA_GRID_CUTOFF) {
189 printf(
"Falling back on CPU\n");
190 pm->
msmflags &= ~NL_MSM_COMPUTE_CUDA_GRID_CUTOFF;
226 const double xlen = pm->
cellvec1[0];
227 const double ylen = pm->
cellvec2[1];
228 const double zlen = pm->
cellvec3[2];
230 printf(
"#MSM SETUP\n");
231 printf(
"# cell lengths= %g %g %g\n", xlen, ylen, zlen);
232 printf(
"# grid origin= %g %g %g\n", pm->
gx, pm->
gy, pm->
gz);
234 printf(
"# periodic boundaries\n");
237 printf(
"# non-periodic boundaries\n");
240 printf(
"# periodic boundaries in:%s%s%s\n",
241 (ispx ?
" x" :
""), (ispy ?
" y" :
""), (ispz ?
" z" :
""));
243 printf(
"# cutoff= %g\n", pm->
a);
245 printf(
"# hx, hy, hz= %g, %g, %g\n", pm->
hx, pm->
hy, pm->
hz);
246 printf(
"# h-grid size= %d x %d x %d\n", pm->
nx, pm->
ny, pm->
nz);
247 printf(
"# number of levels= %d\n", pm->
nlevels);
250 printf(
"# grid hierarchy:\n");
252 for (k = 0; k < pm->
nlevels; k++) {
253 NL_Msmgrid_float *qh = &(pm->
qh_f[k]);
255 int ib = ia + qh->ni - 1;
257 int jb = ja + qh->nj - 1;
259 int kb = ka + qh->nk - 1;
260 printf(
"# level= %d: [%d..%d] x [%d..%d] x [%d..%d]\n",
261 k, ia, ib, ja, jb, ka, kb);
265 for (k = 0; k < pm->
nlevels; k++) {
266 NL_Msmgrid_double *qh = &(pm->
qh[k]);
268 int ib = ia + qh->ni - 1;
270 int jb = ja + qh->nj - 1;
272 int kb = ka + qh->nk - 1;
273 printf(
"# level= %d: [%d..%d] x [%d..%d] x [%d..%d]\n",
274 k, ia, ib, ja, jb, ka, kb);
290 c[
X] = v[
Y]*w[
Z] - v[
Z]*w[
Y];
291 c[
Y] = v[
Z]*w[
X] - v[
X]*w[
Z];
292 c[
Z] = v[
X]*w[
Y] - v[
Y]*w[
X];
293 s = 1 / (u[
X]*c[
X] + u[
Y]*c[
Y] + u[
Z]*c[
Z]);
298 c[
X] = w[
Y]*u[
Z] - w[
Z]*u[
Y];
299 c[
Y] = w[
Z]*u[
X] - w[
X]*u[
Z];
300 c[
Z] = w[
X]*u[
Y] - w[
Y]*u[
X];
301 s = 1 / (v[
X]*c[
X] + v[
Y]*c[
Y] + v[
Z]*c[
Z]);
306 c[
X] = u[
Y]*v[
Z] - u[
Z]*v[
Y];
307 c[
Y] = u[
Z]*v[
X] - u[
X]*v[
Z];
308 c[
Z] = u[
X]*v[
Y] - u[
Y]*v[
X];
309 s = 1 / (w[
X]*c[
X] + w[
Y]*c[
Y] + w[
Z]*c[
Z]);
326 double pulen, pvlen, pwlen, s;
327 double cutoff = pm->
a;
328 int nbx, nby, nbz, numbins;
342 p[
X] = v[
Y]*w[
Z] - v[
Z]*w[
Y];
343 p[
Y] = v[
Z]*w[
X] - v[
X]*w[
Z];
344 p[
Z] = v[
X]*w[
Y] - v[
Y]*w[
X];
346 s = (u[
X]*p[
X] + u[
Y]*p[
Y] + u[
Z]*p[
Z]) / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
350 pulen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
352 p[
X] = w[
Y]*u[
Z] - w[
Z]*u[
Y];
353 p[
Y] = w[
Z]*u[
X] - w[
X]*u[
Z];
354 p[
Z] = w[
X]*u[
Y] - w[
Y]*u[
X];
356 s = (v[
X]*p[
X] + v[
Y]*p[
Y] + v[
Z]*p[
Z]) / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
360 pvlen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
362 p[
X] = u[
Y]*v[
Z] - u[
Z]*v[
Y];
363 p[
Y] = u[
Z]*v[
X] - u[
X]*v[
Z];
364 p[
Z] = u[
X]*v[
Y] - u[
Y]*v[
X];
366 s = (w[
X]*p[
X] + w[
Y]*p[
Y] + w[
Z]*p[
Z]) / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
370 pwlen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
374 nbx = (ispx ? (int) floor(pulen / cutoff) : (int) ceil(pulen / cutoff));
375 nby = (ispy ? (int) floor(pvlen / cutoff) : (int) ceil(pvlen / cutoff));
376 nbz = (ispz ? (int) floor(pwlen / cutoff) : (int) ceil(pwlen / cutoff));
377 if (nbx == 0 || nby == 0 || nbz == 0) {
380 numbins = nbx * nby * nbz;
385 void *vptr = malloc(numbins *
sizeof(
int));
387 pm->
bin = (
int *) vptr;
399 s = pulen / (nbx * cutoff);
405 s = pvlen / (nby * cutoff);
411 s = pwlen / (nbz * cutoff);
431 double pulen, pvlen, pwlen, s;
432 double cutoff = pm->
a;
433 int nbx, nby, nbz, numbins;
437 int nbrhdmax, nradius, ndiameter, index, i, j, k;
439 double binvol, abincnt, binlen, c;
444 p[
X] = v[
Y]*w[
Z] - v[
Z]*w[
Y];
445 p[
Y] = v[
Z]*w[
X] - v[
X]*w[
Z];
446 p[
Z] = v[
X]*w[
Y] - v[
Y]*w[
X];
448 s = (u[
X]*p[
X] + u[
Y]*p[
Y] + u[
Z]*p[
Z]) / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
452 pulen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
454 p[
X] = w[
Y]*u[
Z] - w[
Z]*u[
Y];
455 p[
Y] = w[
Z]*u[
X] - w[
X]*u[
Z];
456 p[
Z] = w[
X]*u[
Y] - w[
Y]*u[
X];
458 s = (v[
X]*p[
X] + v[
Y]*p[
Y] + v[
Z]*p[
Z]) / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
462 pvlen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
464 p[
X] = u[
Y]*v[
Z] - u[
Z]*v[
Y];
465 p[
Y] = u[
Z]*v[
X] - u[
X]*v[
Z];
466 p[
Z] = u[
X]*v[
Y] - u[
Y]*v[
X];
467 volume = w[
X]*p[
X] + w[
Y]*p[
Y] + w[
Z]*p[
Z];
469 s = volume / (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
473 pwlen = sqrt(p[
X]*p[
X] + p[
Y]*p[
Y] + p[Z]*p[Z]);
475 if ((ispx && cutoff > pulen) ||
476 (ispy && cutoff > pvlen) ||
477 (ispz && cutoff > pwlen)) {
478 printf(
"Cutoff %.3g too big for cell along basis%s%s%s\n",
479 cutoff, (cutoff > pulen ?
" x" :
""), (cutoff > pvlen ?
" y" :
""),
480 (cutoff > pwlen ?
" z" :
""));
487 abincnt = volume / binvol;
488 binlen = pow(binvol, 1./3);
492 c = pow(abincnt / (pulen*pvlen*pwlen), 1./3);
493 nbx = (int) ceil(c * pulen);
494 nby = (int) ceil(c * pvlen);
495 nbz = (int) ceil(c * pwlen);
496 numbins = nbx * nby * nbz;
498 printf(
"nbx=%d nby=%d nbz=%d numbins=%d\n", nbx, nby, nbz, numbins);
502 void *vptr = malloc(numbins *
sizeof(
int));
504 pm->
bin = (
int *) vptr;
514 s = pulen / (nbx * binlen);
524 s = pvlen / (nby * binlen);
534 s = pwlen / (nbz * binlen);
561 min2 = (bu[
X]*bu[
X] + bu[
Y]*bu[
Y] + bu[
Z]*bu[
Z]);
562 s = (bv[
X]*bv[
X] + bv[
Y]*bv[
Y] + bv[
Z]*bv[
Z]);
563 if (min2 > s) min2 = s;
564 s = (bw[
X]*bw[
X] + bw[
Y]*bw[
Y] + bw[
Z]*bw[
Z]);
565 if (min2 > s) min2 = s;
568 p[
X] = bu[
X] + bv[
X] + bw[
X];
569 p[
Y] = bu[
Y] + bv[
Y] + bw[
Y];
570 p[
Z] = bu[
Z] + bv[
Z] + bw[
Z];
571 s = (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
572 if (min2 > s) min2 = s;
573 p[
X] = -bu[
X] + bv[
X] + bw[
X];
574 p[
Y] = -bu[
Y] + bv[
Y] + bw[
Y];
575 p[
Z] = -bu[
Z] + bv[
Z] + bw[
Z];
576 s = (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
577 if (min2 > s) min2 = s;
578 p[
X] = bu[
X] - bv[
X] + bw[
X];
579 p[
Y] = bu[
Y] - bv[
Y] + bw[
Y];
580 p[
Z] = bu[
Z] - bv[
Z] + bw[
Z];
581 s = (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
582 if (min2 > s) min2 = s;
583 p[
X] = bu[
X] + bv[
X] - bw[
X];
584 p[
Y] = bu[
Y] + bv[
Y] - bw[
Y];
585 p[
Z] = bu[
Z] + bv[
Z] - bw[
Z];
586 s = (p[
X]*p[
X] + p[
Y]*p[
Y] + p[
Z]*p[
Z]);
587 if (min2 > s) min2 = s;
591 nradius = (int) ceil(sqrt(cutoff*cutoff / min2));
592 ndiameter = 2 * nradius + 1;
593 nbrhdmax = (ndiameter * ndiameter * ndiameter) / 2 + 1;
596 printf(
"Neighborhood radius = %d\n", nradius);
597 printf(
"min2 = %g\n", min2);
602 pm->
nbrhd = (
int *) calloc(3*nbrhdmax,
sizeof(
int));
609 for (k = -nradius; k <= nradius; k++) {
610 for (j = -nradius; j <= nradius; j++) {
611 for (i = -nradius; i <= nradius; i++) {
613 int ip, jp, kp, iq, jq, kq;
615 int binindex = (k * ndiameter + j) * ndiameter + i;
616 if (binindex < 0)
continue;
617 for (kp = 0; kp <= 1; kp++) {
618 for (jp = 0; jp <= 1; jp++) {
619 for (ip = 0; ip <= 1; ip++) {
620 p[
X] = (i+ip)*bu[
X] + (j+jp)*bv[
X] + (k+kp)*bw[
X];
621 p[
Y] = (i+ip)*bu[
Y] + (j+jp)*bv[
Y] + (k+kp)*bw[
Y];
622 p[
Z] = (i+ip)*bu[Z] + (j+jp)*bv[
Z] + (k+kp)*bw[Z];
623 for (kq = 0; kq <= 1; kq++) {
624 for (jq = 0; jq <= 1; jq++) {
625 for (iq = 0; iq <= 1; iq++) {
627 double dx, dy, dz, r2;
628 q[
X] = iq*bu[
X] + jq*bv[
X] + kq*bw[
X];
629 q[
Y] = iq*bu[
Y] + jq*bv[
Y] + kq*bw[
Y];
630 q[
Z] = iq*bu[
Z] + jq*bv[
Z] + kq*bw[
Z];
634 r2 = dx*dx + dy*dy + dz*dz;
635 is_within |= (r2 < cutoff*cutoff);
644 pm->
nbrhd[index++] = i;
645 pm->
nbrhd[index++] = j;
646 pm->
nbrhd[index++] = k;
668 const int nu = INTERP_PARAMS[pm->
approx].
nu;
673 const double hmax = 1.5 * hmin;
697 int n = (int) floorf(len / h) + 1;
708 const int nu = INTERP_PARAMS[pm->
approx].
nu;
718 const double xlen = pm->
cellvec1[0];
719 const double ylen = pm->
cellvec2[1];
720 const double zlen = pm->
cellvec3[2];
722 const double a = pm->
a;
727 NL_Msmgrid_double *p = NULL;
728 NL_Msmgrid_float *p_f = NULL;
729 int ia, ib, ja, jb, ka, kb, ni, nj, nk;
734 int level, toplevel, nlevels, maxlevels;
768 n = (nk > omega ? nk : omega);
772 t = (
float *) realloc(pm->
lzd, n *
sizeof(
float));
778 t = (
double *) realloc(pm->
lzd, n *
sizeof(
double));
784 n *= (nj > omega ? nj : omega);
788 t = (
float *) realloc(pm->
lyzd, n *
sizeof(
float));
794 t = (
double *) realloc(pm->
lyzd, n *
sizeof(
double));
807 for (maxlevels = 1; n > 0; n >>= 1) maxlevels++;
810 int omega3 = omega * omega * omega;
811 int nhalf = (int) sqrtf(ni*nj*nk);
812 lastnelems = (nhalf > omega3 ? nhalf : omega3);
823 void *vqh, *veh, *vgc;
825 vqh = realloc(pm->
qh, maxlevels *
sizeof(NL_Msmgrid_float));
827 veh = realloc(pm->
eh, maxlevels *
sizeof(NL_Msmgrid_float));
829 vgc = realloc(pm->
gc, maxlevels *
sizeof(NL_Msmgrid_float));
831 pm->
qh_f = (NL_Msmgrid_float *) vqh;
832 pm->
eh_f = (NL_Msmgrid_float *) veh;
833 pm->
gc_f = (NL_Msmgrid_float *) vgc;
835 for (level = pm->
maxlevels; level < maxlevels; level++) {
842 vqh = realloc(pm->
qh, maxlevels *
sizeof(NL_Msmgrid_double));
844 veh = realloc(pm->
eh, maxlevels *
sizeof(NL_Msmgrid_double));
846 vgc = realloc(pm->
gc, maxlevels *
sizeof(NL_Msmgrid_double));
848 pm->
qh = (NL_Msmgrid_double *) vqh;
849 pm->
eh = (NL_Msmgrid_double *) veh;
850 pm->
gc = (NL_Msmgrid_double *) vgc;
852 for (level = pm->
maxlevels; level < maxlevels; level++) {
870 GRID_RESIZE( &(pm->
qh[level]),
double, ia, ni, ja, nj, ka, nk);
871 GRID_RESIZE( &(pm->
eh[level]),
double, ia, ni, ja, nj, ka, nk);
874 if (++level == nlevels) done |= 0x07;
876 alldone = (done == 0x07);
879 int nelems = ni * nj * nk;
880 if (nelems <= lastnelems) done |= 0x07;
886 if (ni & 1) done |= 0x07;
887 else if (ni == 2) done |= 0x01;
890 ia = -((-ia+1)/2) - nu;
893 if (ni <= omega) done |= 0x01;
899 if (nj & 1) done |= 0x07;
900 else if (nj == 2) done |= 0x02;
903 ja = -((-ja+1)/2) - nu;
906 if (nj <= omega) done |= 0x02;
912 if (nk & 1) done |= 0x07;
913 else if (nk == 2) done |= 0x04;
916 ka = -((-ka+1)/2) - nu;
919 if (nk <= omega) done |= 0x04;
922 }
while ( ! alldone );
928 ni = (int) ceil(2*a/hx) - 1;
929 nj = (int) ceil(2*a/hy) - 1;
930 nk = (int) ceil(2*a/hz) - 1;
932 for (level = 0; level < toplevel; level++) {
934 p_f = &(pm->
gc_f[level]);
935 GRID_RESIZE(p_f,
float, -ni, 2*ni+1, -nj, 2*nj+1, -nk, 2*nk+1);
938 p = &(pm->
gc[level]);
939 GRID_RESIZE(p,
double, -ni, 2*ni+1, -nj, 2*nj+1, -nk, 2*nk+1);
944 for (k = -nk; k <= nk; k++) {
945 for (j = -nj; j <= nj; j++) {
946 for (i = -ni; i <= ni; i++) {
947 double s, t, gs, gt, g;
948 s = sqrt((i*hx)*(i*hx) + (j*hy)*(j*hy) + (k*hz)*(k*hz)) / a;
955 SPOLY(>, &d, t, split);
956 g = (gs - 0.5 * gt) / a;
959 SPOLY(&gs, &d, s, split);
960 SPOLY(>, &d, t, split);
961 g = (gs - 0.5 * gt) / a;
966 p_f->buffer[index] = (float) g;
971 p->buffer[index] = g;
981 const NL_Msmgrid_float *first = &(pm->
gc_f[0]);
983 for (k = -nk; k <= nk; k++) {
984 for (j = -nj; j <= nj; j++) {
985 for (i = -ni; i <= ni; i++) {
988 p_f->buffer[index] = (float) (scaling * first->buffer[index]);
995 const NL_Msmgrid_double *first = &(pm->
gc[0]);
997 for (k = -nk; k <= nk; k++) {
998 for (j = -nj; j <= nj; j++) {
999 for (i = -ni; i <= ni; i++) {
1002 p->buffer[index] = scaling * first->buffer[index];
1012 if (toplevel < pm->nlevels) {
1016 const NL_Msmgrid_float *qhtop_f = &(pm->
qh_f[toplevel]);
1017 ni = qhtop_f->ni - 1;
1018 nj = qhtop_f->nj - 1;
1019 nk = qhtop_f->nk - 1;
1022 const NL_Msmgrid_double *qhtop = &(pm->
qh[toplevel]);
1028 p_f = &(pm->
gc_f[toplevel]);
1029 GRID_RESIZE(p_f,
float, -ni, 2*ni+1, -nj, 2*nj+1, -nk, 2*nk+1);
1032 p = &(pm->
gc[toplevel]);
1033 GRID_RESIZE(p,
double, -ni, 2*ni+1, -nj, 2*nj+1, -nk, 2*nk+1);
1036 for (k = -nk; k <= nk; k++) {
1037 for (j = -nj; j <= nj; j++) {
1038 for (i = -ni; i <= ni; i++) {
1040 s = sqrt((i*hx)*(i*hx) + (j*hy)*(j*hy) + (k*hz)*(k*hz)) / a;
1045 SPOLY(&gs, &d, s, split);
1050 p_f->buffer[index] = (float) (scaling * gs/a);
1055 p->buffer[index] = scaling * gs/a;
1067 SPOLY(&gs, &d, s, split);
#define GRID_RESIZE(_p, TYPE, __i0, __ni, __j0, __nj, __k0, __nk)
static InterpParams INTERP_PARAMS[]
const char * NL_msm_split_name(int split)
#define GRID_INDEX_CHECK(a, _i, _j, _k)
static int setup_grids(NL_Msm *pm)
#define GRID_INDEX(_p, _i, _j, _k)
static int setup_bins_k_away(NL_Msm *pm)
static int setup_bins_1away(NL_Msm *pm)
static int print_status(NL_Msm *msm)
void NL_msm_cleanup(NL_Msm *pm)
struct InterpParams_t InterpParams
std::vector< std::string > split(const std::string &text, std::string delimiter)
static int setup_cell_vectors(NL_Msm *pm)
static int setup_hgrid_1d(NL_Msm *pm, double len, double *hh, int *nn, int *aindex, int *bindex, int isperiodic)
const char * NL_msm_approx_name(int approx)
#define SPOLY(pg, pdg, ra, split)
int NL_msm_setup(NL_Msm *msm, double cutoff, double cellvec1[3], double cellvec2[3], double cellvec3[3], double cellcenter[3], int msmflags)