11 #if defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE) 12 #include <emmintrin.h> 13 #if defined(__INTEL_COMPILER) 14 #define __align(X) __declspec(align(X) ) 16 #define __align(X) __attribute__((aligned(X) )) 17 #define MISSING_mm_cvtsd_f64 18 #elif defined(__GNUC__) 19 #define __align(X) __attribute__((aligned(X) )) 21 #define MISSING_mm_cvtsd_f64 24 #define __align(X) __declspec(align(X) ) 33 #define DEBUG_MSM_MIGRATE 34 #undef DEBUG_MSM_MIGRATE 36 #define MSM_MAX_BLOCK_SIZE 8 37 #define MSM_MAX_BLOCK_VOLUME \ 38 (MSM_MAX_BLOCK_SIZE * MSM_MAX_BLOCK_SIZE * MSM_MAX_BLOCK_SIZE) 40 #define MSM_C1VECTOR_MAX_BLOCK_SIZE (MSM_MAX_BLOCK_SIZE / 2) 41 #define MSM_C1VECTOR_MAX_BLOCK_VOLUME \ 42 (MSM_C1VECTOR_MAX_BLOCK_SIZE * \ 43 MSM_C1VECTOR_MAX_BLOCK_SIZE * \ 44 MSM_C1VECTOR_MAX_BLOCK_SIZE) 49 #define DEBUG_MSM_VERBOSE 50 #undef DEBUG_MSM_VERBOSE 52 #define DEBUG_MSM_GRID 58 #define ASSERT(expr) \ 62 snprintf(msg, sizeof(msg), "ASSERT: \"%s\" " \ 63 "(%s, %d)\n", #expr, __FILE__, __LINE__); \ 120 #if 1 && (defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE)) 126 __m128 melem4 = _mm_load_ps(&m.
melem[k]);
127 __m128 uelem4 = _mm_load_ps(&u.
velem[0]);
128 __m128 tmp4 = _mm_mul_ps(melem4, uelem4);
129 melem4 = _mm_load_ps(&m.
melem[k+4]);
130 uelem4 = _mm_load_ps(&u.
velem[4]);
131 tmp4 = _mm_add_ps(tmp4, _mm_mul_ps(melem4, uelem4));
135 sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(2, 3, 0, 1));
136 sum4 = _mm_add_ps(sum4, tmp4);
138 sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(1, 0, 3, 2));
139 sum4 = _mm_add_ps(sum4, tmp4);
143 _mm_store_ss(&sum, sum4);
147 #elif 0 && (defined(__AVX__) && ! defined(NAMD_DISABLE_SSE)) 153 __m256 melem8 = _mm256_load_ps(&m.
melem[k]);
154 __m256 uelem8 = _mm256_load_ps(&u.
velem[0]);
155 __m256 tmp8 = _mm256_mul_ps(melem8, uelem8);
160 sum8 = _mm256_hadd_ps(sum8, sum8);
161 sum8 = _mm256_hadd_ps(sum8, sum8);
163 tmp8 = _mm256_permute2f128_ps(tmp8, tmp8, 1);
164 sum8 = _mm256_hadd_ps(tmp8, sum8);
168 _mm_store_ss(&sum, sum8);
173 #if defined(__INTEL_COMPILER) 174 #pragma vector always 191 #define C1INDEX(drj,dri) ((drj)*C1_VECTOR_SIZE + (dri)) 215 if (
this != &a)
copy(a);
228 if (i < 0 || i >=
alen) {
230 snprintf(msg,
sizeof(msg),
"Array index: alen=%d, i=%d\n",
alen, i);
243 if (i < 0 || i >=
alen) {
245 snprintf(msg,
sizeof(msg),
"Array index: alen=%d, i=%d\n",
alen, i);
268 void print(
const char *s=0)
const {
269 if (s) printf(
"PRINTING DATA FOR ARRAY \"%s\":\n", s);
270 printf(
"abuffer=%p\n alen=%d amax=%d\n",
282 if (m == amax)
return;
284 T *newbuffer =
new T[m];
287 snprintf(msg,
sizeof(msg),
288 "Can't allocate %lu KB for msm::Array\n",
289 (
unsigned long)(m *
sizeof(T) / 1024));
292 if (alen > m) alen = m;
293 for (
int i = 0; i < alen; i++) {
294 newbuffer[i] = abuffer[i];
312 for (
int i = 0; i < alen; i++) {
324 tmpn = s.astate; s.astate = t.astate; t.astate = tmpn;
340 if (nelems > 0)
init(nelems);
354 int last = a.len() - 1;
355 if (last < 0)
return;
357 if (last > 0) a[0] = a[last];
366 int parent = (n-1) / 2;
367 if (a[parent] <= a[n])
break;
379 int right = left + 1;
381 if (right < len && a[right] <= a[left]) {
382 if (a[n] <= a[right])
break;
389 if (a[n] <= a[left])
break;
413 Ivec(
int ni,
int nj,
int nk) :
i(ni),
j(nj),
k(nk) { }
416 void pup(PUP::er& p) {
426 void set(
int pia,
int pni,
int pja,
int pnj,
int pka,
int pnk) {
427 ASSERT(pni >= 0 && pnj >= 0 && pnk >= 0);
431 void setbounds(
int pia,
int pib,
int pja,
int pjb,
int pka,
int pkb) {
432 set(pia, pib-pia+1, pja, pjb-pja+1, pka, pkb-pka+1);
448 return (
ia() >= n.
ia() &&
ib() <= n.
ib() &&
453 void pup(PUP::er& p) {
466 template <
class T,
int N>
476 void set(
int pia,
int pni,
int pja,
int pnj,
int pka,
int pnk) {
480 void setbounds(
int pia,
int pib,
int pja,
int pjb,
int pka,
int pkb) {
494 const T&
elem(
int i,
int j,
int k)
const {
495 if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
497 snprintf(msg,
sizeof(msg),
"Grid indexing:\n" 498 "ia=%d, ib=%d, i=%d\n" 499 "ja=%d, jb=%d, j=%d\n" 500 "ka=%d, kb=%d, k=%d\n",
517 if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
519 snprintf(msg,
sizeof(msg),
"Grid indexing:\n" 520 "ia=%d, ib=%d, i=%d\n" 521 "ja=%d, jb=%d, j=%d\n" 522 "ka=%d, kb=%d, k=%d\n",
529 return ((k-
ka())*
nj() + (j-
ja()))*
ni() + (i-
ia());
531 const T *
buffer()
const {
return gdata; }
537 for (
int n = 0; n < len; n++) { gdata[n] = t; }
555 const T *gbuf = g.gdata.buffer();
556 T *buf = gdata.buffer();
557 for (
int k = 0; k < gnk; k++) {
558 int jkoff = k * nij + koff;
559 for (
int j = 0; j < gnj; j++) {
560 int ijkoff = j *
ni + jkoff;
561 for (
int i = 0; i < gni; i++, index++) {
562 buf[i + ijkoff] += gbuf[index];
581 T *gbuf = g.gdata.buffer();
582 const T *buf = gdata.buffer();
583 for (
int k = 0; k < gnk; k++) {
584 int jkoff = k * nij + koff;
585 for (
int j = 0; j < gnj; j++) {
586 int ijkoff = j *
ni + jkoff;
587 for (
int i = 0; i < gni; i++, index++) {
588 gbuf[index] = buf[i + ijkoff];
600 class Grid :
public IndexRange {
608 void set(
int pia,
int pni,
int pja,
int pnj,
int pka,
int pnk) {
612 void setbounds(
int pia,
int pib,
int pja,
int pjb,
int pka,
int pkb) {
629 const T&
elem(
int i,
int j,
int k)
const {
630 if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
632 snprintf(msg,
sizeof(msg),
"Grid indexing:\n" 633 "ia=%d, ib=%d, i=%d\n" 634 "ja=%d, jb=%d, j=%d\n" 635 "ka=%d, kb=%d, k=%d\n",
652 if (i<
ia() || i>
ib() || j<
ja() || j>
jb() || k<
ka() || k>
kb()) {
654 snprintf(msg,
sizeof(msg),
"Grid indexing:\n" 655 "ia=%d, ib=%d, i=%d\n" 656 "ja=%d, jb=%d, j=%d\n" 657 "ka=%d, kb=%d, k=%d\n",
664 return ((k-
ka())*
nj() + (j-
ja()))*
ni() + (i-
ia());
671 T *buf = gdata.buffer();
673 for (
int n = 0; n < len; n++) { buf[n] = t; }
689 if (
ia > g.nlower.i)
ia = g.nlower.i;
691 if (
ja > g.nlower.j)
ja = g.nlower.j;
693 if (
ka > g.nlower.k)
ka = g.nlower.k;
695 int gib1 = g.nlower.i + g.nextent.i;
696 if (ib1 < gib1) ib1 = gib1;
698 int gjb1 = g.nlower.j + g.nextent.j;
699 if (jb1 < gjb1) jb1 = gjb1;
701 int gkb1 = g.nlower.k + g.nextent.k;
702 if (kb1 < gkb1) kb1 = gkb1;
709 int koff = (tmp.nlower.k -
nlower.
k) * nij
711 const T *gbuf = tmp.gdata.buffer();
712 T *buf = gdata.buffer();
713 for (
int k = 0; k < tmp.nextent.k; k++) {
714 int jkoff = k * nij + koff;
715 for (
int j = 0; j < tmp.nextent.j; j++) {
716 int ijkoff = j *
ni + jkoff;
717 for (
int i = 0; i < tmp.nextent.i; i++, index++) {
718 buf[i + ijkoff] = gbuf[index];
724 int gni = g.nextent.i;
725 int gnj = g.nextent.j;
726 int gnk = g.nextent.k;
730 int koff = (g.nlower.k -
nlower.
k) * nij
732 const T *gbuf = g.gdata.buffer();
733 T *buf = gdata.buffer();
734 for (
int k = 0; k < gnk; k++) {
735 int jkoff = k * nij + koff;
736 for (
int j = 0; j < gnj; j++) {
737 int ijkoff = j *
ni + jkoff;
738 for (
int i = 0; i < gni; i++, index++) {
739 buf[i + ijkoff] += gbuf[index];
750 int gni = g.nextent.i;
751 int gnj = g.nextent.j;
752 int gnk = g.nextent.k;
756 int koff = (g.nlower.k -
nlower.
k) * nij
758 T *gbuf = g.gdata.buffer();
759 const T *buf = gdata.buffer();
760 for (
int k = 0; k < gnk; k++) {
761 int jkoff = k * nij + koff;
762 for (
int j = 0; j < gnj; j++) {
763 int ijkoff = j *
ni + jkoff;
764 for (
int i = 0; i < gni; i++, index++) {
765 gbuf[index] = buf[i + ijkoff];
784 const T *gbuf = g.
buffer();
785 T *buf = gdata.buffer();
786 for (
int k = 0; k < gnk; k++) {
787 int jkoff = k * nij + koff;
788 for (
int j = 0; j < gnj; j++) {
789 int ijkoff = j *
ni + jkoff;
790 for (
int i = 0; i < gni; i++, index++) {
791 buf[i + ijkoff] += gbuf[index];
812 const T *buf = gdata.buffer();
813 for (
int k = 0; k < gnk; k++) {
814 int jkoff = k * nij + koff;
815 for (
int j = 0; j < gnj; j++) {
816 int ijkoff = j *
ni + jkoff;
817 for (
int i = 0; i < gni; i++, index++) {
818 gbuf[index] = buf[i + ijkoff];
842 void pup(PUP::er& p) {
856 void set(
int i,
int j,
int k) {
860 if (i > 1 || j > 1 || k > 1)
active = 1;
878 void pup(PUP::er& p) {
972 if (pn.i < a) pn.i = a;
973 if (pn.i > b) pn.i = b;
978 if (pn.j < a) pn.j = a;
979 if (pn.j > b) pn.j = b;
984 if (pn.k < a) pn.k = a;
985 if (pn.k > b) pn.k = b;
997 bn.n.i = (d >= 0 ? d /
bsx[level] : -((-d+
bsx[level]-1) /
bsx[level]));
999 bn.n.j = (d >= 0 ? d /
bsy[level] : -((-d+
bsy[level]-1) /
bsy[level]));
1001 bn.n.k = (d >= 0 ? d /
bsz[level] : -((-d+
bsz[level]-1) /
bsz[level]));
1017 bn.n.i = (d >= 0 ? d / bsi : -((-d+bsi-1) / bsi));
1019 bn.n.j = (d >= 0 ? d / bsj : -((-d+bsj-1) / bsj));
1021 bn.n.k = (d >= 0 ? d / bsk : -((-d+bsk-1) / bsk));
1048 nr.
set(ia, bsi, ja, bsj, ka, bsk);
1056 int nia = nrange.
ia();
1057 int nib = nrange.
ib();
1058 int nja = nrange.
ja();
1059 int njb = nrange.
jb();
1060 int nka = nrange.
ka();
1061 int nkb = nrange.
kb();
1063 if (ia < nia) ia = nia;
1065 if (ib > nib) ib = nib;
1067 if (ja < nja) ja = nja;
1069 if (jb > njb) jb = njb;
1071 if (ka < nka) ka = nka;
1073 if (kb > nkb) kb = nkb;
1083 int nia = nrange.
ia();
1084 int nib = nrange.
ib();
1085 int nja = nrange.
ja();
1086 int njb = nrange.
jb();
1087 int nka = nrange.
ka();
1088 int nkb = nrange.
kb();
1090 if (ia < nia) ia = nia;
1092 if (ib > nib) ib = nib;
1094 if (ja < nja) ja = nja;
1096 if (jb > njb) jb = njb;
1098 if (ka < nka) ka = nka;
1100 if (kb > nkb) kb = nkb;
1114 int di=0, dj=0, dk=0;
1116 while (nb.
n.
i < 0) {
1118 di += ni *
bsx[level];
1120 while (nb.
n.
i >= ni) {
1122 di -= ni *
bsx[level];
1126 while (nb.
n.
j < 0) {
1128 dj += nj *
bsy[level];
1130 while (nb.
n.
j >= nj) {
1132 dj -= nj *
bsy[level];
1136 while (nb.
n.
k < 0) {
1138 dk += nk *
bsz[level];
1140 while (nb.
n.
k >= nk) {
1142 dk -= nk *
bsz[level];
1151 nr.
setbounds(ia + di, ib + di, ja + dj, jb + dj, ka + dk, kb + dk);
1169 int bsi = foldi *
bsx[level];
1170 int bsj = foldj *
bsy[level];
1171 int bsk = foldk *
bsz[level];
1172 int di=0, dj=0, dk=0;
1174 while (nb.
n.
i < 0) {
1178 while (nb.
n.
i >= ni) {
1184 while (nb.
n.
j < 0) {
1188 while (nb.
n.
j >= nj) {
1194 while (nb.
n.
k < 0) {
1198 while (nb.
n.
k >= nk) {
1209 nr.
setbounds(ia + di, ib + di, ja + dj, jb + dj, ka + dk, kb + dk);
1215 int level = bn.
level;
1220 while (bn.
n.
i < 0) {
1223 while (bn.
n.
i >= ni) {
1228 while (bn.
n.
j < 0) {
1231 while (bn.
n.
j >= nj) {
1236 while (bn.
n.
k < 0) {
1239 while (bn.
n.
k >= nk) {
GridFixed< T, N > & operator+=(const GridFixed< T, N > &g)
Array< Grid< C1Matrix > > gpro_c1hermite
Array< PatchDiagram > patchList
Array< int > recvGridCutoff
const T & operator()(int i, int j, int k) const
BlockIndex blockOfGridIndexFold(const Ivec &n, int level) const
const T & elem(int i, int j, int k) const
Ivec clipIndexToLevel(const Ivec &n, int level) const
Grid< T > & operator+=(const GridFixed< T, N > &g)
IndexRange clipBlockToIndexRangeFold(const BlockIndex &nb, const IndexRange &nrange) const
friend C1Vector operator+(const C1Vector &u, const C1Vector &v)
Array & operator=(const Array &a)
T & operator()(int i, int j, int k)
IndexRange indexRangeOfBlock(const BlockIndex &nb) const
int operator==(const Ivec &n)
const T & operator()(int i, int j, int k) const
int flatindex(int i, int j, int k) const
void wrapBlockSend(BlockSend &bs) const
Array< BlockSend > sendUp
void set(int pia, int pni, int pja, int pnj, int pka, int pnk)
void extract(Grid< T > &g)
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
void swap(Array< T > &s, Array< T > &t)
void wrapBlockIndex(BlockIndex &bn) const
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
Array< Force > ForceArray
T & operator()(int i, int j, int k)
void setbounds(int pia, int pib, int pja, int pjb, int pka, int pkb)
Array< Grid< BlockDiagram > > blockLevel
T & operator()(const Ivec &n)
friend C1Vector operator*(const C1Matrix &m, const C1Vector &u)
void init(const IndexRange &n)
Array< BlockSend > sendAcross
Array< Grid< C1Matrix > > gres_c1hermite
Array< AtomCoord > AtomCoordArray
IndexRange indexRangeOfBlockFold(const BlockIndex &nb) const
PriorityQueue(int nelems=0)
const T * buffer(int &n) const
Array< PatchData * > PatchPtrArray
void updateLower(const Ivec &n)
IndexRange nrangeRestricted
void extract(GridFixed< T, N > &g)
BlockIndex(int ll, const Ivec &nn)
void extract(GridFixed< T, N > &g)
Array< Grid< C1Matrix > > gvc_c1hermite
Array< IndexRange > gridrange
Array< PatchSend > sendPatch
void NAMD_die(const char *err_msg)
BlockIndex blockOfGridIndex(const Ivec &n, int level) const
FoldFactor(int i, int j, int k)
const T & operator[](int i) const
const T & elem(int i, int j, int k) const
const Array< T > & data() const
void updateLower(const Ivec &n)
Float melem[C1_MATRIX_SIZE]
void copy(const Array &a)
IndexRange clipBlockToIndexRange(const BlockIndex &nb, const IndexRange &nrange) const
T & operator()(const Ivec &n)
Array< FoldFactor > foldfactor
Grid< T > & operator+=(const Grid< T > &g)
Array< int > indexGridCutoff
Array< Grid< Float > > gc
int flatindex(int i, int j, int k) const
const T & operator()(const Ivec &n) const
void wrapBlockSendFold(BlockSend &bs) const
const T & operator()(const Ivec &n) const
Array< Grid< Float > > gvc
friend Float operator*(const C1Vector &u, const C1Vector &v)
void init(const IndexRange &n)
Array< Grid< C1Matrix > > gc_c1hermite
int operator<=(const IndexRange &n)
C1Vector & operator+=(const C1Vector &v)
const T & elem(int i) const
IndexRange nrangeProlongated
Ivec(int ni, int nj, int nk)
Float velem[C1_VECTOR_SIZE]
Array< BlockSend > sendDown
T & elem(int i, int j, int k)
T & elem(int i, int j, int k)