// NOTE(review): extraction fragment. Each line below carries a stray decimal
// prefix ("120", "126", ...) left over from the original listing, and several
// statements are split mid-expression. All original tokens are preserved
// byte-for-byte; only comments have been added. The enclosing function, the
// declarations of sum4/sum8/tmp cleanup, the loop over k, and the matching
// #endif are all outside this chunk.
//
// Purpose (as far as visible): vectorized inner product of a row of matrix
// m (m.melem[k..]) with vector u (u.velem[0..]), accumulating a scalar into
// `sum` via a SIMD horizontal reduction. Presumably part of a C1 matrix-
// vector multiply — TODO confirm against the enclosing function.
120 #if 1 && (defined(__SSE2__) && ! defined(NAMD_DISABLE_SSE))
// SSE2 path: process 8 floats as two 4-wide multiply/accumulate steps.
// NOTE(review): _mm_load_ps requires 16-byte-aligned operands — assumes
// m.melem and u.velem are 16-byte aligned; verify the struct definitions.
126 __m128 melem4 = _mm_load_ps(&m.
melem[k]);
127 __m128 uelem4 = _mm_load_ps(&u.
velem[0]);
// tmp4 = m[k..k+3] * u[0..3]
128 __m128 tmp4 = _mm_mul_ps(melem4, uelem4);
129 melem4 = _mm_load_ps(&m.
melem[k+4]);
130 uelem4 = _mm_load_ps(&u.
velem[4]);
// tmp4 += m[k+4..k+7] * u[4..7]
131 tmp4 = _mm_add_ps(tmp4, _mm_mul_ps(melem4, uelem4));
// Horizontal reduction of the 4 partial sums into lane 0.
// NOTE(review): original lines 132-134 are not visible here; sum4 and any
// intermediate reassignment of tmp4 are presumably defined there — confirm
// before assuming this is the standard shuffle/add hsum idiom.
135 sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(2, 3, 0, 1));
136 sum4 = _mm_add_ps(sum4, tmp4);
138 sum4 = _mm_shuffle_ps(sum4, sum4, _MM_SHUFFLE(1, 0, 3, 2));
139 sum4 = _mm_add_ps(sum4, tmp4);
// Write the reduced scalar (lane 0) out to `sum`.
143 _mm_store_ss(&sum, sum4);
// AVX path: DISABLED — the leading `0 &&` compiles this branch out even
// when __AVX__ is defined.
147 #elif 0 && (defined(__AVX__) && ! defined(NAMD_DISABLE_SSE))
// NOTE(review): _mm256_load_ps requires 32-byte alignment, stricter than
// the SSE path above — confirm before ever enabling this branch.
153 __m256 melem8 = _mm256_load_ps(&m.
melem[k]);
154 __m256 uelem8 = _mm256_load_ps(&u.
velem[0]);
// tmp8 = m[k..k+7] * u[0..7], all 8 products in one 256-bit multiply.
155 __m256 tmp8 = _mm256_mul_ps(melem8, uelem8);
// NOTE(review): original lines 156-159 (defining sum8) are not visible.
// hadd twice reduces within each 128-bit lane; the permute2f128 swaps
// lanes so the cross-lane halves can be combined.
160 sum8 = _mm256_hadd_ps(sum8, sum8);
161 sum8 = _mm256_hadd_ps(sum8, sum8);
163 tmp8 = _mm256_permute2f128_ps(tmp8, tmp8, 1);
164 sum8 = _mm256_hadd_ps(tmp8, sum8);
// NOTE(review): _mm_store_ss takes __m128, but sum8 is __m256 — this looks
// like it would not compile as written (likely needs
// _mm_store_ss(&sum, _mm256_castps256_ps128(sum8))). Harmless today only
// because the `#elif 0` keeps the branch dead; fix before enabling.
168 _mm_store_ss(&sum, sum8);
// Scalar fallback path presumably follows (not visible); the Intel pragma
// asks the compiler to vectorize that loop unconditionally.
173 #if defined(__INTEL_COMPILER)
174 #pragma vector always
// NOTE(review): fragments of member declarations, apparently from the
// matrix/vector structs whose fields the SIMD code loads (m.melem,
// u.velem). The enclosing struct definitions — and the trailing
// semicolons — are outside this chunk; sizes C1_MATRIX_SIZE and
// C1_VECTOR_SIZE are defined elsewhere. Do not assume alignment from
// these lines alone.
Float melem[C1_MATRIX_SIZE]
Float velem[C1_VECTOR_SIZE]