13 #if defined __INTEL_COMPILER || defined __INTEL_LLVM_COMPILER 15 #define FORCEINLINE __forceinline 17 #else // fixes for non-Intel compilers 19 #define FORCEINLINE inline 30 #define _popcnt32 _mm_popcnt_u32 36 static inline __m512 _mm512_invsqrt_ps(__m512 a){
37 __m512 xn = _mm512_rsqrt14_ps(a);
38 __m512 xn1 = _mm512_set1_ps(3.0f) - _mm512_mul_ps(a, _mm512_mul_ps(xn, xn));
39 return _mm512_mul_ps(_mm512_set1_ps(0.5f), _mm512_mul_ps(xn, xn1));
42 #ifndef _mm512_i32logather_pd 43 #define _mm512_i32logather_pd(vindex, base_addr, scale) _mm512_i32gather_pd(_mm512_extracti32x8_epi32((vindex), 0), (base_addr), (scale)) 46 #ifndef _mm512_mask_i32logather_pd 47 #define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) _mm512_mask_i32gather_pd((src), (mask), _mm512_extracti32x8_epi32((vindex), 0), (base_addr), (scale)) 50 #ifndef _mm512_i32loscatter_pd 51 #define _mm512_i32loscatter_pd(base_addr, vindex, a, scale) _mm512_i32scatter_pd((base_addr), _mm512_extracti32x8_epi32((vindex), 0), (a), (scale)) 54 #ifndef _mm512_mask_i32loscatter_pd 55 #define _mm512_mask_i32loscatter_pd(base_addr, k, vindex, a, scale) _mm512_mask_i32scatter_pd((base_addr), (k), _mm512_extracti32x8_epi32((vindex), 0), (a), (scale)) 58 #endif // fixes for non-Intel compilers 65 #define NAMD_AVXTILES_PAIR_THRESHOLD 4 67 #define NAMD_AVXTILES_IPAIRCOUNT 300 69 #define NAMD_AVXTILES_ORDER_PATCHES 78 inline int numTiles()
const {
return _numTiles; }
79 inline int maxTiles()
const {
return _numTilesAlloc; }
80 inline bool realloc(
const int n) {
82 if (n>_numTilesAlloc) {
97 int _numTiles, _numTilesAlloc;
118 void setSimParams(
const float scale,
const float scale14,
const float c1,
119 const float c3,
const float switchOn2,
float *fastTable,
120 float *fastEnergyTable,
float *slowTable,
121 float *slowEnergyTable,
float *eps4sigma,
122 float *eps4sigma14,
float *ljTable,
123 const float ljTableWidth,
float *modifiedTable,
124 float *modifiedEnergyTable,
float *excludedTable,
125 float *excludedEnergyTable,
const int interpolationMode);
127 inline void atomUpdate(AVXTiles *patch0tiles, AVXTiles *patch1tiles) {
128 tiles_p0 = patch0tiles;
129 tiles_p1 = patch1tiles;
132 #ifdef NAMD_AVXTILES_ORDER_PATCHES 135 bool reorder =
false;
136 const int rem0 = patch0tiles->numAtoms() & 15;
137 const int rem1 = patch1tiles->numAtoms() & 15;
138 if (rem1 && rem1 <= NAMD_AVXTILES_PAIR_THRESHOLD && rem1 < rem0)
140 else if ((rem0 > NAMD_AVXTILES_PAIR_THRESHOLD || rem0 == 0) &&
141 patch1tiles->numAtoms() < patch0tiles->numAtoms())
144 tiles_p0 = patch1tiles;
145 tiles_p1 = patch0tiles;
151 realloc(tiles_p0->numTiles());
154 inline void updateParams(
const Lattice &lattice,
const Vector &offset,
155 const double cutoff) {
156 _cutoff2 = cutoff * cutoff;
157 _paramMinvCut3 = -1.0 / (_cutoff2 * sqrt(_cutoff2));
158 _paramCutUnder3 = 3.0 / sqrt(_cutoff2);
159 _shx = offset.
x*lattice.
a().
x + offset.
y*lattice.
b().
x +
160 offset.
z*lattice.
c().
x;
161 _shy = offset.
x*lattice.
a().
y + offset.
y*lattice.
b().
y +
162 offset.
z*lattice.
c().
y;
163 _shz = offset.
x*lattice.
a().
z + offset.
y*lattice.
b().
z +
164 offset.
z*lattice.
c().
z;
167 inline void updateBuildInfo(
const int step,
const int minPart,
168 const int maxPart,
const int numParts,
169 const double plcutoff) {
173 _numParts = numParts;
174 _plcutoff2 = plcutoff * plcutoff;
177 inline int numLists()
const {
return _numLists; }
179 inline void realloc(
const int numLists) {
180 _numLists = numLists;
181 if (numLists > _numListsAlloc) _realloc();
184 inline void reallocModified(
const int numModified) {
185 _numModified = numModified;
186 if (numModified > _numModifiedAlloc) _reallocModified();
189 inline void reallocExcluded(
const int numExcluded) {
190 _numExcluded = numExcluded;
191 if (numExcluded > _numExcludedAlloc) _reallocExcluded();
194 inline void reallocPairLists(
const int numPairLists,
const int maxPairs) {
195 if (numPairLists > _maxPairLists || maxPairs > _maxPairs)
196 _reallocPairLists(numPairLists, maxPairs);
199 inline int exclusionChecksum()
const {
return _exclusionChecksum; }
200 inline float energyVdw()
const {
return _energyVdw; }
201 inline float energyElec()
const {
return _energyElec; }
202 inline float energySlow()
const {
return _energySlow; }
203 inline float virialXX()
const {
return _fNet_x * _shx; }
204 inline float virialXY()
const {
return _fNet_x * _shy; }
205 inline float virialXZ()
const {
return _fNet_x * _shz; }
206 inline float virialYY()
const {
return _fNet_y * _shy; }
207 inline float virialYZ()
const {
return _fNet_y * _shz; }
208 inline float virialZZ()
const {
return _fNet_z * _shz; }
209 inline float virialSlowXX()
const {
return _fNetSlow_x * _shx; }
210 inline float virialSlowXY()
const {
return _fNetSlow_x * _shy; }
211 inline float virialSlowXZ()
const {
return _fNetSlow_x * _shz; }
212 inline float virialSlowYY()
const {
return _fNetSlow_y * _shy; }
213 inline float virialSlowYZ()
const {
return _fNetSlow_y * _shz; }
214 inline float virialSlowZZ()
const {
return _fNetSlow_z * _shz; }
216 #ifdef NAMD_AVXTILES_ORDER_PATCHES 217 inline int patchOrder0()
const {
return _patchOrder0; }
218 inline int patchOrder1()
const {
return _patchOrder1; }
220 inline int patchOrder0()
const {
return 0; }
221 inline int patchOrder1()
const {
return 1; }
230 void delEmptyLists();
233 void nbForceAVX512(
const int doEnergy,
const int doVirial,
const int doList,
238 unsigned int *listDepth;
241 AVXTiles *tiles_p0, *tiles_p1;
246 template <
bool count,
bool partitionMode>
249 template <
bool doEnergy,
bool doVirial,
bool doSlow,
250 bool doList,
int interpMode>
251 FORCEINLINE
void nbAVX512Tiles(__m512 &energyVdw, __m512 &energyElec,
252 __m512 &energySlow, __m512 &fNet_x,
253 __m512 &fNet_y, __m512 &fNet_z,
254 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
256 template <
bool doEnergy,
bool doVirial,
bool doSlow,
int interpMode>
257 FORCEINLINE
void nbAVX512Pairs(__m512 &energyVdw, __m512 &energyElec,
258 __m512 &energySlow, __m512 &fNet_x,
259 __m512 &fNet_y, __m512 &fNet_z,
260 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
262 template <
bool doEnergy,
bool doVirial,
bool doSlow,
int interpMode>
263 inline void nbAVX512Modified(__m512 &energyVdw, __m512 &energyElec,
264 __m512 &energySlow, __m512 &fNet_x,
265 __m512 &fNet_y, __m512 &fNet_z,
266 __m512 &fNetSlow_x, __m512 &fNetSlow_y,
268 template <
bool doEnergy,
bool doVirial>
269 inline void nbAVX512Excluded(__m512 &energySlow, __m512 &fNetSlow_x,
270 __m512 &fNetSlow_y, __m512 &fNetSlow_z);
272 template <
bool doEnergy,
bool doVirial,
bool doSlow,
273 bool doList,
int interpMode>
277 void _reallocModified();
278 void _reallocExcluded();
279 void _reallocPairLists(
const int numPairLists,
const int maxPairs);
281 float _cutoff2, _plcutoff2;
283 float *_paramSlowTable, *_paramSlowEnergyTable;
285 float *_paramEps4Sigma, *_paramEps4Sigma14;
287 float _paramMinvCut3, _paramCutUnder3;
289 float *_paramModifiedTable, *_paramModifiedEnergyTable;
290 float *_paramExcludedTable, *_paramExcludedEnergyTable;
293 float *_paramFastTable, *_paramFastEnergyTable;
294 const float *_paramLjTable;
298 float _shx, _shy, _shz;
299 float _paramScale, _paramScale14, _paramC1, _paramC3, _paramSwitchOn2;
300 int _numLists, _numListsAlloc;
301 int _numModified, _numModifiedAlloc, _numExcluded, _numExcludedAlloc;
302 int *_modified_i, *_modified_j, *_excluded_i, *_excluded_j;
304 int _numPairLists, _maxPairLists, _maxPairs;
305 int *_pair_i, *_numPairs, *_pairStart, *_pairLists;
307 float _fNet_x, _fNet_y, _fNet_z, _fNetSlow_x, _fNetSlow_y, _fNetSlow_z;
308 int _exclusionChecksum;
309 float _energyVdw, _energyElec, _energySlow;
311 int _interpolationMode, _minPart, _maxPart, _numParts, _lastBuild;
#ifdef NAMD_AVXTILES_ORDER_PATCHES
  // Patch processing order chosen at list build; reported by patchOrder0/1().
  // NOTE(review): closing #endif reconstructed — it was lost in extraction.
  int _patchOrder0, _patchOrder1;
#endif
317 #ifndef MEM_OPT_VERSION 318 char * _exclFlyListBuffer;
319 char * _exclFlyLists[16];
320 const int32 * _fullExcl[16], * _modExcl[16];
321 int _lastFlyListTile;
322 const char * buildExclFlyList(
const int itileList,
const int z,
323 const __m512i &atomIndex_i,
const int n,
328 #endif // NAMD_AVXTILES 329 #endif // AVXTILELISTS_H NAMD_HOST_DEVICE Vector c() const
NAMD_HOST_DEVICE Vector b() const
NAMD_HOST_DEVICE Vector a() const