NAMD
AVXTiles.C
Go to the documentation of this file.
1 #include "AVXTiles.h"
2 
3 #include "Molecule.h"
4 #include "Node.h"
5 
6 #ifdef NAMD_AVXTILES
7 #include <immintrin.h>
8 
9 //---------------------------------------------------------------------------
10 
11 AVXTiles::AVXTiles() : _numAtoms(0), _numTiles(0), _numTilesAlloc(0),
12  _lastBuild(-1) {
13 }
14 
15 //---------------------------------------------------------------------------
16 
17 AVXTiles::~AVXTiles() {
18  if (_numTilesAlloc) {
19  free(forces);
20  free(forcesSlow);
21  free(vdwTypes);
22  free(atomIndex);
23 #ifdef MEM_OPT_VERSION
24  free(atomExclIndex);
25 #endif
26  free(exclIndexStart);
27  free(exclIndexMaxDiff);
28  free(reverseOrder);
29  free(bbox_x);
30  free(bbox_y);
31  free(bbox_z);
32  free(bbox_wx);
33  free(bbox_wy);
34  free(bbox_wz);
35  }
36 }
37 
38 //---------------------------------------------------------------------------
39 
40 void AVXTiles::_realloc() {
41  if (_numTilesAlloc) {
42  free(forces);
43  free(forcesSlow);
44  free(vdwTypes);
45  free(atomIndex);
46 #ifdef MEM_OPT_VERSION
47  free(atomExclIndex);
48 #endif
49  free(exclIndexStart);
50  free(exclIndexMaxDiff);
51  free(reverseOrder);
52  free(bbox_x);
53  free(bbox_y);
54  free(bbox_z);
55  free(bbox_wx);
56  free(bbox_wy);
57  free(bbox_wz);
58  }
59  _touched = true;
60  _numTilesAlloc = 1.2 * _numTiles;
61  const int atomsAlloc = _numTilesAlloc << 4;
62  int e = posix_memalign((void **)&forces, 64,
63  sizeof(AVXTilesForce)*atomsAlloc);
64  e = e | posix_memalign((void **)&forcesSlow, 64,
65  sizeof(AVXTilesForce)*atomsAlloc);
66  e = e | posix_memalign((void **)&vdwTypes, 64, sizeof(int)*atomsAlloc);
67  e = e | posix_memalign((void **)&atomIndex, 64 ,sizeof(int)*atomsAlloc);
68 #ifdef MEM_OPT_VERSION
69  e = e | posix_memalign((void **)&atomExclIndex, 64 ,sizeof(int)*atomsAlloc);
70 #endif
71  e = e | posix_memalign((void **)&exclIndexStart, 64, sizeof(int)*atomsAlloc);
72  e = e | posix_memalign((void **)&exclIndexMaxDiff, 64,
73  sizeof(int)*atomsAlloc);
74  e = e | posix_memalign((void **)&reverseOrder, 64, sizeof(int)*atomsAlloc);
75  e = e | posix_memalign((void **)&bbox_x, 64, sizeof(float)*_numTilesAlloc);
76  e = e | posix_memalign((void **)&bbox_y, 64, sizeof(float)*_numTilesAlloc);
77  e = e | posix_memalign((void **)&bbox_z, 64, sizeof(float)*_numTilesAlloc);
78  e = e | posix_memalign((void **)&bbox_wx, 64, sizeof(float)*_numTilesAlloc);
79  e = e | posix_memalign((void **)&bbox_wy, 64, sizeof(float)*_numTilesAlloc);
80  e = e | posix_memalign((void **)&bbox_wz, 64, sizeof(float)*_numTilesAlloc);
81 
82  if (e) NAMD_die("Could not allocate memory for tiles.");
83 }
84 
85 //---------------------------------------------------------------------------
86 
87 void AVXTiles::atomUpdate(const CompAtom *compAtom,
88  const CompAtomExt *compAtomExt) {
89  Molecule* mol = Node::Object()->molecule;
90  int numFreeAtoms = _numAtoms;
91 
92  #pragma ivdep
93  for (int k=0; k < _numAtoms; ++k) {
94  int j = compAtomExt[k].sortOrder;
95  int type = compAtom[j].vdwType;
96  vdwTypes[k] = type;
97  const int id = compAtomExt[j].id;
98  atomIndex[k] = id;
99  if (compAtomExt[j].atomFixed) --numFreeAtoms;
100 #ifdef MEM_OPT_VERSION
101  const int exclId = compAtomExt[j].exclId;
102  atomExclIndex[k] = exclId;
103  const ExclusionCheck *exclcheck = mol->get_excl_check_for_idx(exclId);
104  exclIndexStart[k] = id + exclcheck->min;
105  exclIndexMaxDiff[k] = id + exclcheck->max;
106 #else // ! MEM_OPT_VERSION
107  const ExclusionCheck *exclcheck = mol->get_excl_check_for_atom(id);
108  exclIndexStart[k] = exclcheck->min;
109  exclIndexMaxDiff[k] = exclcheck->max;
110 #endif // MEM_OPT_VERSION
111  reverseOrder[j] = k;
112  }
113 
114  _numFreeAtoms = numFreeAtoms;
115  const int end = _numTiles << 4;
116  for (int i = _numAtoms; i < end; i++) vdwTypes[i] = 0;
117 }
118 
119 //---------------------------------------------------------------------------
120 
// Compute an axis-aligned bounding box for each 16-atom tile, stored as
// a center (bbox_x/y/z) and half-width (bbox_wx/wy/wz) per axis, and
// record the step at which the boxes were built.
void AVXTiles::_buildBoundingBoxes(const int step) {
  _lastBuild = step;
  for (int tile = 0; tile < _numTiles; tile ++) {
    // Load x, y, and z for tile.  Each atom occupies 4 consecutive
    // floats (t0..t3 together cover 64 floats = 16 atoms), so the
    // coordinates arrive interleaved and must be de-interleaved below.
    const float *iptr = (float *)(atoms + (tile << 4));
    const __m512 t0 = _mm512_load_ps(iptr);
    const __m512 t1 = _mm512_load_ps(iptr+16);
    const __m512 t2 = _mm512_load_ps(iptr+32);
    const __m512 t3 = _mm512_load_ps(iptr+48);
    // Gather lanes 0,4,8,... (and 1,5,9,...) of t0/t1: the first two
    // components of the first 8 atoms.
    const __m512i t4 = _mm512_set_epi32(29,25,21,17,13,9,5,1,
                                        28,24,20,16,12,8,4,0);
    const __m512 t5 = _mm512_permutex2var_ps(t0, t4, t1);
    // Same gather for t2/t3 (last 8 atoms), with halves swapped so the
    // blend/shuffle below can stitch the two groups together.
    const __m512i t6 = _mm512_set_epi32(28,24,20,16,12,8,4,0,
                                        29,25,21,17,13,9,5,1);
    const __m512 t7 = _mm512_permutex2var_ps(t2, t6, t3);
    // Blend mask: take the low 8 lanes from the first operand and the
    // high 8 lanes from the second.
    const __mmask16 t9 = 0xFF00;
    const __m512i t5i = _mm512_castps_si512(t5);
    const __m512i t7i = _mm512_castps_si512(t7);
    // x holds the x coordinate of all 16 atoms.
    const __m512 x = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t5i,
                                                                 t7i));
    // 0x4E swaps 128-bit groups, producing the y coordinates.
    const __m512 y = _mm512_shuffle_f32x4(t5, t7, 0x4E);
    // Repeat the gather for components 2 and 3 of each atom to get z.
    const __m512i t12 = _mm512_set_epi32(31,27,23,19,15,11,7,3,30,26,22,
                                         18,14,10,6,2);
    const __m512 t13 = _mm512_permutex2var_ps(t0, t12, t1);
    const __m512i t14 = _mm512_set_epi32(30,26,22,18,14,10,6,2,31,27,23,
                                         19,15,11,7,3);
    const __m512 t15 = _mm512_permutex2var_ps(t2, t14, t3);
    const __m512i t13i = _mm512_castps_si512(t13);
    const __m512i t15i = _mm512_castps_si512(t15);
    const __m512 z = _mm512_castsi512_ps(_mm512_mask_blend_epi32(t9, t13i,
                                                                 t15i));

    // Min x for tile
    const float min_x = _mm512_reduce_min_ps(x);
    // Max x for tile
    const float max_x = _mm512_reduce_max_ps(x);
    // Min y for tile
    const float min_y = _mm512_reduce_min_ps(y);
    // Max y for tile
    const float max_y = _mm512_reduce_max_ps(y);
    // Min z for tile
    const float min_z = _mm512_reduce_min_ps(z);
    // Max z for tile
    const float max_z = _mm512_reduce_max_ps(z);

    // Store box center and half-width per axis.
    bbox_x[tile] = 0.5f*(min_x + max_x);
    bbox_y[tile] = 0.5f*(min_y + max_y);
    bbox_z[tile] = 0.5f*(min_z + max_z);
    bbox_wx[tile] = 0.5f*(max_x - min_x);
    bbox_wy[tile] = 0.5f*(max_y - min_y);
    bbox_wz[tile] = 0.5f*(max_z - min_z);
  }
}
174 
175 //---------------------------------------------------------------------------
176 
// Merge the tile-order AVX force arrays back into NAMD's native force
// arrays and optionally accumulate the (slow) virial.  The doSlow /
// doVirial / touched template flags compile the branches away so the
// SIMD loop body is fixed at instantiation time; `touched` is false when
// the tile forces were never written and only the virial-side arrays
// need folding in.
template <int doSlow, int doVirial, int touched>
void AVXTiles::_nativeForceVirialUpdate(const CompAtom *p,
    const Vector &center, Force * __restrict__ nativeForces,
    Force * __restrict__ nativeForcesSlow,
    const Force * __restrict__ nativeForcesVirial,
    const Force * __restrict__ nativeForcesSlowVirial,
    double virial[6], double virialSlow[6]) {

  // Scalar accumulators for the 6 independent virial components; the
  // omp simd reduction clause below privatizes and sums them per lane.
  BigReal v_xx, v_xy, v_xz, v_yy, v_yz, v_zz;
  BigReal vS_xx, vS_xy, vS_xz, vS_yy, vS_yz, vS_zz;
  if (doVirial) {
    v_xx = v_xy = v_xz = v_yy = v_yz = v_zz = 0.0;
    if (doSlow) vS_xx = vS_xy = vS_xz = vS_yy = vS_yz = vS_zz = 0.0;
  }

  // Local __restrict__ copies promise no aliasing so the loop vectorizes.
  const int * __restrict__ reverseOrder = this->reverseOrder;
  const AVXTilesForce * __restrict__ forces = this->forces;
  const AVXTilesForce * __restrict__ forcesSlow = this->forcesSlow;
  #pragma omp simd aligned(reverseOrder, forces, forcesSlow:64) \
    reduction(+:v_xx, v_xy, v_xz, v_yy, v_yz, v_zz, \
              vS_xx, vS_xy, vS_xz, vS_yy, vS_yz, vS_zz)
  for (int i = 0; i < _numAtoms; i++) {
    // Start from the force already present in the virial-side array.
    BigReal f_x = nativeForcesVirial[i].x;
    BigReal f_y = nativeForcesVirial[i].y;
    BigReal f_z = nativeForcesVirial[i].z;
    if (touched) {
      // Tile forces are stored in sorted tile order; map back via
      // reverseOrder.
      const int order = reverseOrder[i];
      f_x += forces[order].x;
      f_y += forces[order].y;
      f_z += forces[order].z;
    }
    BigReal p_x, p_y, p_z;
    if (doVirial) {
      // Position relative to the patch center; reused by the slow-force
      // virial below.
      p_x = p[i].position.x - center.x;
      p_y = p[i].position.y - center.y;
      p_z = p[i].position.z - center.z;
      v_xx += f_x * p_x;
      v_xy += f_x * p_y;
      v_xz += f_x * p_z;
      v_yy += f_y * p_y;
      v_yz += f_y * p_z;
      v_zz += f_z * p_z;
    }
    nativeForces[i].x += f_x;
    nativeForces[i].y += f_y;
    nativeForces[i].z += f_z;
    if (doSlow) {
      // NOTE: these deliberately shadow the outer f_x/f_y/f_z — this is
      // the slow (long-range) force accumulation, kept separate.
      BigReal f_x = nativeForcesSlowVirial[i].x;
      BigReal f_y = nativeForcesSlowVirial[i].y;
      BigReal f_z = nativeForcesSlowVirial[i].z;
      if (touched) {
        const int order = reverseOrder[i];
        f_x += forcesSlow[order].x;
        f_y += forcesSlow[order].y;
        f_z += forcesSlow[order].z;
      }
      if (doVirial) {
        vS_xx += f_x * p_x;
        vS_xy += f_x * p_y;
        vS_xz += f_x * p_z;
        vS_yy += f_y * p_y;
        vS_yz += f_y * p_z;
        vS_zz += f_z * p_z;
      }
      nativeForcesSlow[i].x += f_x;
      nativeForcesSlow[i].y += f_y;
      nativeForcesSlow[i].z += f_z;
    }
  }
  // Publish the reduced virial components in xx,xy,xz,yy,yz,zz order.
  if (doVirial) {
    virial[0] = v_xx;
    virial[1] = v_xy;
    virial[2] = v_xz;
    virial[3] = v_yy;
    virial[4] = v_yz;
    virial[5] = v_zz;
    if (doSlow) {
      virialSlow[0] = vS_xx;
      virialSlow[1] = vS_xy;
      virialSlow[2] = vS_xz;
      virialSlow[3] = vS_yy;
      virialSlow[4] = vS_yz;
      virialSlow[5] = vS_zz;
    }
  }
}
263 
264 //---------------------------------------------------------------------------
265 
266 void AVXTiles::nativeForceVirialUpdate(const int doSlow, const int doVirial,
267  const CompAtom *p, const Vector &center,
268  Force * __restrict__ nativeForces,
269  Force * __restrict__ nativeForcesSlow,
270  const Force * __restrict__ nativeForcesVirial,
271  const Force * __restrict__ nativeForcesSlowVirial,
272  double virial[6], double virialSlow[6]) {
273 
274 #define CALL(DOSLOW, DOVIRIAL, TOUCHED) \
275  _nativeForceVirialUpdate<DOSLOW, DOVIRIAL, \
276  TOUCHED>(p, center, nativeForces, nativeForcesSlow,\
277  nativeForcesVirial, \
278  nativeForcesSlowVirial, virial, \
279  virialSlow);
280 
281  if (_touched) {
282  if (!doSlow && !doVirial) CALL(0, 0, 1);
283  if (!doSlow && doVirial) CALL(0, 1, 1);
284  if (doSlow && doVirial) CALL(1, 1, 1);
285  if (doSlow && !doVirial) CALL(1, 0, 1);
286  } else {
287  if (!doSlow && !doVirial) CALL(0, 0, 0);
288  if (!doSlow && doVirial) CALL(0, 1, 0);
289  if (doSlow && doVirial) CALL(1, 1, 0);
290  if (doSlow && !doVirial) CALL(1, 0, 0);
291  }
292 #undef CALL
293 
294 }
295 #endif // NAMD_AVXTILES
static Node * Object()
Definition: Node.h:86
int sortOrder
Definition: NamdTypes.h:87
Definition: Vector.h:64
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ vdwTypes
static __thread atom * atoms
static __thread float4 * forces
BigReal z
Definition: Vector.h:66
Position position
Definition: NamdTypes.h:53
#define order
Definition: PmeRealSpace.C:235
gridSize z
BigReal x
Definition: Vector.h:66
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ exclIndexMaxDiff
void NAMD_die(const char *err_msg)
Definition: common.C:85
short vdwType
Definition: NamdTypes.h:55
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ atomIndex
BigReal y
Definition: Vector.h:66
const ExclusionCheck * get_excl_check_for_atom(int anum) const
Definition: Molecule.h:1177
gridSize y
gridSize x
#define CALL(DOENERGY, DOVIRIAL)
Molecule * molecule
Definition: Node.h:176
double BigReal
Definition: common.h:114