NAMD
AVXTileLists.C
Go to the documentation of this file.
1 #include "AVXTileLists.h"
2 
3 #ifdef NAMD_AVXTILES
4 
5 //---------------------------------------------------------------------------
6 
7 AVXJTiles::AVXJTiles() : _numTiles(0), _numTilesAlloc(0) {
8 }
9 
10 AVXJTiles::~AVXJTiles() {
11  if (_numTilesAlloc) {
12  free(excl);
13  free(atomStart);
14  free(status);
15  }
16 }
17 
18 void AVXJTiles::_realloc() {
19  if (_numTilesAlloc) {
20  free(excl);
21  free(atomStart);
22  free(status);
23  }
24  _numTilesAlloc = 1.2 * _numTiles;
25  int e = posix_memalign((void **)&excl, 64,
26  sizeof(unsigned int)*_numTilesAlloc << 4);
27  e = e | posix_memalign((void **)&atomStart, 64, sizeof(int)*_numTilesAlloc);
28  e = e | posix_memalign((void **)&status, 64, sizeof(int)*_numTilesAlloc);
29  if (e) NAMD_die("Could not allocate memory for tiles.");
30 }
31 
32 //---------------------------------------------------------------------------
33 
34 AVXTileLists::AVXTileLists() : _numLists(0), _numListsAlloc(0),
35  _numModifiedAlloc(0), _numExcludedAlloc(0),
36  _maxPairLists(0), _maxPairs(0)
37 {
38  #ifndef MEM_OPT_VERSION
39  _exclFlyListBuffer = 0;
40  #endif
41 }
42 
43 //---------------------------------------------------------------------------
44 
45 AVXTileLists::~AVXTileLists() {
46  if (_numListsAlloc) {
47  free(lists);
48  free(listDepth);
49  }
50  if (_numModifiedAlloc) {
51  free(_modified_i);
52  free(_modified_j);
53  }
54  if (_numExcludedAlloc) {
55  free(_excluded_i);
56  free(_excluded_j);
57  }
58  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
59  if (_maxPairLists) {
60  free(_pair_i);
61  free(_numPairs);
62  free(_pairStart);
63  free(_pairLists);
64  }
65  }
66  #ifndef MEM_OPT_VERSION
67  delete []_exclFlyListBuffer;
68  #endif
69 }
70 
71 //---------------------------------------------------------------------------
72 
73 void AVXTileLists::setSimParams(const float scale, const float scale14,
74  const float c1, const float c3,
75  const float switchOn2, float *fastTable,
76  float *fastEnergyTable, float *slowTable,
77  float *slowEnergyTable, float *eps4sigma,
78  float *eps4sigma14, float *ljTable,
79  const float ljTableWidth, float *modifiedTable,
80  float *modifiedEnergyTable,
81  float *excludedTable,
82  float *excludedEnergyTable,
83  const int interpolationMode) {
84  _interpolationMode = interpolationMode;
85 
86  if (_interpolationMode > 1)
87  _paramScale = 2.0 * scale;
88  else
89  _paramScale = scale;
90  _paramScale14 = scale14;
91  _paramC1 = c1;
92  _paramC3 = c3;
93  _paramSwitchOn2 = switchOn2;
94  if (_interpolationMode == 3) {
95  _paramFastTable = fastTable;
96  _paramFastEnergyTable = fastEnergyTable;
97  }
98  _paramSlowTable = slowTable;
99  _paramSlowEnergyTable = slowEnergyTable;
100  _paramModifiedTable = modifiedTable;
101  _paramModifiedEnergyTable = modifiedEnergyTable;
102  _paramExcludedTable = excludedTable;
103  _paramExcludedEnergyTable = excludedEnergyTable;
104  if (_interpolationMode > 1) {
105  _paramLjTable = ljTable;
106  _paramLjWidth = ljTableWidth;
107  } else {
108  _paramEps4Sigma = eps4sigma;
109  _paramEps4Sigma14 = eps4sigma14;
110  }
111 }
112 
113 //---------------------------------------------------------------------------
114 
115 // Build tiles lists based on bounding boxes
116 // - count is true if we are only counting and not storing lists
117 // * used first time and when in parition mode
118 // * if count is false, can still fail due to insufficient storage
119 // * requiring realloc and repeat
120 // - paritionMode is true if the compute is partitioned for LB
121 // * in this case, first call with count=true to paritition based on
122 // * number of neighbor tiles. Then reallocate and build only for i-tiles
123 // * active in partition
124 template <bool count, bool partitionMode>
125 int AVXTileLists::_buildBB() {
126  // Get a count of J tiles for initial allocation or for partitioning
127  // If we already ran w/ count==true, guaranteed to have enough storage
128  // If we already ran w/ count==true for paritions, listDepth is valid
129  if (!count)
130  if (partitionMode || jTiles.maxTiles() == 0)
131  _buildBB<true, partitionMode>();
132 
133  int numJtiles = 0;
134  const int maxJtiles = jTiles.maxTiles();
135  const int numTileLists = tiles_p0->numTiles();
136  const int numTiles2 = tiles_p1->numTiles();
137  const bool zeroShift = ! (_shx*_shx + _shy*_shy + _shz*_shz > 0);
138  const bool self = zeroShift && (tiles_p0 == tiles_p1);
139 
140  int out_i;
141  if (!count) out_i = 0;
142 
143  for (int itileList = 0; itileList < numTileLists; itileList++) {
144  if (partitionMode && count==false && listDepth[itileList]==0)
145  continue;
146 
147  int itileListLen = 0;
148  int jinit;
149  if (self) jinit = itileList;
150  else jinit = 0;
151 
152  // Load i-atom data (and shift coordinates)
153 
154  // Loading bounding boxes and shift
155  const __m512 bbIx = _mm512_set1_ps(tiles_p0->bbox_x[itileList] + _shx);
156  const __m512 bbIy = _mm512_set1_ps(tiles_p0->bbox_y[itileList] + _shy);
157  const __m512 bbIz = _mm512_set1_ps(tiles_p0->bbox_z[itileList] + _shz);
158  const __m512 bbIwx = _mm512_set1_ps(tiles_p0->bbox_wx[itileList]);
159  const __m512 bbIwy = _mm512_set1_ps(tiles_p0->bbox_wy[itileList]);
160  const __m512 bbIwz = _mm512_set1_ps(tiles_p0->bbox_wz[itileList]);
161 
162  // Starting indices for jtiles
163  __m512i jAtomStart = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
164  128, 144, 160, 176, 192, 208, 224,
165  240);
166  jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(jinit<<4));
167  int jtileStart = numJtiles;
168 
169  __mmask16 loopMask = 0xFFFF;
170  for (int j=jinit; j < numTiles2; j += 16) {
171  // Remainder Predication
172  if (numTiles2 - j < 16)
173  loopMask >>= (16 - (numTiles2 - j));
174 
175  // Load bounding boxes for j-tiles
176  const __m512 bbJx = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
177  loopMask, tiles_p1->bbox_x + j);
178  const __m512 bbJy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
179  loopMask, tiles_p1->bbox_y + j);
180  const __m512 bbJz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
181  loopMask, tiles_p1->bbox_z + j);
182  const __m512 bbJwx = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
183  loopMask, tiles_p1->bbox_wx + j);
184  const __m512 bbJwy = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
185  loopMask, tiles_p1->bbox_wy + j);
186  const __m512 bbJwz = _mm512_mask_loadu_ps(_mm512_undefined_ps(),
187  loopMask, tiles_p1->bbox_wz + j);
188 
189  // dx = max(0.f,abs(bbIx - bbJx) - bbIwx - bbJwx)
190  const __m512 dx_one = _mm512_abs_ps(_mm512_sub_ps(bbIx, bbJx));
191  const __m512 dx_two = _mm512_add_ps(bbIwx, bbJwx);
192  const __mmask16 lxmask = _mm512_cmplt_ps_mask(dx_two, dx_one);
193  const __m512 dx = _mm512_mask_sub_ps(_mm512_setzero_ps(), lxmask,
194  dx_one, dx_two);
195  // dy = max(0.f,abs(bbIy - bbJy) - bbIwy - bbJwy)
196  const __m512 dy_one = _mm512_abs_ps(_mm512_sub_ps(bbIy, bbJy));
197  const __m512 dy_two = _mm512_add_ps(bbIwy, bbJwy);
198  const __mmask16 lymask = _mm512_cmplt_ps_mask(dy_two, dy_one);
199  const __m512 dy = _mm512_mask_sub_ps(_mm512_setzero_ps(), lymask,
200  dy_one, dy_two);
201  // dz = max(0.f,abs(bbIz - bbJz) - bbIwz - bbJwz)
202  const __m512 dz_one = _mm512_abs_ps(_mm512_sub_ps(bbIz, bbJz));
203  const __m512 dz_two = _mm512_add_ps(bbIwz, bbJwz);
204  const __mmask16 lzmask = _mm512_cmplt_ps_mask(dz_two, dz_one);
205  const __m512 dz = _mm512_mask_sub_ps(_mm512_setzero_ps(), lzmask,
206  dz_one, dz_two);
207  // r2bb = dx*dx + dy*dy + dz*dz
208  const __m512 r2bb = _mm512_fmadd_ps(dz,dz,
209  _mm512_fmadd_ps(dx, dx, _mm512_mul_ps(dy, dy)));
210  // Count lanes with tiles within the cutoff
211  const __mmask16 keep = _mm512_cmplt_ps_mask(r2bb,
212  _mm512_set1_ps(_plcutoff2)) & loopMask;
213  const int nKeep = _popcnt32(keep);
214  // Store neighbor tiles within cutoff
215  if (count == false && nKeep)
216  if (partitionMode || jtileStart + itileListLen + nKeep <= maxJtiles)
217  _mm512_mask_compressstoreu_epi32(jTiles.atomStart+jtileStart+
218  itileListLen, keep, jAtomStart);
219 
220  itileListLen += nKeep;
221  // Next set of j-tile atoms
222  jAtomStart = _mm512_add_epi32(jAtomStart, _mm512_set1_epi32(256));
223  }
224 
225  if (count == false) {
226  if (itileListLen > 0) {
227  listDepth[out_i] = (unsigned int)itileListLen;
228  lists[out_i].atomStart_i = itileList<<4;
229  lists[out_i].jtileStart = jtileStart;
230  out_i++;
231  }
232  } else if (partitionMode)
233  listDepth[itileList] = (unsigned int)itileListLen;
234 
235  numJtiles += itileListLen;
236  } // for (int itileList
237 
238  // Divide i-tiles between partitions to approximately evenly divide # total
239  // number of neighbor tiles between partitions
240  if (partitionMode && count) {
241  const int jtilesPerPart = static_cast<double>(numJtiles)/_numParts + 0.5;
242  int jtileCount = 0, curPart = 0, nextPart = jtilesPerPart;
243  numJtiles = 0;
244 
245  for (int j = 0; j < numTileLists; j++) {
246  jtileCount += listDepth[j];
247  if (curPart < _minPart || curPart >= _maxPart)
248  listDepth[j] = 0;
249  else
250  numJtiles += listDepth[j];
251  if (jtileCount >= nextPart) {
252  curPart++;
253  nextPart += jtilesPerPart;
254  if (curPart == _numParts) curPart--;
255  }
256  }
257  }
258 
259  if (count) {
260  jTiles.realloc(numJtiles);
261  return numJtiles;
262  }
263 
264  // If we are in partition mode, guartaneed we had enough storage, else
265  // reallocate and repeat the calculation if we ran out of room
266  if (!partitionMode && jTiles.realloc(numJtiles))
267  return _buildBB<false, false>();
268 
269  _numLists = out_i;
270  if (_numLists) {
271  // Set all neighbor tiles as active, refined in force loop
272  memset(jTiles.status, 0, numJtiles * sizeof(int));
273  lists[_numLists].jtileStart = lists[_numLists-1].jtileStart +
274  listDepth[_numLists - 1];
275  }
276 
277  return numJtiles;
278 }
279 
280 void AVXTileLists::delEmptyLists() {
281  int out_i = 0;
282  int totalTiles = 0;
283  const int numLists = _numLists;
284 
285  __mmask16 tripMask = 0xFFFF;
286  int vecTrip = 16;
287 
288  for (int ii = 0; ii < numLists; ii += 16) {
289  if (numLists - ii < 16) {
290  vecTrip = numLists - ii;
291  tripMask >>= (16 - vecTrip);
292  }
293 
294  // Load number of neighbor tiles for each i-tile (depth)
295  __m512i depth = _mm512_load_epi32(listDepth + ii);
296  // Loop predication
297  __mmask16 depth_mask = _mm512_cmpneq_epi32_mask(depth,
298  _mm512_setzero_epi32()) & tripMask;
299  // depth &= 65535
300  depth = _mm512_and_epi32(depth, _mm512_set1_epi32((int)65535));
301  // Store only active tiles
302  _mm512_mask_compressstoreu_epi32(listDepth + out_i, depth_mask, depth);
303 
304  for (int vi = 0; vi < vecTrip; vi++) {
305  if (*((unsigned int*)&(depth) + vi)) {
306  const int i = ii + vi;
307  lists[out_i].atomStart_i = lists[i].atomStart_i;
308 
309  int start = totalTiles;
310  int startOld = lists[i].jtileStart;
311  int endOld = lists[i+1].jtileStart;
312  lists[out_i].jtileStart = start;
313 
314  int jtile0 = start;
315  for (int jtileOld=startOld; jtileOld < endOld; jtileOld++) {
316  int jtile = jTiles.status[jtileOld];
317  jtile &= 65535;
318  if (jtile) {
319  jTiles.atomStart[jtile0] = jTiles.atomStart[jtileOld];
320  // Pack tile exclusion data for active tiles
321  __m512i excl;
322  excl = _mm512_load_epi32(jTiles.excl + (jtileOld<<4));
323  _mm512_store_epi32((void *)(jTiles.excl + (jtile0<<4)), excl);
324  jtile0++;
325  }
326  }
327  out_i++;
328  totalTiles += (*((unsigned int*)&(depth) + vi));
329  }
330  }
331  }
332  jTiles.realloc(totalTiles);
333 
334  _numLists = out_i;
335  if (_numLists)
336  lists[_numLists].jtileStart = lists[_numLists-1].jtileStart +
337  listDepth[_numLists - 1];
338 }
339 
340 void AVXTileLists::build() {
341  tiles_p0->buildBoundingBoxes(_lastBuild);
342  if (tiles_p0 != tiles_p1)
343  tiles_p1->buildBoundingBoxes(_lastBuild);
344 
345  if (_maxPart - _minPart < _numParts)
346  _buildBB<false,true>();
347  else
348  _buildBB<false,false>();
349 
350  // if hybrid tiles/pairs, set indices for i-atoms for any pairlists
351  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
352  for (int i = 0; i < _maxPairLists; i++) _numPairs[i] = 0;
353  int end = tiles_p0->numAtoms() & 15;
354  if (end <= NAMD_AVXTILES_PAIR_THRESHOLD) {
355  const int start = tiles_p0->numAtoms() - end;
356  end += start;
357  for (int i = start; i < end; i++) _pair_i[i-start] = i;
358  }
359  }
360 }
361 
362 void AVXTileLists::_realloc() {
363  if (_numListsAlloc) {
364  free(lists);
365  free(listDepth);
366  }
367  _numListsAlloc = 1.2 * _numLists;
368  int e = posix_memalign((void **)&lists, 64, sizeof(List)*(_numListsAlloc+1));
369  e = e | posix_memalign((void **)&listDepth, 64,
370  sizeof(unsigned int)*(_numListsAlloc+1));
371  if (e) NAMD_die("Could not allocate memory for tiles.");
372 
373  if (_numModifiedAlloc == 0) {
374  int newNum = tiles_p0->numAtoms();
375  if (newNum < tiles_p1->numAtoms()) newNum = tiles_p1->numAtoms();
376  reallocModified(2 * newNum);
377  // WMB: Below is only needed for full electrostatics
378  reallocExcluded(2.2 * newNum);
379  }
380  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0)
381  _reallocPairLists(tiles_p0->numAtoms() & 15, NAMD_AVXTILES_IPAIRCOUNT);
382 }
383 
384 //---------------------------------------------------------------------------
385 
386 void AVXTileLists::_reallocModified() {
387  int numModifiedAlloc = 1.2 * _numModified;
388  int *newp;
389 
390  if (posix_memalign((void **)&newp, 64, sizeof(int) * numModifiedAlloc))
391  NAMD_die("Could not allocate memory for tiles.");
392 
393  if (_numModifiedAlloc) {
394  memcpy(newp, _modified_i, sizeof(int) * _numModifiedAlloc);
395  free(_modified_i);
396  }
397  _modified_i = newp;
398 
399  if (posix_memalign((void **)&newp, 64, sizeof(int) * numModifiedAlloc))
400  NAMD_die("Could not allocate memory for tiles.");
401 
402  if (_numModifiedAlloc) {
403  memcpy(newp, _modified_j, sizeof(int) * _numModifiedAlloc);
404  free(_modified_j);
405  }
406  _modified_j = newp;
407 
408  _numModifiedAlloc = numModifiedAlloc;
409 }
410 
411 //---------------------------------------------------------------------------
412 
413 void AVXTileLists::_reallocExcluded() {
414  int numExcludedAlloc = 1.2 * _numExcluded;
415  int *newp;
416 
417  if (posix_memalign((void **)&newp, 64, sizeof(int) * numExcludedAlloc))
418  NAMD_die("Could not allocate memory for tiles.");
419 
420  if (_numExcludedAlloc) {
421  memcpy(newp, _excluded_i, sizeof(int) * _numExcludedAlloc);
422  free(_excluded_i);
423  }
424  _excluded_i = newp;
425 
426  if (posix_memalign((void **)&newp, 64, sizeof(int) * numExcludedAlloc))
427  NAMD_die("Could not allocate memory for tiles.");
428 
429  if (_numExcludedAlloc) {
430  memcpy(newp, _excluded_j, sizeof(int) * _numExcludedAlloc);
431  free(_excluded_j);
432  }
433  _excluded_j = newp;
434 
435  _numExcludedAlloc = numExcludedAlloc;
436 }
437 
438 //---------------------------------------------------------------------------
439 
440 void AVXTileLists::_reallocPairLists(const int numPairLists,
441  const int maxPairs) {
442  if (NAMD_AVXTILES_PAIR_THRESHOLD > 0) {
443  if (!_maxPairLists) {
444  _maxPairLists = NAMD_AVXTILES_PAIR_THRESHOLD;
445  _maxPairs = maxPairs;
446  int e = posix_memalign((void **)&_pair_i, 64, sizeof(int)*_maxPairLists);
447  e = e | posix_memalign((void **)&_numPairs, 64,
448  sizeof(int)*_maxPairLists);
449  e = e | posix_memalign((void **)&_pairStart, 64,
450  sizeof(int)*_maxPairLists);
451  e = e | posix_memalign((void **)&_pairLists, 64,
452  sizeof(int)*_maxPairLists*_maxPairs);
453  if (e) NAMD_die("Could not allocate memory for tiles.");
454 
455  int listStart = 0;
456  for (int i = 0; i < _maxPairLists; i++) {
457  _pairStart[i] = listStart;
458  listStart += _maxPairs;
459  }
460  } else if (maxPairs > _maxPairs) {
461  int *hold;
462  if (posix_memalign((void **)&hold, 64,
463  sizeof(int)*_maxPairLists*maxPairs))
464  NAMD_die("Could not allocate memory for tiles.");
465 
466  int listStart = 0;
467  for (int i = 0; i < _maxPairLists; i++) {
468  if (_numPairs[i]) memcpy(hold + listStart, _pairLists + _pairStart[i],
469  sizeof(int) * _numPairs[i]);
470  _pairStart[i] = listStart;
471  listStart += maxPairs;
472  }
473  free(_pairLists);
474  _pairLists = hold;
475  _maxPairs = maxPairs;
476  }
477  }
478 }
479 
480 #endif // NAMD_AVXTILES
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float const PatchPairRecord *__restrict__ const int *__restrict__ const int2 *__restrict__ const unsigned int *__restrict__ unsigned int *__restrict__ int *__restrict__ int *__restrict__ TileListStat *__restrict__ const BoundingBox *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ float *__restrict__ const int unsigned int *__restrict__ const CudaPatchRecord *__restrict__ float4 *__restrict__ float4 *__restrict__ int *__restrict__ int *__restrict__ TileListVirialEnergy *__restrict__ virialEnergy int itileList
void NAMD_die(const char *err_msg)
Definition: common.C:85
__global__ void const int numTileLists