// NAMD: ComputeNonbondedMIC.C

#include "common.h"
#include "charm++.h"

#include "WorkDistrib.h"
#include "ComputeMgr.h"
#include "ProxyMgr.h"
#include "ComputeNonbondedMIC.h"
#include "ComputeNonbondedMICKernel.h"
#include "LJTable.h"
#include "ObjectArena.h"
#include "SortAtoms.h"
#include <algorithm>

#include "NamdTypes.h"

// This file is only used for builds with Xeon Phi (MIC) support
#ifdef NAMD_MIC

#include <offload.h>
#include <queue>
#include <assert.h>


//#define PRINT_GBIS
#undef PRINT_GBIS

#ifdef WIN32
#define __thread __declspec(thread)
#endif


// Declare (extern) the structures that will be used for holding kernel-specific data since
// virial data is located within that data structure (read below).
extern __thread mic_kernel_data * host__kernel_data;
extern __thread int singleKernelFlag;

// TRACING - Declare (extern) the buffers that hold the timing data passed back from the device
#if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
  __thread double mic_first_kernel_submit_time;
  extern __thread patch_pair* host__patch_pairs;
  extern __thread int host__patch_pairs_size;
  extern __thread int host__force_lists_size;
  #if MIC_DEVICE_TRACING_DETAILED != 0
    extern __thread double* host__device_times_computes;
    extern __thread double* host__device_times_patches;
  #endif
  extern __thread double* host__device_times_start;
#endif


__thread int mic_singleKernel = -1;
__thread int mic_deviceThreshold = -1;
__thread int mic_hostSplit = -1;
__thread int mic_numParts_self_p1 = -1;
__thread int mic_numParts_pair_p1 = -1;
__thread int mic_numParts_pair_p2 = -1;


// Function used to get the number of devices
int mic_device_count = 0;
int mic_get_device_count() { return mic_device_count; }


void mic_errcheck(const char *msg) {
  // DMK - TODO : Add mechanism to detect errors here
}

void mic_die(const char *msg) {
  char host[128];
  #ifdef NOHOSTNAME
  sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
  #else
  gethostname(host, 128); host[127] = 0;
  #endif
  char devstr[128] = "";
  int devnum = mic_device_pe();
  if (devnum == CkMyPe()) {
    sprintf(devstr, " device %d", devnum);
  }
  char errmsg[1024];
  sprintf(errmsg, "MIC error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
  NAMD_die(errmsg);
}

// Function used to process MIC-specific command-line parameters along with
// global variables to hold the content (or flag value) of the parameters.
char *devicelist;
static __thread int usedevicelist;
void mic_getargs(char **argv) {
  devicelist = 0;
  usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
      "comma-delimited list of MIC device numbers such as 0,2,1,2");

  CmiGetArgInt(argv, "+micSK", &mic_singleKernel);
  CmiGetArgInt(argv, "+micDT", &mic_deviceThreshold);
  CmiGetArgInt(argv, "+micHS", &mic_hostSplit);
  CmiGetArgInt(argv, "+micSP1", &mic_numParts_self_p1);
  CmiGetArgInt(argv, "+micPP1", &mic_numParts_pair_p1);
  CmiGetArgInt(argv, "+micPP2", &mic_numParts_pair_p2);
}
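
// For example, a launch that restricts offload to coprocessors 0 and 1 and
// forces the single-kernel (no local/remote split) path might look like:
//   namd2 +devices 0,1 +micSK 1 <configfile>
// (Illustrative only; any option left unset keeps its -1 "use default" value.)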


// Variables for tracking which MIC-device is associated with which host PE.
static __thread int shared_mic;
static __thread int first_pe_sharing_mic;
static __thread int next_pe_sharing_mic;
static __thread int devicePe;
static __thread int numPesSharingDevice;
static __thread int *pesSharingDevice;

static __thread int mic_is_mine;
static __thread int myDevice;

int mic_device_pe() { return devicePe; }

bool mic_device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

static inline bool sortop_bitreverse(int a, int b) {
  if ( a == b ) return 0;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
  }
  return 0;
}
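// NOTE: Comparing from the least-significant bit first orders integers by
// their bit-reversed value, e.g. 0..7 sorts as 0,4,2,6,1,5,3,7, which
// scatters numerically adjacent ranks away from one another.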


void mic_initialize() {

  // If this is PE 0, register the MIC-specific user events with Projections
  if ( 0 == CkMyPe() ) {
    MIC_TRACING_REGISTER_EVENTS
  }

  // Get the host name for this node
  char host[128];
  #ifdef NOHOSTNAME
  sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
  #else
  gethostname(host, 128); host[127] = 0;
  #endif

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode,
                          &numPesOnPhysicalNode
                         );

  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
                 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  int deviceCount = _Offload_number_of_devices();
  if ( deviceCount < 1 ) {
    mic_die("No MIC devices found.");
  }

  int *devices;
  int ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    #pragma noinline
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
      #if 0
      int dev_ok = 0;
      #pragma offload target(mic: dev) in(dev)
      dev_ok = mic_check_local();
      #else
      int dev_ok = mic_check(dev);
      #endif
      if ( dev_ok ) devices[ndevices++] = dev;
      else CkPrintf("Offload to device %d on Pe %d failed.\n",dev,CkMyPe());
    }
  }

  mic_device_count = ndevices;

  if ( ! ndevices ) {
    mic_die("No usable MIC devices found.");
  }

  shared_mic = 0;
  mic_is_mine = 1;
  first_pe_sharing_mic = CkMyPe();
  next_pe_sharing_mic = CkMyPe();

  int dev;
  if ( numPesOnPhysicalNode > 1 ) {
    int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
    dev = devices[myDeviceRank];
    pesSharingDevice = new int[numPesOnPhysicalNode];
    devicePe = -1;
    numPesSharingDevice = 0;
    for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
      if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
        int thisPe = pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( devicePe < 1 ) devicePe = thisPe;
        if ( WorkDistrib::pe_sortop_diffuse()(thisPe,devicePe) ) devicePe = thisPe;
      }
    }
    for ( int j = 0; j < ndevices; ++j ) {
      if ( devices[j] == dev && j != myDeviceRank ) shared_mic = 1;
    }
    if ( shared_mic && devicePe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
        CkPrintf("Pe %d sharing MIC device %d\n", CkMyPe(), dev);
    }
  } else { // in case phys node code is lying
    dev = devices[CkMyPe() % ndevices];
    devicePe = CkMyPe();
    pesSharingDevice = new int[1];
    pesSharingDevice[0] = CkMyPe();
    numPesSharingDevice = 1;
  }

  if ( devicePe != CkMyPe() ) {
    if ( CmiPhysicalNodeID(devicePe) < 2 )
      CkPrintf("Pe %d physical rank %d will use MIC device of pe %d\n",
               CkMyPe(), myRankInPhysicalNode, devicePe);
    myDevice = -1;
    return;
  }

  // Disable token-passing, but if shared_mic is true don't submit local work
  // until remote work has finished; otherwise submit all work immediately.
  first_pe_sharing_mic = CkMyPe();
  next_pe_sharing_mic = CkMyPe();

  mic_is_mine = ( first_pe_sharing_mic == CkMyPe() );

  if ( dev >= deviceCount ) {
    char buf[256];
    sprintf(buf,"Pe %d unable to bind to MIC device %d on %s because only %d devices are present",
            CkMyPe(), dev, host, deviceCount);
    NAMD_die(buf);
  }

  if ( CmiPhysicalNodeID(devicePe) < 2 )
    CkPrintf("Pe %d physical rank %d binding to MIC device %d on %s: %d procs %d threads \n",
             CkMyPe(), myRankInPhysicalNode, dev, host,
             omp_get_num_procs_target(TARGET_MIC,dev),
             omp_get_max_threads_target(TARGET_MIC,dev));

  myDevice = dev;

  // Initialize the MIC device (note, this should only be called once per device)
  mic_init_device(CkMyPe(), CkMyNode(), myDevice);

  // Verify that the binding to the device succeeded
  int dev_ok = mic_check(dev);
  if ( ! dev_ok ) {
    char errmsg[1024];
    sprintf(errmsg,"Failed binding to device %d on Pe %d", dev, CkMyPe());
    NAMD_die(errmsg);
  }
}

void mic_free() {
  if (devicePe != CkMyPe()) { return; }
  mic_free_device(myDevice);
}

static __thread ComputeNonbondedMIC* micCompute = 0;
static __thread ComputeMgr *computeMgr = 0;

void send_build_mic_force_table() {
  computeMgr->sendBuildMICForceTable();
}

void build_mic_force_table() {
  if (devicePe != CkMyPe()) { return; }
  // ... (body not shown in this listing)
}

void ComputeNonbondedMIC::bind_lj_table(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  const int lj_table_dim = ljTable->get_table_dim();
  const int lj_table_size = 2 * lj_table_dim * lj_table_dim * sizeof(LJTable::TableEntry);
  const LJTable::TableEntry *lj_table_base_ptr = ljTable->get_table();
  mic_bind_lj_table(deviceNum, (char*)lj_table_base_ptr, lj_table_dim, lj_table_size);
}

void ComputeNonbondedMIC::bind_force_table(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  mic_bind_table_four(deviceNum, mic_table_base_ptr, mic_table_n, mic_table_n_16);
}

void ComputeNonbondedMIC::bind_constants(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  mic_bind_constants(deviceNum
                     // ... (argument list not shown in this listing)
                    );
}

struct exlist_sortop {
  bool operator() (int32 *li, int32 *lj) {
    return ( li[1] < lj[1] );
  }
};

static __thread int2 *exclusionsByAtom = NULL;

void ComputeNonbondedMIC::bind_exclusions(int deviceNum) {

  // Only do this on PEs that directly control devices (simply return for PEs that do not)
  if (devicePe != CkMyPe()) { return; }

  // DMK - Info...
  // Each atom has a globally unique index in the molecule object. For any
  // given atom, the exclusion list in the molecule object contains a list of
  // the other atoms (referenced by index) that are "excluded" for the
  // given atom. These exclusion lists tend to follow a set of patterns
  // (i.e. all water molecules "look" the same). For this reason, we will store
  // patterns rather than lists, and associate each atom with the appropriate
  // pattern (e.g. 'all oxygen atoms within water molecules will exclude the
  // hydrogen atoms attached to them in the global list of atoms,' rather than
  // both '[water_oxygen_1 excludes water_hydrogen_1 and water_hydrogen_2] and
  // [water_oxygen_2 excludes water_hydrogen_3 and water_hydrogen_4]'). The code
  // below creates those patterns and associates atoms with the appropriate
  // patterns. By using these common patterns, the memory required is reduced.
  // To do this, the patterns use distances between the indexes of two atoms
  // that exclude one another, rather than absolute indexes of those same atoms.
  // A pattern is an array of bits (2 bits per entry) that ranges from negative
  // offsets to positive offsets. For example, for a water molecule made up of
  // one oxygen (O1 w/ index i) and two hydrogen (H1 with index i+1 and H2 with
  // index i+2) atoms, the pattern for the oxygen atom (O1) would have 5 entries
  // (10 bits total) and would range from offset -2 to 2 (always -X to +X). The
  // pattern for the first hydrogen (H1) would have 3 entries (6 bits total)
  // and would range from offset -1 (O1) to 1 (H2). The value of each 2-bit
  // entry indicates the type of interaction (NORMAL, MODIFIED, or EXCLUDED).
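  //
  // As a concrete illustration of the scheme described above (using the
  // encoding from the loop below: bit 0x01 = EXCLUDED, bit 0x02 = MODIFIED,
  // 0x00 = NORMAL), O1's pattern over offsets -2..+2 would be:
  //   offset:  -2  -1   0  +1  +2
  //   entry:   00  00  01  01  01   (self and both hydrogens excluded)
  // and H1's pattern over offsets -1..+1 would be 01 01 01 (O1, self, H2).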

  // Grab a pointer to the molecule object, which contains information about the
  // particle system being simulated.
  Molecule *mol = Node::Object()->molecule;

  // Get the number of atoms
  #ifdef MEM_OPT_VERSION
  int natoms = mol->exclSigPoolSize;
  #else
  int natoms = mol->numAtoms;
  #endif

  // A data structure that will store the offsets and ranges of the patterns
  // created by the code below (see the comments below) for each atom,
  // indicating which pattern should be used for the atom and which other
  // atoms need to be checked against the pattern for this atom.
  exclusionsByAtom = new int2[natoms];

  // Declare some tmp variables/buffers used in the code below.
  ObjectArena<int32> listArena;
  ResizeArray<int32*> unique_lists;
  int32 **listsByAtom = new int32*[natoms];
  SortableResizeArray<int32> curExclList;
  SortableResizeArray<int32> curModList;
  SortableResizeArray<int32> curList;

  // For each atom...
  for (int i = 0; i < natoms; i++) {

    // Clear the current lists
    // NOTE: The values in these lists are offsets between the atom indexes
    curExclList.resize(0); // EXCLUDED (full) list
    curModList.resize(0);  // MODIFIED list
    curList.resize(0);     // Combined list

    // Always exclude self (index offset to self is zero)
    curExclList.add(0);

    // Get the exclusions for this atom from the molecule object, adding each
    // one to the current lists, converting absolute indexes to offsets as
    // required.
    // NOTE: NAMD uses a macro called MEM_OPT_VERSION to adjust some of its data
    // structures to be more memory efficient. Whether or not MEM_OPT_VERSION
    // is set changes how the exclusion information for atoms is stored in the
    // molecule object.
    int n = 0;
    int min_index = 0; // Furthest negative index offset
    int max_index = 0; // Furthest positive index offset
    #if MEM_OPT_VERSION
    const ExclusionSignature *sig = mol->exclSigPool + i;
    n = sig->fullExclCnt;
    for (int j = 0; j < n; j++) {
      int index = sig->fullOffset[j]; // NOTE: Already an index offset
      if (index < min_index) { min_index = index; }
      if (index > max_index) { max_index = index; }
      curExclList.add(index);
    }
    n = sig->modExclCnt;
    for (int j = 0; j < n; j++) {
      int index = sig->modOffset[j]; // NOTE: Already an index offset
      if (index < min_index) { min_index = index; }
      if (index > max_index) { max_index = index; }
      curModList.add(index);
    }
    #else
    const int32 *full_excl = mol->get_full_exclusions_for_atom(i);
    n = full_excl[0] + 1;
    for (int j = 1; j < n; j++) {
      int index = full_excl[j] - i; // NOTE: An absolute index, so make offset
      if (index < min_index) { min_index = index; }
      if (index > max_index) { max_index = index; }
      curExclList.add(index);
    }
    const int32 *mod_excl = mol->get_mod_exclusions_for_atom(i);
    n = mod_excl[0] + 1;
    for (int j = 1; j < n; j++) {
      int index = mod_excl[j] - i;
      if (index < min_index) { min_index = index; }
      if (index > max_index) { max_index = index; }
      curModList.add(index);
    }
    #endif
    int maxDiff = -1 * min_index;
    if (maxDiff < max_index) { maxDiff = max_index; }
    // NOTE: maxDiff = max(abs(min_index), max_index);
    curExclList.sort(); curExclList.add(-1 * maxDiff - 1);
    curModList.sort(); curModList.add(-1 * maxDiff - 1);
    // NOTE: curExclList and curModList now contain the list of offsets for
    // atoms that are full and modified excluded (respectively) for this atom
    // (i) in sorted order, with a '-1 * maxDiff - 1' value at the end (this
    // last value is added for the sake of the "matching code" in the loop below).

    // Create a single list with the combined exclusion info of the two separate lists
    n = 2 * maxDiff + 1; // NOTE: pattern ranges from -maxDiff to +maxDiff, inclusive
    curList.resize(n);
    int curExclListIndex = 0;
    int curModListIndex = 0;

    // For each entry in the combined list...
    for (int j = -1 * maxDiff; j <= maxDiff; j++) {

      // Assume normal
      int bitPattern = 0x00;

      // NOTE: curExclList and curModList are in sorted order and we are moving
      // through the combined list in-order, so we only need to check one
      // index in the separate lists (moving ahead in the list when a match
      // is found).

      // Check if is actually excluded...
      if (curExclList[curExclListIndex] == j) {
        bitPattern |= 0x01;
        curExclListIndex++;
      }

      // Check if is actually modified...
      if (curModList[curModListIndex] == j) {
        bitPattern |= 0x02;
        curModListIndex++;
      }

      // Store the generated pattern entry
      curList[j + maxDiff] = bitPattern;
    }

    // NOTE: curList now contains the bit pattern (isModified and isExcluded
    // flags) for this atom (i) based on the offsets of the other atoms.

    // Search through the list of patterns that have already been created, checking
    // to see if the current pattern already exists. If so, just use the existing
    // pattern. If not, add this new, unique pattern.
    // For each existing pattern...
    int j = 0;
    for (j = 0; j < unique_lists.size(); j++) {

      // Check to see if the size matches (skip if it does not, obviously not a match)
      if (n != unique_lists[j][0]) { continue; }

      // Check each entry in the current pattern vs this existing pattern, breaking
      // from the loop if a mismatch is found (i.e. causing the final value of k
      // to be less than the pattern length).
      int k = 0;
      for (; k < n; k++) { // For each element in the list...
        if (unique_lists[j][k+3] != curList[k]) { break; } // ... check for mismatch
      }
      if (k == n) { break; } // If full match, stop searching (NOTE: j < unique_lists.size() will be true)
    }

    // If no match was found (i.e. j == length of existing pattern list) in the search
    // loop above, then add the current pattern to the end of the list of patterns.
    if (j == unique_lists.size()) {

      // Create a new list with size n
      int32 *list = listArena.getNewArray(n + 3);
      list[0] = n;       // Entry [0] = length of pattern (not including list[0] or list[1])
      list[1] = maxDiff; // Entry [1] = offset range
      // NOTE: Entry [2] will be filled in later

      // Entries [3+] contain the bit pattern entries
      for (int k = 0; k < n; k++) { list[k + 3] = curList[k]; }

      // Add this pattern to the list of patterns
      unique_lists.add(list); // NOTE: This adds the new list at unique_lists[j]
    }

    // Note the pattern for this atom (whether it was found or added to the list of patterns)
    listsByAtom[i] = unique_lists[j];

  } // end for (i < natoms)

  // Sort the list of patterns, placing the smaller patterns first
  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());

  // Now, we create a data structure that simply stores the patterns, using 2 bits
  // per entry per atom.

  // Count the total bits required to store the lists (2 bits per entry in each
  // pattern). At the same time, note the offsets for the "self entry" for each
  // pattern in the final list of bits for each pattern. We want to use the
  // "self entry" as the "start" so that we can easily create an offset into the
  // pattern by subtracting the indexes of the two atoms and multiplying that
  // by 2 bits during simulation.
  long int totalBits = 0;
  int nlists = unique_lists.size();
  // For each pattern in the list of patterns...
  for (int j = 0; j < nlists; j++) {
    int32 *list = unique_lists[j];        // Get the list
    int maxDiff = list[1];                // Get the range for the pattern
    list[2] = totalBits + (2 * maxDiff);  // Note the offset for the "self entry"/"start" for this pattern
    totalBits += 2 * (2 * maxDiff + 1);   // Add the total bits needed for this pattern
  }

  // For each atom, record the "start" of the list (i.e. the "self" position)
  for (int i = 0; i < natoms; i++) {
    exclusionsByAtom[i].x = (listsByAtom[i])[1]; // maxDiff or range of pattern
    exclusionsByAtom[i].y = (listsByAtom[i])[2]; // "start" (self offset in bits)
  }

  // Cleanup listsByAtom (no longer required)
  delete [] listsByAtom; listsByAtom = NULL;

  // Round up the total number of bits to a multiple of sizeof(unsigned int)
  const long int uintBitCnt = sizeof(unsigned int) * 8;
  if (totalBits & (uintBitCnt - 1)) {
    totalBits += (uintBitCnt - (totalBits & (uintBitCnt - 1)));
  }
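  // (For example, with 32-bit unsigned ints, totalBits = 70 has 70 & 31 = 6,
  // so 26 padding bits are added to reach 96, i.e. exactly three words.)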
  long int totalBytes = totalBits / 8;

  // If this is the first physical node, print some info...
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
             unique_lists.size(), totalBytes);
  }

  // Allocate the memory required (using 'unsigned int' as the array's data type)
  // and initialize the memory to zero.
  long int totalUInts = totalBits / (sizeof(unsigned int) * 8);
  unsigned int *exclusion_bits = new unsigned int[totalUInts];
  memset(exclusion_bits, 0, totalBytes); // Zero-out the data

  // Fill in the bits
  long int offset = 0; // Offset of current list in exclusion_bits array
  // For each of the patterns...
  for (int i = 0; i < unique_lists.size(); i++) {

    // Get the pattern
    int32 *list = unique_lists[i];

    // Sanity check: Verify that the base value matches with the stored "start"
    // values (i.e. the start of this list is where we expect it to be)
    if (offset + (2 * list[1]) != list[2]) {
      NAMD_bug("ComputeNonbondedMIC::bind_exclusions offset mismatch");
    }

    // Pack the bits from this list into the exclusion_bits array
    const int n = list[0];
    for (int j = 0; j < n; j++) {
      const int offset_j = offset + (2 * j); // NOTE: Each entry is 2 bits
      const int offset_major = offset_j / (sizeof(unsigned int) * 8);
      const int offset_minor = offset_j % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
      const int entry_mask = (list[j + 3] & 0x3) << offset_minor;
      exclusion_bits[offset_major] |= entry_mask;
    }

    // Move the offset forward
    offset += 2 * (2 * list[1] + 1);
  }

  // Now that the bits for all of the patterns have been placed into the
  // exclusion_bits array, push this array down to the device for use
  // during simulation.
  mic_bind_exclusions(deviceNum, exclusion_bits, totalUInts);

  // Cleanup
  // NOTE: The individual lists will be free'ed as the arena is destroyed,
  // along with the resize arrays (curModList, etc.).
  delete [] exclusion_bits;
}
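
// A minimal decoding sketch (illustrative only, not part of the original
// source): given the layout built above (.x = maxDiff, the pattern range, and
// .y = the bit position of the "self entry") and assuming 32-bit unsigned
// ints in the packed array, the interaction kind for an atom pair (i, j)
// could be recovered as follows. The helper name is hypothetical.
static inline int mic_example_exclusion_kind(const unsigned int *bits,
                                             const int2 *exclByAtom,
                                             int i, int j) {
  const int diff = j - i;                       // index offset between the two atoms
  const int maxDiff = exclByAtom[i].x;          // pattern covers -maxDiff..+maxDiff
  if (diff < -maxDiff || diff > maxDiff) return 0x0;    // outside pattern => NORMAL
  const long pos = exclByAtom[i].y + 2 * diff;  // bit position of this pair's entry
  return (int)((bits[pos / 32] >> (pos % 32)) & 0x3);   // 0x1 EXCLUDED, 0x2 MODIFIED
}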


// Register a "self compute" on the host with this MIC meta-compute object, creating a
// compute_record for the self compute.
void register_mic_compute_self(ComputeID c, PatchID pid, int part, int numParts) {

  if ( ! micCompute ) NAMD_bug("register_self called early");

  // DMK - DEBUG
  MICP("register_mic_compute_self(c:%d, pid:%d, part:%d, numParts:%d) - Called...\n", c, pid, part, numParts); MICPF;

  // Indicate that the mic compute requires the patch information
  // associated with the given self compute
  micCompute->requirePatch(pid);

  // Create each part
  numParts = mic_numParts_self_p1;
  if (numParts < 1) { numParts = 1; }
  for (int part = 0; part < numParts; part++) {

    // Create a compute record within the mic compute that represents
    // the given self compute
    ComputeNonbondedMIC::compute_record cr;
    cr.c = c;
    cr.pid[0] = pid; cr.pid[1] = pid;
    cr.offset = 0.;
    cr.isSelf = 1;

    cr.part = part;
    cr.numParts = numParts;

    if (singleKernelFlag != 0 || micCompute->patchRecords[pid].isLocal) {
      micCompute->localComputeRecords.add(cr);
    } else {
      micCompute->remoteComputeRecords.add(cr);
    }

  }
}

void register_mic_compute_pair(ComputeID c, PatchID pid[], int t[], int part, int numParts) {

  if ( ! micCompute ) NAMD_bug("register_pair called early");

  // DMK - DEBUG
  MICP("register_mic_compute_pair(c:%d, pid:{%d,%d}, t:--, part:%d, numParts:%d) - Called...\n", c, pid[0], pid[1], part, numParts); MICPF;

  // Indicate that the mic compute requires the patch information
  // associated with the given pair compute
  micCompute->requirePatch(pid[0]);
  micCompute->requirePatch(pid[1]);

  // Calculate the offset that will need to be applied to the atom positions (which are
  // stored as offsets from the center of their respective patches)
  int t1 = t[0];
  int t2 = t[1];
  Vector offset = micCompute->patchMap->center(pid[0])
                - micCompute->patchMap->center(pid[1]);
  offset.x += (t1%3-1) - (t2%3-1);
  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
  offset.z += (t1/9-1) - (t2/9-1);

  // Calculate the Manhattan distance between the patches and use that to determine how
  // many parts the pair compute should be broken up into
  ComputeMap *computeMap = ComputeMap::Object();
  PatchMap *patchMap = PatchMap::Object();

  int aSize = patchMap->gridsize_a();
  int bSize = patchMap->gridsize_b();
  int cSize = patchMap->gridsize_c();
  int trans0 = computeMap->trans(c, 0);
  int trans1 = computeMap->trans(c, 1);
  int index_a0 = patchMap->index_a(pid[0]) + aSize * Lattice::offset_a(trans0);
  int index_b0 = patchMap->index_b(pid[0]) + bSize * Lattice::offset_b(trans0);
  int index_c0 = patchMap->index_c(pid[0]) + cSize * Lattice::offset_c(trans0);
  int index_a1 = patchMap->index_a(pid[1]) + aSize * Lattice::offset_a(trans1);
  int index_b1 = patchMap->index_b(pid[1]) + bSize * Lattice::offset_b(trans1);
  int index_c1 = patchMap->index_c(pid[1]) + cSize * Lattice::offset_c(trans1);
  int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
  int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
  int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
  int manDist = da + db + dc;

  // Create each part
  numParts = mic_numParts_pair_p1 - (mic_numParts_pair_p2 * manDist);
  if (numParts < 1) { numParts = 1; }
  for (int part = 0; part < numParts; part++) {

    // Create a compute record within the mic compute that represents
    // the given pair compute
    ComputeNonbondedMIC::compute_record cr;
    cr.c = c;
    cr.pid[0] = pid[0]; cr.pid[1] = pid[1];
    cr.offset = offset;
    cr.isSelf = 0;

    cr.part = part;
    cr.numParts = numParts;

    // If splitting the kernels up into "local" and "remote" kernels, place the pair compute in
    // the "remote" kernel if either of the patches is "remote"... otherwise, mark as "local"
    if ((singleKernelFlag != 0) ||
        (micCompute->patchRecords[pid[0]].isLocal && micCompute->patchRecords[pid[1]].isLocal)
       ) {
      micCompute->localComputeRecords.add(cr);
    } else {
      micCompute->remoteComputeRecords.add(cr);
    }

  } // end for (part < numParts)
}
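
// For example, with +micPP1 4 and +micPP2 1, a pair compute whose patches are
// one patch apart (manDist = 1) is split into 4 - 1*1 = 3 parts, while any
// combination yielding a value below 1 is clamped to a single part.
// (Illustrative parameter values only.)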


void unregister_mic_compute(ComputeID c) { // static
  NAMD_bug("unregister_compute unimplemented");
}


// DMK - DEBUG - A function that periodically prints a "heartbeat" output
void mic_heartbeat(void *arg, double t) {
  #if MIC_DEBUG != 0
  MICP("[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); MICPF;
  #else
  printf("[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); fflush(NULL);
  #endif
  CcdCallFnAfter(mic_heartbeat, NULL, 1000.0);
}


ComputeNonbondedMIC::ComputeNonbondedMIC(ComputeID c,
                                         ComputeMgr *mgr,
                                         ComputeNonbondedMIC *m,
                                         int idx
                                        ) : Compute(c), slaveIndex(idx) {

  #ifdef PRINT_GBIS
  CkPrintf("C.N.MIC[%d]::constructor cid=%d\n", CkMyPe(), c);
  #endif

  // DMK - DEBUG
  #if MIC_HEARTBEAT != 0
  mic_heartbeat(NULL, CmiWallTimer());
  #endif

  // DMK - DEBUG
  MICP("ComputeNonbondedMIC::ComputeNonbondedMIC(cid:%d) - Called...\n", c); MICPF;

  master = m ? m : this;
  micCompute = this;
  computeMgr = mgr;
  patchMap = PatchMap::Object();
  atomMap = AtomMap::Object();
  reduction = 0;

  SimParameters *params = Node::Object()->simParameters;
  if (params->pressureProfileOn) {
    NAMD_die("pressure profile not supported in MIC");
  }
  if (mic_singleKernel >= 0) {
    singleKernelFlag = ((mic_singleKernel != 0) ? (1) : (0));
  } else {
    singleKernelFlag = ((params->mic_singleKernel != 0) ? (1) : (0));
  }

  atomsChanged = 1;
  computesChanged = 1;

  pairlistsValid = 0;
  pairlistTolerance = 0.;
  usePairlists = 0;
  savePairlists = 0;
  plcutoff2 = 0.;

  workStarted = 0;
  basePriority = PROXY_DATA_PRIORITY;
  localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;

  // Master and slaves
  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
  micDevice = -1;
  exclusionsByAtom_ptr = NULL;
  atomSubmitSignals = new std::set<void*>(); __ASSERT(atomSubmitSignals != NULL);
  #endif

  // Print some info for the user
  // NOTE: Do this before the master/slave check below (so PE 0 will reach here)
  if (CkMyPe() == 0) {
    iout << iINFO << "MIC SINGLE OFFLOAD: " << ((singleKernelFlag != 0) ? ("SET") : ("UNSET")) << "\n" << endi;
    iout << iINFO << "MIC DEVICE THRESHOLD: " << mic_deviceThreshold << "\n" << endi;
    iout << iINFO << "MIC HOST SPLIT: " << mic_hostSplit << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS SELF P1: " << mic_numParts_self_p1 << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS PAIR P1: " << mic_numParts_pair_p1 << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS PAIR P2: " << mic_numParts_pair_p2 << "\n" << endi;
  }

  if ( master != this ) { // I am a slave
    masterPe = master->masterPe;
    master->slaves[slaveIndex] = this;
    if ( master->slavePes[slaveIndex] != CkMyPe() ) {
      NAMD_bug("ComputeNonbondedMIC slavePes[slaveIndex] != CkMyPe");
    }
    registerPatches();
    return;
  } else { // I am a master, identify self to ComputeMgr for load balancing data
    computeMgr->sendMICPEData(CkMyPe(), 1);
    patchRecords.resize(patchMap->numPatches());
  }
  masterPe = CkMyPe();

  if (myDevice >= 0) {
    bind_exclusions(myDevice);
  }

  // Master only
  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
  micDevice = myDevice; __ASSERT(micDevice >= 0);
  exclusionsByAtom_ptr = exclusionsByAtom; __ASSERT(exclusionsByAtom_ptr != NULL);
  #endif

  // Initialize the exclusion contribution sum
  exclContrib = 0;

  // DMK - DEBUG
  timestep = 0;
}


ComputeNonbondedMIC::~ComputeNonbondedMIC() {
  #if MIC_DEBUG > 0
  debugClose();
  #endif
}

void ComputeNonbondedMIC::requirePatch(int pid) {

  computesChanged = 1;
  patch_record &pr = patchRecords[pid];
  if ( pr.refCount == 0 ) {
    //if ( CkNumNodes() < 2 ) {
    //  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^ patchMap->index_b(pid) ^ patchMap->index_c(pid) );
    //} else {
    pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
    //}

    if ( singleKernelFlag != 0 || pr.isLocal ) {
      localActivePatches.add(pid);
    } else {
      remoteActivePatches.add(pid);
    }

    activePatches.add(pid);
    pr.patchID = pid;
    pr.hostPe = -1;
    pr.x = NULL;
    pr.xExt = NULL;
    pr.r = NULL;
    pr.f = NULL;
    pr.intRad = NULL;
    pr.psiSum = NULL;
    pr.bornRad = NULL;
    pr.dEdaSum = NULL;
    pr.dHdrPrefix = NULL;
  }
  pr.refCount += 1;
}

void ComputeNonbondedMIC::registerPatches() {

  SimParameters *simParams = Node::Object()->simParameters;
  int npatches = master->activePatches.size();
  int *pids = master->activePatches.begin();
  patch_record *recs = master->patchRecords.begin();
  const int myPE = CkMyPe();

  for ( int i=0; i<npatches; ++i ) {
    int pid = pids[i];
    patch_record &pr = recs[pid];
    if ( pr.hostPe == myPE ) {

      hostedPatches.add(pid);

      if ( singleKernelFlag != 0 || pr.isLocal ) {
        localHostedPatches.add(pid);
      } else {
        remoteHostedPatches.add(pid);
      }

      ProxyMgr::Object()->createProxy(pid);
      pr.p = patchMap->patch(pid);
      pr.positionBox = pr.p->registerPositionPickup(this);

      pr.forceBox = pr.p->registerForceDeposit(this);
      if (simParams->GBISOn) {
        pr.intRadBox = pr.p->registerIntRadPickup(this);
        pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
        pr.bornRadBox = pr.p->registerBornRadPickup(this);
        pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
        pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(this);
      }
    }
  }
  if ( master == this ) setNumPatches(activePatches.size());
  else setNumPatches(hostedPatches.size());

  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
    CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
}


void ComputeNonbondedMIC::assignPatches() {

  // DMK - DEBUG
  MICP("ComputeNonbondedMIC::assignPatches() - Called...\n"); MICPF;

  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
  int numPesOnNodeSharingDevice = 0;
  int masterIndex = -1;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    int pe = pesSharingDevice[i];
    if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
    if ( CkNodeOf(pe) == CkMyNode() ) {
      pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
    }
  }

  int npatches = activePatches.size();

  if ( npatches ) {
    reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
  }

  int *count = new int[npatches];
  memset(count, 0, sizeof(int)*npatches);
  int *pcount = new int[numPesOnNodeSharingDevice];
  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
  int *rankpcount = new int[CkMyNodeSize()];
  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
  char *table = new char[npatches*numPesOnNodeSharingDevice];
  memset(table, 0, npatches*numPesOnNodeSharingDevice);

  int unassignedpatches = npatches;

  if ( 0 ) { // assign all to device pe
    for ( int i=0; i<npatches; ++i ) {
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      pr.hostPe = CkMyPe();
    }
    unassignedpatches = 0;
    pcount[masterIndex] = npatches;
  } else

  // assign if home pe and build table of natural proxies
  for ( int i=0; i<npatches; ++i ) {
    int pid = activePatches[i];
    patch_record &pr = patchRecords[pid];
    int homePe = patchMap->node(pid);

    for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
      int pe = pesOnNodeSharingDevice[j];
      if ( pe == homePe ) {
        pr.hostPe = pe; --unassignedpatches;
        pcount[j] += 1;
      }
      if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
        table[i*numPesOnNodeSharingDevice+j] = 1;
      }
    }
    if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
      pr.hostPe = homePe; --unassignedpatches;
      rankpcount[CkRankOf(homePe)] += 1;
    }
  }

  // assign if only one pe has a required proxy
  int assignj = 0;
  for ( int i=0; i<npatches; ++i ) {
    int pid = activePatches[i];
    patch_record &pr = patchRecords[pid];
    if ( pr.hostPe != -1 ) continue;
    int c = 0;
    int lastj;
    for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
      if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
    }
    count[i] = c;
    if ( c == 1 ) {
      pr.hostPe = pesOnNodeSharingDevice[lastj];
      --unassignedpatches;
      pcount[lastj] += 1;
    }
  }
  while ( unassignedpatches ) {
    int i;
    for ( i=0; i<npatches; ++i ) {
      if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      if ( pr.hostPe != -1 ) continue;
      pr.hostPe = pesOnNodeSharingDevice[assignj];
      --unassignedpatches;
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
      break;
    }
    if ( i<npatches ) continue; // start search again
    for ( i=0; i<npatches; ++i ) {
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      if ( pr.hostPe != -1 ) continue;
      if ( count[i] ) continue;
      pr.hostPe = pesOnNodeSharingDevice[assignj];
      --unassignedpatches;
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
      break;
    }
    if ( i<npatches ) continue; // start search again
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
  }

  // for ( int i=0; i<npatches; ++i ) {
  //   int pid = activePatches[i];
  //   patch_record &pr = patchRecords[pid];
  //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
  // }

  slavePes = new int[CkMyNodeSize()];
  slaves = new ComputeNonbondedMIC*[CkMyNodeSize()];
  numSlaves = 0;
  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
    int pe = pesOnNodeSharingDevice[j];
    int rank = pe - CkNodeFirst(CkMyNode());
    // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
    //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
    if ( pe == CkMyPe() ) continue;
    if ( ! pcount[j] && ! rankpcount[rank] ) continue;
    rankpcount[rank] = 0; // skip in rank loop below
    slavePes[numSlaves] = pe;
    computeMgr->sendCreateNonbondedMICSlave(pe,numSlaves);
    ++numSlaves;
  }
  for ( int j=0; j<CkMyNodeSize(); ++j ) {
    int pe = CkNodeFirst(CkMyNode()) + j;
    // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
    //          CkMyPe(),j,pe,rankpcount[j]);
    if ( ! rankpcount[j] ) continue;
    if ( pe == CkMyPe() ) continue;
    slavePes[numSlaves] = pe;
    computeMgr->sendCreateNonbondedMICSlave(pe,numSlaves);
    ++numSlaves;
  }
  registerPatches();

  delete [] pesOnNodeSharingDevice;
  delete [] count;
  delete [] pcount;
  delete [] rankpcount;
  delete [] table;
}

static __thread int num_atom_records_allocated;
static __thread float *energy_gbis;

//GBIS host pointers
static __thread float *intRad0H;
static __thread float *intRadSH;
//static __thread GBReal *psiSumH; //moved into class
static __thread float *bornRadH;
//static __thread GBReal *dEdaSumH; //moved into class
static __thread float *dHdrPrefixH;

static __thread int mic_timer_count;
static __thread double mic_timer_total;
static __thread double kernel_time;
static __thread double remote_submit_time;
static __thread double local_submit_time;

// TRACING
#if MIC_TRACING != 0
  __thread double mic_tracing_offload_start_remote = 0.0;
  __thread double mic_tracing_offload_start_local = 0.0;
  __thread double mic_tracing_offload_end_remote = 0.0;
  __thread double mic_tracing_offload_end_local = 0.0;
  static __thread double mic_tracing_polling_start = 0.0;
  static __thread int mic_tracing_polling_count = 0;
  #define MIC_TRACING_POLLING_SET_FREQ ( 100 )
#endif

#define MIC_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
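// (CcdCallFnAfter schedules FN(ARG) through Charm++'s conditional-callback
// layer; the 0.1 here is the delay in milliseconds before the next poll.)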

#ifdef PRINT_GBIS
#define GBISP(...) CkPrintf(__VA_ARGS__);
#else
#define GBISP(...)
#endif


#define count_limit 5000000 // DMK - NOTE : I have this set fairly high so that I can test executions
                            //   that use a single thread on the MIC device, which can take quite some
                            //   time if using printfs to display intermediate debug values.
static __thread int check_remote_count;
static __thread int check_local_count;

void mic_check_remote_progress(void *arg, double) {

  // If the offloaded work is finished, trigger finishWork()
  if (mic_check_remote_kernel_complete(myDevice)) {

    ComputeNonbondedMIC* compute = (ComputeNonbondedMIC*)arg;
    #if MIC_SYNC_OUTPUT != 0
      #if MIC_TRACING != 0
      double xfer_start = CmiWallTimer();
      #endif
      mic_transfer_output(myDevice,
                          1,
                          compute->num_local_atom_records,
                          compute->doSlow
                         );
      #if MIC_TRACING != 0
      double xfer_end = CmiWallTimer();
      MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA, xfer_start, xfer_end);
      #endif
    #endif
    //#if MIC_TRACING != 0
    //  double now = CmiWallTimer();
    //  mic_tracing_offload_end_remote = now;
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_REMOTE, mic_tracing_offload_start_remote, now);
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //  mic_tracing_polling_count = 0;
    //#endif
    check_remote_count = 0;
    mic_errcheck("at mic remote stream completed");
    // ... (completion notification elided in this listing)

    // DMK - DEBUG
    MICP("[DEBUG:%d] :: << detected remote kernel completion >>\n", CkMyPe()); MICPF;

  // Otherwise, check to see if it has been a long time (if so, timeout with error)
  } else if (++check_remote_count >= count_limit) {
    char errmsg[256];
    sprintf(errmsg,
            "mic_check_remote_progress polled %d times over %f s on step %d",
            check_remote_count, CkWallTimer() - remote_submit_time,
            ((ComputeNonbondedMIC*)arg)->step
           );
    //mic_errcheck(errmsg);
    NAMD_die(errmsg);

  // Otherwise, queue another poll attempt in the future to try again later
  } else {

    //#if MIC_TRACING != 0
    //  if (++mic_tracing_polling_count > MIC_TRACING_POLLING_SET_FREQ) {
    //    double now = CmiWallTimer();
    //    MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //    mic_tracing_polling_start = now;
    //    mic_tracing_polling_count = 0;
    //  }
    //#endif
    MIC_POLL(mic_check_remote_progress, arg);
  }
}

void mic_check_local_progress(void *arg, double) {

  // If the offloaded work is finished, trigger finishWork()
  if (mic_check_local_kernel_complete(myDevice)) {

    ComputeNonbondedMIC* compute = (ComputeNonbondedMIC*)arg;
    #if MIC_SYNC_OUTPUT != 0
      #if MIC_TRACING != 0
      double xfer_start = CmiWallTimer();
      #endif
      mic_transfer_output(myDevice,
                          0,
                          compute->num_local_atom_records,
                          compute->doSlow
                         );
      #if MIC_TRACING != 0
      double xfer_end = CmiWallTimer();
      MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA, xfer_start, xfer_end);
      #endif
    #endif
    //#if MIC_TRACING != 0
    //  double now = CmiWallTimer();
    //  mic_tracing_offload_end_local = now;
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_LOCAL, mic_tracing_offload_start_local, now);
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //  mic_tracing_polling_count = 0;
    //#endif
    check_local_count = 0;
    mic_errcheck("at mic local stream completed");
    // ... (completion notification elided in this listing)

    // DMK - DEBUG
    MICP("[DEBUG:%d] :: << detected local kernel completion >>\n", CkMyPe()); MICPF;

  // Otherwise, check to see if it has been a long time (if so, timeout with error)
  } else if (++check_local_count >= count_limit) {
    char errmsg[256];
    sprintf(errmsg,
            "mic_check_local_progress polled %d times over %f s on step %d",
            check_local_count, CkWallTimer() - local_submit_time,
            ((ComputeNonbondedMIC*)arg)->step
           );
    //mic_errcheck(errmsg);
    NAMD_die(errmsg);

  // Shouldn't be polling for local complete until remote complete was already detected (check for this case)
  } else if ( check_remote_count ) {
    NAMD_bug("nonzero check_remote_count in mic_check_local_progress");

  // Otherwise, queue another poll attempt in the future to try again later
  } else {
    MIC_POLL(mic_check_local_progress, arg);
  }
}


static __thread int kernel_launch_state = 0;

struct cr_sortop {
  const Lattice &l;
  cr_sortop(const Lattice &lattice) : l(lattice) { }
  bool operator() (ComputeNonbondedMIC::compute_record i,
                   ComputeNonbondedMIC::compute_record j) {
    Vector a = l.a();
    Vector b = l.b();
    Vector c = l.c();
    BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
    BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
    return ( ri < rj );
  }
};
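// (Used with std::stable_sort in doWork() below: compute records are ordered
// by the squared length of their patch-to-patch offset in the current
// lattice, so pairs of nearby patches, which typically have the most
// interactions, are issued first.)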


#if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
void ComputeNonbondedMIC::patchReady(PatchID patchID, int doneMigration, int seq) {

  // This function overrides the Compute::patchReady function so that some additional
  // work can be done. Compute::patchReady is called at the end of the function.
  // When submitting atoms as the atom data arrives on the node, this function along
  // with mic_submit_patch_data take care of pushing that data to the devices that
  // require them. However, this function is called by each of the host cores, so
  // some care must be taken when doing this. There are two items that need to be
  // accomplished each timestep. First, the atom data needs to be packed up for
  // transfer to the devices (once per patch). Second, for each device that requires
  // the data, push that atom data to the device (once per patch per device). A lock
  // is used to protect flags that track what portions of this work have been done
  // as each of the host threads are notified that the patches are ready. The flags
  // are mic_atomData_seq for once per patch work and mic_atomData_deviceSeq[] for
  // once per patch per device work (they use changes in the seq value to trigger work).

  // NOTE: The slave-ready mechanism makes use of patchReady to signal the master. The master
  // and associated slaves are all on the same node (same host address space and same MIC), so
  // the slaves can push the data to the MIC device directly (here). However, when the slaves
  // are finished, calls to patchReady with a patchID of -1 will be made (master waits for all
  // patches, slaves only wait for their assigned patches and 'pass on' patchReady notifications
  // to the master so the master gets them all). As such, we need to check for a '-1' here to
  // detect that this is just the slave notifying the master (if so, nothing extra to do here).
  if (patchID >= 0) {

    #if MIC_TRACING != 0
    double atomSubmit_start = CmiWallTimer();
    #endif

    // Get the patch information
    patch_record &pr = master->patchRecords[patchID];
    Patch *p = pr.p;
    CudaAtom *ca = p->getCudaAtomList();
    CompAtom *a = pr.positionBox->open(); pr.x = a;
    CompAtomExt *aExt = p->getCompAtomExtInfo();
    int2 *exclusionsByAtom = master->exclusionsByAtom_ptr;
    int numAtoms = p->getNumAtoms();
    int numAtoms_16 = ((numAtoms + 15) & (~15));
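    // (Rounds the atom count up to the next multiple of 16, e.g. 100 -> 112,
    // keeping per-patch buffers padded to 16-element boundaries.)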

    // Create references to the atomData pointers on the host for this patch (host copy of data on device)
    void* &atomData = p->mic_atomData;
    int &allocSize = p->mic_atomData_allocSize_host;
    // NOTE: Since we are using a mutex (per patch) to protect the following region,
    // only the first thread through will reallocate the memory and copy the patch
    // data in to the host's copy of the atomData buffer (seq number as check).
    // The remaining threads will check the size and see that everything is fine,
    // skipping this work.

    // Note the MIC device that this call wants the device on (the device that the master object drives)
    int micDevice = master->micDevice;
    __ASSERT(micDevice < MIC_MAX_DEVICES_PER_NODE);
    int transferFlag = 0;

    // If either the 'once per patch' or 'once per patch per device' work needs to be done, then grab
    // the lock and do the work
    if (p->mic_atomData_seq != seq || p->mic_atomData_deviceSeq[micDevice] != seq) {
      pthread_mutex_lock(&(p->mic_atomData_mutex));

      // Clear the flags early so other threads are more likely to skip by the checks for the work that
      // will be done by this thread (and the lock). If another thread does pass the check, the lock and
      // flag values will still avoid the work being done multiple times. This is just an optimization.
      int tmp_mic_atomData_seq = p->mic_atomData_seq;
      int tmp_mic_atomData_deviceSeq = p->mic_atomData_deviceSeq[micDevice];
      p->mic_atomData_seq = seq;
      p->mic_atomData_deviceSeq[micDevice] = seq;

      // Once per patch per timestep work
      if (tmp_mic_atomData_seq != seq) { // Check again in case the first check passed while another thread was in the region

        // Allocate the memory as needed
        int allocFlag = 0;
        if (numAtoms_16 > allocSize || atomData == NULL) {
          if (atomData != NULL) { _MM_FREE_WRAPPER(atomData); }
          int toAllocSize = (int)(numAtoms_16 * 1.1f);
          toAllocSize = ((toAllocSize + 15) & (~15));
          atomData = (char*)(_MM_MALLOC_WRAPPER((sizeof(atom) + sizeof(atom_param)) * toAllocSize, MIC_ALIGN, "atomData"));
          __ASSERT(atomData != NULL);
          allocSize = toAllocSize;
          allocFlag = 1;
        }

        // Copy the data to the buffer that will be passed to the device(s)
        // NOTE: the number of atoms will only change when doneMigration is set
        // WARNING | NOTE : The following memcpy assumes the CudaAtom and atom data structures match !!!
        atom* dev_a = (atom*)atomData;
        atom_param* dev_aExt = (atom_param*)(dev_a + numAtoms_16);
        memcpy(dev_a, ca, sizeof(atom) * numAtoms); // atoms always
        if (doneMigration || allocFlag) { // atom_params sometimes
          #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
          for (int k = 0; k < numAtoms; k++) {
            int j = aExt[k].sortOrder;
            dev_aExt[k].vdw_type = a[j].vdwType;
            dev_aExt[k].index = aExt[j].id;
            #ifdef MEM_OPT_VERSION
            dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
            dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
            #else
            dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].id].y;
            dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
            #endif
          }
          #else
          int *dev_aExt_vdwType     = ((int*)dev_aExt) + (0 * numAtoms_16);
          int *dev_aExt_index       = ((int*)dev_aExt) + (1 * numAtoms_16);
          int *dev_aExt_exclIndex   = ((int*)dev_aExt) + (2 * numAtoms_16);
          int *dev_aExt_exclMaxDiff = ((int*)dev_aExt) + (3 * numAtoms_16);
          for (int k = 0; k < numAtoms; k++) {
            int j = aExt[k].sortOrder;
            dev_aExt_vdwType[k] = a[j].vdwType;
            dev_aExt_index[k] = aExt[j].id;
            #ifdef MEM_OPT_VERSION
            dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
            dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
            #else
            dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].id].y;
            dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].id].x;
            #endif
          }
          #endif
        }
      } // end if (mic_atomData_seq != seq)

      // Once per patch per timestep per device work
      // NOTE : Within the protected region, simply flag that the transfer needs to take place
      //   and move on. This will allow transfers to multiple MIC cards to occur in parallel
      //   without the per-patch-lock serializing them.
      if (tmp_mic_atomData_deviceSeq != seq) { transferFlag = 1; }

      pthread_mutex_unlock(&(p->mic_atomData_mutex));
    } // end if (mic_atomData_seq != seq || mic_atomData_deviceSeq[micDevice] != seq)

    // Transfer the data to the given device
    if (transferFlag != 0) {
      int allocBytes = allocSize * (sizeof(atom) + sizeof(atom_param));
      int transferBytes = numAtoms_16 * sizeof(atom);
      if (doneMigration) { transferBytes += numAtoms_16 * sizeof(atom_param); }
      void *signal = NULL;

      #if MIC_TRACING != 0
      double atomTransfer_start = CmiWallTimer();
      #endif

      mic_submit_patch_data(micDevice,
                            p->mic_atomData,
                            p->mic_atomData_prev[micDevice],
                            transferBytes,
                            allocBytes,
                            p->mic_atomData_allocSize_device[micDevice],
                            p->mic_atomData_devicePtr[micDevice],
                            signal
                           );

      #if MIC_TRACING != 0
      double atomTransfer_finish = CmiWallTimer();
      MIC_TRACING_RECORD(MIC_EVENT_ATOMS_TRANSFER, atomTransfer_start, atomTransfer_finish);
      #endif

      if (signal != NULL) { atomSubmitSignals->insert(signal); }
    }

    #if MIC_TRACING != 0
    double atomSubmit_finish = CmiWallTimer();
    MIC_TRACING_RECORD(MIC_EVENT_ATOMS_SUBMIT, atomSubmit_start, atomSubmit_finish);
    #endif

  } // end if (patchID >= 0)

  // Call parent class patchReady function to perform the usual work
  Compute::patchReady(patchID, doneMigration, seq);
}
#endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0

void ComputeNonbondedMIC::skip() {
  //SimParameters *simParams = Node::Object()->simParameters;
  for ( int i=0; i<hostedPatches.size(); ++i ) {
    patch_record &pr = master->patchRecords[hostedPatches[i]];
    pr.positionBox->skip();
    pr.forceBox->skip();
    //if (simParams->GBISOn) {
    //  pr.intRadBox->skip();
    //  pr.psiSumBox->skip();
    //  pr.bornRadBox->skip();
    //  pr.dEdaSumBox->skip();
    //  pr.dHdrPrefixBox->skip();
    //}
  }
}
1470 
1472 
1473  SimParameters *simParams = Node::Object()->simParameters;
1474 
1475  __ASSERT(hostedPatches.size() > 0);
1476  Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
1477 
1478  lattice = flags.lattice;
1479  doSlow = flags.doFullElectrostatics;
1480  doEnergy = flags.doEnergy;
1481  step = flags.step;
1482 
1483  if ( ! flags.doNonbonded ) {
1484  GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1485  if ( master != this ) {
1486  computeMgr->sendNonbondedMICSlaveReady(masterPe,
1488  } else {
1489  for ( int i = 0; i < numSlaves; ++i ) {
1490  computeMgr->sendNonbondedMICSlaveSkip(slaves[i],slavePes[i]);
1491  }
1492  skip();
1493  }
1494  if ( reduction ) reduction->submit();
1495  return 1;
1496  }
1497 
1498  // Wait for pending input buffers here
1499  // DMK - NOTE | TODO : For now this is blocking, but setup polling at some point. May be possible to
1500  // have slaves start polling here with the callback sending the ready message. For the master, perhaps
1501  // the polling could be started at the beginning of doWork() with the callback doing the same thing as
1502  // the offload completion callbacks (trigger another call to doWork() and check state variables to
1503  // figure out what to do).
1504  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1505  {
1506  #if MIC_TRACING != 0
1507  double atomSubmit_wait_start = CmiWallTimer();
1508  #endif
1509 
1510  int micDevice = master->micDevice;
1511 
1512  std::set<void*>::iterator it;
1513  for (it = atomSubmitSignals->begin(); it != atomSubmitSignals->end(); it++) {
1514  void *sig = (*it);
1515  #if 0 // Use blocking offload_wait pragma
1516  #pragma offload_wait target(mic:micDevice) wait(sig)
1517  { }
1518  #else // Use busy wait
1519  while (!_Offload_signaled(master->micDevice, sig)) { }
1520  #endif
1521  }
1522  atomSubmitSignals->clear(); // Remove all pending signals
1523 
1524  #if MIC_TRACING != 0
1525  double atomSubmit_wait_finished = CmiWallTimer();
1526  MIC_TRACING_RECORD(MIC_EVENT_ATOMS_WAIT, atomSubmit_wait_start, atomSubmit_wait_finished);
1527  #endif
1528  }
1529  #endif
1530 
1531  for ( int i=0; i<hostedPatches.size(); ++i ) {
1532  patch_record &pr = master->patchRecords[hostedPatches[i]];
1533  if (!simParams->GBISOn || gbisPhase == 1) {
1534  GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1535  // DMK - NOTE : When patches are pushed to the device individually, open called in patchReady function
1536  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
1537  pr.x = pr.positionBox->open();
1538  #endif
1539  pr.xExt = pr.p->getCompAtomExtInfo();
1540  }
1541 
1542  //if (simParams->GBISOn) {
1543  // if (gbisPhase == 1) {
1544  // GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1545  // pr.intRad = pr.intRadBox->open();
1546  // pr.psiSum = pr.psiSumBox->open();
1547  // } else if (gbisPhase == 2) {
1548  // GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1549  // pr.bornRad = pr.bornRadBox->open();
1550  // pr.dEdaSum = pr.dEdaSumBox->open();
1551  // } else if (gbisPhase == 3) {
1552  // GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1553  // pr.dHdrPrefix = pr.dHdrPrefixBox->open();
1554  // }
1555  // GBISP("opened GBIS boxes");
1556  //}
1557  }
1558 
1559  if ( master == this ) return 0; //work to do, enqueue as usual
1560 
1561  // message masterPe
1562  computeMgr->sendNonbondedMICSlaveReady(masterPe,
1563  hostedPatches.size(),
1564  atomsChanged,
1565  sequence()
1566  );
1567 
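  // NOTE: workStarted appears to encode which kernels remain outstanding:
  // 1 while a separate remote kernel is still in flight, 2 once only the
  // local kernel remains (finishWork() returns early while workStarted == 1).
  // With a single combined kernel there is no separate remote pass, so the
  // state jumps straight to 2.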
1568  workStarted = ((singleKernelFlag != 0) ? (2) : (1));
1569  basePriority = COMPUTE_PROXY_PRIORITY;
1570 
1571  return 1;
1572 }
1573 
1574 void ComputeNonbondedMIC::doWork() {
1575 
1576  GBISP("C.N.MIC[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
1577  CkMyPe(), sequence(), gbisPhase, workStarted);
1578 
1579  if ( workStarted ) { //if work already started, check if finished
1580  if ( finishWork() ) { // finished
1581  workStarted = 0;
1582  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
1583 
1584  // DMK - DEBUG
1585  timestep++;
1586 
1587  } else { // need to call again
1588 
1589  workStarted = 2;
1590  basePriority = PROXY_RESULTS_PRIORITY; // lower for local
1591  if ( master == this && kernel_launch_state > 2 ) {
1592  //#if MIC_TRACING != 0
1593  // mic_tracing_polling_start = CmiWallTimer();
1594  // mic_tracing_polling_count = 0;
1595  //#endif
1596  mic_check_local_progress(this,0.); // launches polling
1597  }
1598  }
1599  return;
1600  }
1601 
1602  //#if MIC_TRACING != 0
1603  // double doWork_start = CmiWallTimer();
1604  //#endif
1605 
1606  workStarted = ((singleKernelFlag != 0) ? (2) : (1));
1607  basePriority = COMPUTE_PROXY_PRIORITY;
1608 
1609  Molecule *mol = Node::Object()->molecule;
1610  Parameters *params = Node::Object()->parameters;
1611  SimParameters *simParams = Node::Object()->simParameters;
1612 
1613  //execute only during GBIS phase 1, or if not using GBIS
1614  if (!simParams->GBISOn || gbisPhase == 1) {
1615 
1616  // bind new patches to device
1617  if ( atomsChanged || computesChanged ) {
1618 
1619  int npatches = activePatches.size();
1620 
1621  pairlistsValid = 0;
1622  pairlistTolerance = 0.;
1623 
1624  if ( computesChanged ) {
1625 
1626  computesChanged = 0;
1627 
1628  // Merge the local and remote active patch lists into a single list
1629  num_local_patch_records = localActivePatches.size();
1630  num_remote_patch_records = remoteActivePatches.size();
1631  npatches = num_local_patch_records + num_remote_patch_records;
1632  activePatches.resize(npatches);
1633  int *ap = activePatches.begin();
1634  for ( int i=0; i<num_local_patch_records; ++i ) {
1635  *(ap++) = localActivePatches[i];
1636  }
1637  for ( int i=0; i<num_remote_patch_records; ++i ) {
1638  *(ap++) = remoteActivePatches[i];
1639  }
1640 
1641  // sort computes by distance between patches
1642  #if MIC_SORT_COMPUTES != 0
1643  cr_sortop so(lattice);
1644  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
1645  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
1646  #endif
1647 
1648  // Merge the sorted lists of local and remote computes into a single list of computes
1649  num_local_compute_records = localComputeRecords.size();
1650  num_remote_compute_records = remoteComputeRecords.size();
1651  computeRecords.resize(num_local_compute_records + num_remote_compute_records);
1652  compute_record *cr = computeRecords.begin();
1653  for ( int i=0; i<num_local_compute_records; ++i ) {
1654  *(cr++) = localComputeRecords[i];
1655  }
1656  for ( int i=0; i<num_remote_compute_records; ++i ) {
1657  *(cr++) = remoteComputeRecords[i];
1658  }
1659 
1660  // Set each patch record's localIndex and initialize force_list_size for each force_list
1661  force_lists.resize(npatches);
1662  for ( int i=0; i<npatches; ++i ) {
1663  patchRecords[activePatches[i]].localIndex = i;
1664  force_lists[i].force_list_size = 0;
1665  }
1666 
1667  // For the patch_pairs, one for each compute object...
1668  int ncomputes = computeRecords.size();
1669  patch_pairs.resize(ncomputes);
1670  for ( int i=0; i<ncomputes; ++i ) {
1671 
1672  compute_record &cr = computeRecords[i];
1673  int lp1 = patchRecords[cr.pid[0]].localIndex;
1674  int lp2 = patchRecords[cr.pid[1]].localIndex;
1675 
1676  // Count the number of writers/force-contributers
1677  force_lists[lp1].force_list_size++;
1678  if (!cr.isSelf) {
1679  force_lists[lp2].force_list_size++;
1680  }
1681 
1682  // Initialize the offset
1683  patch_pair &pp = patch_pairs[i];
1684  pp.offset.x = cr.offset.x;
1685  pp.offset.y = cr.offset.y;
1686  pp.offset.z = cr.offset.z;
1687 
1688  // Place in the patch object's atomData (device) pointer
1689  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1690  pp.patch1_atomDataPtr = patchRecords[cr.pid[0]].p->mic_atomData_devicePtr[myDevice];
1691  pp.patch2_atomDataPtr = patchRecords[cr.pid[1]].p->mic_atomData_devicePtr[myDevice];
1692  #endif
1693  }
1694 
1695  // Set the force list information for each patch pair
1696  for ( int i=0; i<ncomputes; ++i ) {
1697 
1698  compute_record &cr = computeRecords[i];
1699  patch_pair &pp = patch_pairs[i];
1700 
1701  int lp1 = patchRecords[cr.pid[0]].localIndex;
1702  pp.patch1_force_list_index = lp1;
1703  pp.patch1_force_list_size = force_lists[lp1].force_list_size;
1704 
1705  if (cr.isSelf) {
1706  pp.patch2_force_list_index = pp.patch1_force_list_index;
1707  pp.patch2_force_list_size = pp.patch1_force_list_size;
1708  } else {
1709  int lp2 = patchRecords[cr.pid[1]].localIndex;
1710  pp.patch2_force_list_index = lp2;
1711  pp.patch2_force_list_size = force_lists[lp2].force_list_size;
1712  }
1713  }
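  // NOTE: At this point force_list_size holds a simple tally of how many
  // computes contribute forces to each patch. It is zeroed again further
  // below ("rebuild below") and reused as a running cursor that assigns each
  // compute a private slot in its patch's force buffer -- the usual
  // count-then-assign-offsets pattern:
  //   pass 1: count[p]++;                                // tally writers per patch
  //   pass 2: slot = base[p] + stride[p] * count[p]++;   // hand out slots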
1714 
1715  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
1716  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1717  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
1718  localComputeRecords.size(), remoteComputeRecords.size()
1719  );
1720 
1721  } // computesChanged
1722 
1723  // Count the number of atoms (and non-fixed atoms), recording the accumulated
1724  // values into the patch record and force list structures as we go.
1725  int istart = 0;
1726  int flstart = 0;
1727  int vlstart = 0;
1728  int max_atoms_per_patch = 0;
1729  int i;
1730  for (i = 0; i < npatches; i++) {
1731 
1732  // If we have walked off the end of the local list, then take
1733  // note of the number of atoms so far (the local atom count).
1734  if ( i == localActivePatches.size() ) {
1735  num_local_atom_records = istart;
1736  }
1737 
1738  // Record the current offsets as the offsets at the
1739  // beginning of this force list.
1740  force_lists[i].force_list_start = flstart;
1741  force_lists[i].force_output_start = istart;
1742  force_lists[i].atom_start = istart;
1743 
1744  // Record the current offset as the patch record's starting offset.
1745  patch_record &pr = patchRecords[activePatches[i]];
1746  pr.localStart = istart;
1747 
1748  // Count number of atoms (and non-fixed atoms), recording the actual
1749  // counts to the patch record and then rounding up for alignment
1750  int natoms = pr.p->getNumAtoms();
1751  int nfreeatoms = natoms; // DMK - TODO | FIXME : treat all atoms as free for now (handling fixed atoms would save some work when not all are free and give smaller pairlists during pairlist generation)
1752  if ( fixedAtomsOn ) {
1753  const CompAtomExt *aExt = pr.xExt;
1754  for ( int j=0; j<natoms; ++j ) {
1755  if ( aExt[j].atomFixed ) --nfreeatoms;
1756  }
1757  }
1758  if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
1759  pr.numAtoms = natoms;
1760  pr.numFreeAtoms = nfreeatoms;
1761  force_lists[i].patch_size = natoms; // DMK - nfreeatoms;
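  // NOTE: The rounding below pads each patch's atom counts up to a multiple
  // of 16 ((n + 15) & ~15 maps 1..16 -> 16, 17..32 -> 32, ...), presumably so
  // that each patch's data stays 64-byte aligned for the MIC's 512-bit
  // vector unit (16 floats or ints per vector register).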
1762  natoms = (natoms + 15) & (~15);
1763  nfreeatoms = (nfreeatoms + 15) & (~15);
1764  force_lists[i].patch_stride = natoms; // DMK - nfreeatoms;
1765 
1766  // DMK - CHECK - Rollover check
1767  int _flstart = flstart;
1768  int _vlstart = vlstart;
1769 
1770  // Update the offsets by the atom counts for this patch record.
1771  flstart += natoms * force_lists[i].force_list_size; // DMK - nfreeatoms * force_lists[i].force_list_size;
1772  vlstart += 16 * force_lists[i].force_list_size;
1773  istart += natoms; // already rounded up
1774 
1775  // DMK - CHECK - Rollover check
1776  __ASSERT(_flstart <= flstart && _vlstart <= vlstart);
1777 
1778  force_lists[i].force_list_size = 0; // rebuild below
1779  }
1780  // Handle the case where all are local for recording local atom count.
1781  if ( i == localActivePatches.size() ) {
1782  num_local_atom_records = istart;
1783  }
1784 
1785  // Record final offsets (i.e. lengths/counts).
1786  num_force_records = flstart;
1787  num_atom_records = istart;
1789 
1790  // Allocate the memory for the atom and force information
1791  if ( num_atom_records > num_atom_records_allocated ) {
1792  if ( num_atom_records_allocated ) {
1793  _MM_FREE_WRAPPER(atom_params); //delete [] atom_params;
1794  _MM_FREE_WRAPPER(atoms); //delete [] atoms;
1795  _MM_FREE_WRAPPER(forces); //delete [] forces;
1796  _MM_FREE_WRAPPER(slow_forces); //delete [] slow_forces;
1797  //if (simParams->GBISOn) {
1798  // delete [] intRad0H;//6 GBIS arrays
1799  // delete [] intRadSH;
1800  // delete [] psiSumH;
1801  // delete [] bornRadH;
1802  // delete [] dEdaSumH;
1803  // delete [] dHdrPrefixH;
1804  //}
1805  }
1806  num_atom_records_allocated = 1.1 * num_atom_records + 1;
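  // NOTE: Allocating ~10% more records than currently needed (plus one)
  // leaves headroom so that small fluctuations in atom counts across
  // migrations do not trigger a free/reallocate cycle on every rebuild.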
1807  atom_params = (atom_param*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(atom_param), 64, "atom_params"); //new atom_param[num_atom_records_allocated];
1808  atoms = (atom*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(atom), 64, "atoms"); //new atom[num_atom_records_allocated];
1809  if (atom_params == NULL || atoms == NULL) { NAMD_die("Unable to allocate atoms in ComputeNonbondedMIC::doWork"); }
1810  forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(double4), 64, "forces"); //new double4[num_atom_records_allocated];
1811  slow_forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(double4), 64, "slow_forces"); //new double4[num_atom_records_allocated];
1812  //allocate GBIS memory
1813  //if (simParams->GBISOn) {
1814  // intRad0H = new float[num_atom_records_allocated];
1815  // intRadSH = new float[num_atom_records_allocated];
1816  // psiSumH = new GBReal[num_atom_records_allocated];
1817  // bornRadH = new float[num_atom_records_allocated];
1818  // dEdaSumH = new GBReal[num_atom_records_allocated];
1819  // dHdrPrefixH = new float[num_atom_records_allocated];
1820  //}
1821  if (forces == NULL || slow_forces == NULL || (simParams->GBISOn &&
1822  (intRad0H == NULL || intRadSH == NULL || psiSumH == NULL ||
1823  bornRadH == NULL || dEdaSumH == NULL || dHdrPrefixH == NULL))) {
1824  NAMD_die("Unable to allocate forces in ComputeNonbondedMIC::doWork");
1825  }
1826  }
1827 
1828  // DMK - NOTE - Continue filling in the patch pair records
1829  int bfstart = 0;
1830  int ncomputes = computeRecords.size();
1831  for ( int i=0; i<ncomputes; ++i ) {
1832 
1833  compute_record &cr = computeRecords[i];
1834 
1835  int p1 = cr.pid[0];
1836  int p2 = cr.pid[1];
1837  int lp1 = patchRecords[p1].localIndex;
1838  int lp2 = patchRecords[p2].localIndex;
1839  patch_pair &pp = patch_pairs[i];
1840  pp.patch1_atom_start = patchRecords[p1].localStart;
1841  pp.patch1_force_start = force_lists[lp1].force_list_start
1842  + (force_lists[lp1].patch_stride * force_lists[lp1].force_list_size);
1843  force_lists[lp1].force_list_size++;
1844  pp.patch1_size = patchRecords[p1].numAtoms;
1845  pp.patch1_force_size = patchRecords[p1].numAtoms;
1846 
1847  if (cr.isSelf) {
1848  pp.patch2_atom_start = pp.patch1_atom_start;
1849  pp.patch2_force_start = pp.patch1_force_start;
1850  pp.patch2_size = pp.patch1_size;
1851  pp.patch2_force_size = pp.patch1_force_size;
1852  } else {
1853  pp.patch2_atom_start = patchRecords[p2].localStart;
1854  pp.patch2_force_start = force_lists[lp2].force_list_start
1855  + (force_lists[lp2].patch_stride * force_lists[lp2].force_list_size);
1856  force_lists[lp2].force_list_size++;
1857  pp.patch2_size = patchRecords[p2].numAtoms;
1858  pp.patch2_force_size = patchRecords[p2].numAtoms;
1859  }
1860 
1861  // Get a custom pairlist cutoff for each patch pair
1862  pp.plcutoff = ComputeNonbondedUtil::cutoff +
1863  patchRecords[p1].p->flags.pairlistTolerance +
1864  patchRecords[p2].p->flags.pairlistTolerance;
1865 
1866  // Record the parts
1867  pp.numParts = cr.numParts;
1868  pp.part = cr.part;
1869 
1870  // Record the patch centers
1871  Vector p1_center = micCompute->patchMap->center(p1);
1872  pp.patch1_center_x = p1_center.x;
1873  pp.patch1_center_y = p1_center.y;
1874  pp.patch1_center_z = p1_center.z;
1875  Vector p2_center = micCompute->patchMap->center(p2);
1876  pp.patch2_center_x = p2_center.x;
1877  pp.patch2_center_y = p2_center.y;
1878  pp.patch2_center_z = p2_center.z;
1879 
1880  // DMK - DEBUG
1881  pp.p1 = p1;
1882  pp.p2 = p2;
1883  pp.cid = cr.c;
1884 
1885  pp.block_flags_start = bfstart;
1886  bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
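  // NOTE: One 32-int block of flags is reserved per 128 atoms of patch 1:
  // ((n + 127) >> 7) is ceil(n / 128) and the << 5 multiplies by 32.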
1887 
1888  } // for ncomputes
1889 
1890  #if 0
1891  CkPrintf("Pe %d mic_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
1892  patch_pairs.size(), force_lists.size(),
1894  max_atoms_per_patch);
1895  #endif
1896 
1897  int totalmem = patch_pairs.size() * sizeof(patch_pair) +
1898  force_lists.size() * sizeof(force_list) +
1899  num_force_records * sizeof(double4) +
1900  num_atom_records * sizeof(atom) +
1901  num_atom_records * sizeof(atom_param) +
1902  num_atom_records * sizeof(double4);
1903  int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(double4) );
1904  /*
1905  CkPrintf("Pe %d allocating %d MB of MIC memory, will copy %d kB per step\n",
1906  CkMyPe(), totalmem >> 20, totalcopy >> 10);
1907  */
1908 
1909  // Push the data structures just created, such as patch pairs, to the device.
1910  mic_bind_patch_pairs_only(myDevice, patch_pairs.begin(), patch_pairs.size(), patch_pairs.bufSize());
1911  mic_bind_force_lists_only(myDevice, force_lists.begin(), force_lists.size(), force_lists.bufSize());
1912  mic_bind_atoms_only(myDevice, atoms, atom_params, forces, slow_forces, num_atom_records, num_atom_records_allocated);
1913  mic_bind_force_buffers_only(myDevice, (size_t)num_force_records);
1914 
1915  } // atomsChanged || computesChanged
1916 
1917  double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
1918 
1919  __ASSERT(hostedPatches.size() > 0);
1920  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1921  float maxAtomMovement = 0.;
1922  float maxPatchTolerance = 0.;
1923 
1924  for ( int i=0; i<activePatches.size(); ++i ) {
1925 
1926  patch_record &pr = patchRecords[activePatches[i]];
1927 
1928  float maxMove = pr.p->flags.maxAtomMovement;
1929  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1930 
1931  float maxTol = pr.p->flags.pairlistTolerance;
1932  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1933 
1934  int start = pr.localStart;
1935  int n = pr.numAtoms;
1936  int n_16 = (n + 15) & (~15);
1937  const CompAtom *a = pr.x;
1938  const CompAtomExt *aExt = pr.xExt;
1939 
1940  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
1941 
1942  if ( atomsChanged ) {
1943 
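  // NOTE: Depending on MIC_HANDCODE_FORCE_SOA_VS_AOS, atom_params is filled
  // either as an array of structs (AoS) or as four contiguous integer planes
  // of length n_16 each (vdw_type, index, excl_index, excl_maxdiff) -- a
  // structure-of-arrays layout, presumably so the device kernel can use
  // unit-stride vector loads.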
1944  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1945  atom_param *ap = atom_params + start;
1946  for (int k = 0; k < n; k++) {
1947  int j = aExt[k].sortOrder;
1948  ap[k].vdw_type = a[j].vdwType;
1949  ap[k].index = aExt[j].id;
1950  #ifdef MEM_OPT_VERSION
1951  ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1952  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1953  #else
1954  ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
1955  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
1956  #endif
1957  }
1958  #else
1959  atom_param *ap = atom_params + start;
1960  int *ap_vdwType = ((int*)ap) + (0 * n_16);
1961  int *ap_index = ((int*)ap) + (1 * n_16);
1962  int *ap_exclIndex = ((int*)ap) + (2 * n_16);
1963  int *ap_exclMaxDiff = ((int*)ap) + (3 * n_16);
1964  for ( int k=0; k<n; ++k ) {
1965  int j = aExt[k].sortOrder;
1966  ap_vdwType[k] = a[j].vdwType;
1967  ap_index[k] = aExt[j].id;
1968  #ifdef MEM_OPT_VERSION
1969  ap_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
1970  ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
1971  #else
1972  ap_exclIndex[k] = exclusionsByAtom[aExt[j].id].y;
1973  ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].id].x;
1974  #endif
1975  }
1976  #endif
1977 
1978  } // end if (atomsChanged)
1979 
1980  { // start block
1981  const CudaAtom *ac = pr.p->getCudaAtomList();
1982  atom *ap = atoms + start;
1983  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1984  memcpy(ap, ac, sizeof(atom)*n);
1985  #else
1986  memcpy(ap, ac, sizeof(atom)*n_16);
1987  #endif
1988 
1989  } // end block
1990 
1991  #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
1992 
1993  } // end for (i < activePatches.size())
1994 
1995  //GBISP("finished active patches\n")
1996 
1997  //CkPrintf("maxMovement = %f maxTolerance = %f save = %d use = %d\n",
1998  // maxAtomMovement, maxPatchTolerance,
1999  // flags.savePairlists, flags.usePairlists);
2000 
2001  savePairlists = 0;
2002  usePairlists = 0;
2003  if ( flags.savePairlists ) {
2004  savePairlists = 1;
2005  usePairlists = 1;
2006  } else if ( flags.usePairlists ) {
2007  if ( ! pairlistsValid ||
2008  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
2009  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
2010  } else {
2011  usePairlists = 1;
2012  }
2013  }
2014  if ( ! usePairlists ) {
2015  pairlistsValid = 0;
2016  }
2017  float plcutoff = cutoff;
2018  if ( savePairlists ) {
2019  pairlistsValid = 1;
2020  pairlistTolerance = 2. * maxPatchTolerance;
2021  plcutoff += pairlistTolerance;
2022  }
2023  plcutoff2 = plcutoff * plcutoff;
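  // NOTE: Standard pairlist-tolerance scheme: pairlists are built with an
  // inflated cutoff (cutoff + 2 * maxPatchTolerance) and remain valid on
  // later steps as long as 2 * maxAtomMovement stays within that slack;
  // otherwise usePairlists stays 0 (counted above as a pairlist warning)
  // and the step runs without trusting the saved lists.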
2024 
2025  //CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
2026  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
2027 
2028  } // !GBISOn || gbisPhase == 1
2029 
2031  //if (simParams->GBISOn) {
2032  // //open GBIS boxes depending on phase
2033  // for ( int i=0; i<activePatches.size(); ++i ) {
2034  // patch_record &pr = master->patchRecords[activePatches[i]];
2035  // GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
2036  // if (gbisPhase == 1) {
2037  // //Copy GBIS intRadius to Host
2038  // if (atomsChanged) {
2039  // float *intRad0 = intRad0H + pr.localStart;
2040  // float *intRadS = intRadSH + pr.localStart;
2041  // for ( int k=0; k<pr.numAtoms; ++k ) {
2042  // int j = pr.xExt[k].sortOrder;
2043  // intRad0[k] = pr.intRad[2*j+0];
2044  // intRadS[k] = pr.intRad[2*j+1];
2045  // }
2046  // }
2047  // } else if (gbisPhase == 2) {
2048  // float *bornRad = bornRadH + pr.localStart;
2049  // for ( int k=0; k<pr.numAtoms; ++k ) {
2050  // int j = pr.xExt[k].sortOrder;
2051  // bornRad[k] = pr.bornRad[j];
2052  // }
2053  // } else if (gbisPhase == 3) {
2054  // float *dHdrPrefix = dHdrPrefixH + pr.localStart;
2055  // for ( int k=0; k<pr.numAtoms; ++k ) {
2056  // int j = pr.xExt[k].sortOrder;
2057  // dHdrPrefix[k] = pr.dHdrPrefix[j];
2058  // }
2059  // } // end phases
2060  // } // end for patches
2061  //} // if GBISOn
2062 
2063  //#if MIC_TRACING != 0
2064  // MIC_TRACING_RECORD(MIC_EVENT_FUNC_DOWORK, doWork_start, CmiWallTimer());
2065  //#endif
2066 
2067  kernel_time = CkWallTimer();
2068  kernel_launch_state = ((singleKernelFlag != 0) ? (2) : (1));
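  // NOTE: kernel_launch_state drives the switch in recvYieldDevice():
  // state 1 issues the remote kernel and falls through to the local one,
  // state 2 issues only the local (or single combined) kernel, and any
  // higher state simply reclaims the device (mic_is_mine = 1).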
2069  if ( mic_is_mine ) recvYieldDevice(-1);
2070 }
2071 
2072 void mic_check_remote_calc(void *arg, double) {
2073  if (mic_check_remote_kernel_complete(myDevice)) {
2074  computeMgr->sendYieldDevice(next_pe_sharing_mic);
2075  } else {
2076  MIC_POLL(mic_check_remote_calc, arg);
2077  }
2078 }
2079 
2080 void mic_check_local_calc(void *arg, double) {
2081  if (mic_check_local_kernel_complete(myDevice)) {
2082  computeMgr->sendYieldDevice(next_pe_sharing_mic);
2083  } else {
2084  MIC_POLL(mic_check_local_calc, arg);
2085  }
2086 }
2087 
2088 void ComputeNonbondedMIC::recvYieldDevice(int pe) {
2089 
2090  GBISP("C.N.MIC[%d]::recvYieldDevice: seq %d, workStarted %d, \
2091  gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
2092  workStarted, gbisPhase, kernel_launch_state, pe)
2093 
2094  mic_position3_t lata, latb, latc;
2095  lata.x = lattice.a().x;
2096  lata.y = lattice.a().y;
2097  lata.z = lattice.a().z;
2098  latb.x = lattice.b().x;
2099  latb.y = lattice.b().y;
2100  latb.z = lattice.b().z;
2101  latc.x = lattice.c().x;
2102  latc.y = lattice.c().y;
2103  latc.z = lattice.c().z;
2104  SimParameters *simParams = Node::Object()->simParameters;
2105 
2106  switch ( kernel_launch_state ) {
2107 
2109  // Remote
2110  case 1:
2111 
2112  GBISP("C.N.MIC[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
2113  ++kernel_launch_state;
2114  mic_is_mine = 0;
2115  remote_submit_time = CkWallTimer();
2116 
2117  if (!simParams->GBISOn || gbisPhase == 1) {
2118 
2119  // DMK - NOTE : GBIS is not supported in this port yet, so cause a runtime error
2120  // if it has been enabled and execution has made it this far.
2121  if (simParams->GBISOn) {
2122  NAMD_die("Unsupported feature (DMK33949330)");
2123  }
2124 
2125  //#if MIC_TRACING != 0
2126  // mic_tracing_offload_start_remote = CmiWallTimer();
2127  //#endif
2128 
2129  // Issue the remote kernel
2130  mic_nonbonded_forces(myDevice, 1,
2131  num_local_atom_records,
2132  num_local_compute_records,
2133  num_local_patch_records,
2134  lata, latb, latc,
2135  doSlow, doEnergy,
2137  atomsChanged
2138  );
2139  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2140  mic_first_kernel_submit_time = CmiWallTimer();
2141  #endif
2142 
2143  // Start the polling check for the completion of the remote kernel
2144  //#if MIC_TRACING != 0
2145  // mic_tracing_polling_start = CmiWallTimer();
2146  // mic_tracing_polling_count = 0;
2147  //#endif
2148  MIC_POLL(mic_check_remote_progress, this);
2149  }
2150 
2151  // NOTE : Fall through to next case (i.e. break not missing)
2152 
2154  // Local
2155  case 2:
2156 
2157  GBISP("C.N.MIC[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
2158  ++kernel_launch_state;
2159  mic_is_mine = 0;
2160 
2161  if (!simParams->GBISOn || gbisPhase == 1) {
2162 
2163  // DMK - NOTE : GBIS is not supported in this port yet, so cause a runtime error
2164  // if it has been enabled and execution has made it this far.
2165  if (simParams->GBISOn) {
2166  NAMD_die("Unsupported feature (DMK83620583)");
2167  }
2168 
2169  // DMK - TIMING - NOTE : Only local being submitted for now, so
2170  // take the local time now
2171  local_submit_time = CkWallTimer();
2172 
2173  //#if MIC_TRACING != 0
2174  // mic_tracing_offload_start_local = CmiWallTimer();
2175  //#endif
2176 
2177  // Issue the local kernel
2178  mic_nonbonded_forces(myDevice, 0,
2179  num_local_atom_records,
2180  num_local_compute_records,
2181  num_local_patch_records,
2182  lata, latb, latc,
2183  doSlow, doEnergy,
2185  atomsChanged
2186  );
2187 
2188  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2189  if (singleKernelFlag) { mic_first_kernel_submit_time = CmiWallTimer(); }
2190  #endif
2191 
2192  if ( workStarted == 2 ) {
2193  // Start the polling check for the completion of the local kernel
2194  //#if MIC_TRACING != 0
2195  // mic_tracing_polling_start = CmiWallTimer();
2196  // mic_tracing_polling_count = 0;
2197  //#endif
2198  MIC_POLL(mic_check_local_progress, this);
2199  }
2200 
2201  } // end if (!simParams->GBISOn || gbisPhase == 1)
2202 
2203  default:
2204 
2205  GBISP("C.N.MIC[%d]::recvYieldDevice: case default\n", CkMyPe())
2206  mic_is_mine = 1;
2207  break;
2208 
2209  } // switch
2210 
2211  GBISP("C.N.MIC[%d]::recvYieldDevice: DONE\n", CkMyPe())
2212 }
2213 
2214 
2215 int ComputeNonbondedMIC::finishWork() {
2216 
2217  if ( master == this ) {
2218  for ( int i = 0; i < numSlaves; ++i ) {
2219  computeMgr->sendNonbondedMICSlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
2220  }
2221  }
2222 
2223  //#if MIC_TRACING != 0
2224  // double finishWork_start = CmiWallTimer();
2225  //#endif
2226 
2227  GBISP("C.N.MIC[%d]::fnWork: workStarted %d, phase %d\n", \
2228  CkMyPe(), workStarted, gbisPhase)
2229 
2230  Molecule *mol = Node::Object()->molecule;
2231  SimParameters *simParams = Node::Object()->simParameters;
2232 
2233  ResizeArray<int> &patches( master == this ? activePatches : hostedPatches );
2234  mic_kernel_data * host__local_kernel_data = host__kernel_data;
2235  mic_kernel_data * host__remote_kernel_data = host__kernel_data + 1;
2236  mic_kernel_data* &kernel_data = (workStarted == 1) ? (host__remote_kernel_data) : (host__local_kernel_data);
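  // NOTE: host__kernel_data appears to hold two slots per device: index 0
  // for the local kernel's results and index 1 for the remote kernel's, so
  // the remote slot is read while workStarted == 1 and the local slot on
  // the final (workStarted == 2) pass.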
2237 
2238  // DMK - NOTE : Open the force boxes for the patches so forces can be contributed
2239  if ( !simParams->GBISOn || gbisPhase == 1 ) {
2240  for ( int i=0; i<patches.size(); ++i ) {
2241  patch_record &pr = master->patchRecords[patches[i]];
2242  GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
2243  pr.r = pr.forceBox->open();
2244  }
2245  } // !GBISOn || gbisPhase==1
2246 
2247  // DMK - NOTE : For each patch...
2248  for ( int i=0; i<patches.size(); ++i ) {
2249 
2250  // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
2251  patch_record &pr = master->patchRecords[patches[i]];
2252  int start = pr.localStart;
2253  const CompAtomExt *aExt = pr.xExt;
2254 
2255  // DMK - NOTE : If this is the end of the timestep for this compute
2256  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2257 
2258  // DMK - NOTE : Contribute the calculated forces and slow_forces from
2259  // this compute to the patch.
2260  //int nfree = pr.numFreeAtoms;
2261  int nAtoms = pr.numAtoms;
2262  int nAtoms_16 = (nAtoms + 15) & (~15);
2263  pr.f = pr.r->f[Results::nbond];
2264  Force *f = pr.f;
2265  Force *f_slow = pr.r->f[Results::slow];
2266  const CompAtom *a = pr.x;
2267  const CompAtomExt *aExt = pr.xExt;
2268  // int *ao = atom_order + start;
2269  //float4 *af = master->forces + start;
2270  //float4 *af_slow = master->slow_forces + start;
2271 
2272  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
2273 
2274  double4 *af = master->forces + start;
2275  for (int k = 0; k < nAtoms; k++) {
2276  int j = aExt[k].sortOrder;
2277  f[j].x += af[k].x;
2278  f[j].y += af[k].y;
2279  f[j].z += af[k].z;
2280  }
2281  if (doSlow) {
2282  double4 *af_slow = master->slow_forces + start;
2283  for (int k = 0; k < nAtoms; k++) {
2284  int j = aExt[k].sortOrder;
2285  f_slow[j].x += af_slow[k].x;
2286  f_slow[j].y += af_slow[k].y;
2287  f_slow[j].z += af_slow[k].z;
2288  }
2289  }
2290 
2291  #else
2292 
2293  double4 *af = master->forces + start;
2294  double4 *af_slow = master->slow_forces + start;
2295  double *af_x = ((double*)af) + (0 * nAtoms_16);
2296  double *af_y = ((double*)af) + (1 * nAtoms_16);
2297  double *af_z = ((double*)af) + (2 * nAtoms_16);
2298  double *af_w = ((double*)af) + (3 * nAtoms_16);
2299  double *af_slow_x = ((double*)af_slow) + (0 * nAtoms_16);
2300  double *af_slow_y = ((double*)af_slow) + (1 * nAtoms_16);
2301  double *af_slow_z = ((double*)af_slow) + (2 * nAtoms_16);
2302  double *af_slow_w = ((double*)af_slow) + (3 * nAtoms_16);
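  // NOTE: In the SoA layout the device returns forces as four contiguous
  // double planes (x, y, z, w), each nAtoms_16 entries long, rather than as
  // packed double4 records, so the gather loop below reads each component
  // with unit stride.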
2303 
2304  for (int k = 0; k < nAtoms; k++) {
2305  int j = aExt[k].sortOrder;
2306  f[j].x += af_x[k];
2307  f[j].y += af_y[k];
2308  f[j].z += af_z[k];
2309  if (doSlow) {
2310  f_slow[j].x += af_slow_x[k];
2311  f_slow[j].y += af_slow_y[k];
2312  f_slow[j].z += af_slow_z[k];
2313  }
2314  }
2315 
2316  #endif
2317 
2318  } // !GBISOn || gbisPhase == 3
2319 
2320  #if 0
2321  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
2322  CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
2323  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
2324  af[j].w);
2325  }
2326  #endif
2327 
2329  //if (simParams->GBISOn) {
2330  //
2331  // if (gbisPhase == 1) {
2332  //
2333  // //Copy dEdaSum from Host to Patch Box
2334  // GBReal *psiSumMaster = master->psiSumH + start;
2335  // for ( int k=0; k<pr.numAtoms; ++k ) {
2336  // int j = aExt[k].sortOrder;
2337  // pr.psiSum[j] += psiSumMaster[k];
2338  // }
2339  // GBISP("C.N.MIC[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
2340  // pr.psiSumBox->close(&(pr.psiSum));
2341  //
2342  // } else if (gbisPhase == 2) {
2343  //
2344  // //Copy dEdaSum from Host to Patch Box
2345  // GBReal *dEdaSumMaster = master->dEdaSumH + start;
2346  // for ( int k=0; k<pr.numAtoms; ++k ) {
2347  // int j = aExt[k].sortOrder;
2348  // pr.dEdaSum[j] += dEdaSumMaster[k];
2349  // }
2350  // GBISP("C.N.MIC[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
2351  // pr.dEdaSumBox->close(&(pr.dEdaSum));
2352  //
2353  // } else if (gbisPhase == 3) {
2354  //
2355  // GBISP("C.N.MIC[%d]::fnWork: P3 all.close()\n", CkMyPe());
2356  // pr.intRadBox->close(&(pr.intRad)); //box 6
2357  // pr.bornRadBox->close(&(pr.bornRad)); //box 7
2358  // pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
2359  // pr.positionBox->close(&(pr.x)); //box 0
2360  // pr.forceBox->close(&(pr.r));
2361  //
2362  // } //end phases
2363  //
2364  //} else { //not GBIS
2365  //
2366  // GBISP("C.N.MIC[%d]::fnWork: pos/force.close()\n", CkMyPe());
2367 
2369  //double _start = CmiWallTimer();
2370 
2371  pr.positionBox->close(&(pr.x));
2372  pr.forceBox->close(&(pr.r));
2373 
2375  //traceUserBracketEvent(11050, _start, CmiWallTimer());
2376 
2377  //}
2378 
2379  } // end for (i<patches.size())
2380 
2381  // DMK - NOTE : Contribute virial values
2382  if ( master == this && (!simParams->GBISOn || gbisPhase == 3) && workStarted == 2 ) {
2383 
2384  double virial_xx = host__local_kernel_data->virial_xx;
2385  double virial_xy = host__local_kernel_data->virial_xy;
2386  double virial_xz = host__local_kernel_data->virial_xz;
2387  double virial_yy = host__local_kernel_data->virial_yy;
2388  double virial_yz = host__local_kernel_data->virial_yz;
2389  double virial_zz = host__local_kernel_data->virial_zz;
2390  double fullElectVirial_xx = host__local_kernel_data->fullElectVirial_xx;
2391  double fullElectVirial_xy = host__local_kernel_data->fullElectVirial_xy;
2392  double fullElectVirial_xz = host__local_kernel_data->fullElectVirial_xz;
2393  double fullElectVirial_yy = host__local_kernel_data->fullElectVirial_yy;
2394  double fullElectVirial_yz = host__local_kernel_data->fullElectVirial_yz;
2395  double fullElectVirial_zz = host__local_kernel_data->fullElectVirial_zz;
2396  double vdwEnergy = host__local_kernel_data->vdwEnergy;
2397  double electEnergy = host__local_kernel_data->electEnergy;
2398  double fullElectEnergy = host__local_kernel_data->fullElectEnergy;
2399  if (singleKernelFlag == 0) {
2400  virial_xx += host__remote_kernel_data->virial_xx;
2401  virial_xy += host__remote_kernel_data->virial_xy;
2402  virial_xz += host__remote_kernel_data->virial_xz;
2403  virial_yy += host__remote_kernel_data->virial_yy;
2404  virial_yz += host__remote_kernel_data->virial_yz;
2405  virial_zz += host__remote_kernel_data->virial_zz;
2406  fullElectVirial_xx += host__remote_kernel_data->fullElectVirial_xx;
2407  fullElectVirial_xy += host__remote_kernel_data->fullElectVirial_xy;
2408  fullElectVirial_xz += host__remote_kernel_data->fullElectVirial_xz;
2409  fullElectVirial_yy += host__remote_kernel_data->fullElectVirial_yy;
2410  fullElectVirial_yz += host__remote_kernel_data->fullElectVirial_yz;
2411  fullElectVirial_zz += host__remote_kernel_data->fullElectVirial_zz;
2412  vdwEnergy += host__remote_kernel_data->vdwEnergy;
2413  electEnergy += host__remote_kernel_data->electEnergy;
2414  fullElectEnergy += host__remote_kernel_data->fullElectEnergy;
2415  }
2416 
2417  // DMK - NOTE : Contribute virial
2418  Tensor virial_tensor;
2419  virial_tensor.xx = virial_xx;
2420  virial_tensor.xy = virial_xy;
2421  virial_tensor.xz = virial_xz;
2422  virial_tensor.yx = virial_xy;
2423  virial_tensor.yy = virial_yy;
2424  virial_tensor.yz = virial_yz;
2425  virial_tensor.zx = virial_xz;
2426  virial_tensor.zy = virial_yz;
2427  virial_tensor.zz = virial_zz;
2428  // DMK - TODO | FIXME : GBIS support needed eventually
2429  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virial_tensor);
2430  if (doEnergy) {
2431  reduction->item(REDUCTION_LJ_ENERGY) += vdwEnergy;
2432  reduction->item(REDUCTION_ELECT_ENERGY) += electEnergy;
2433  }
2434  if (doSlow) {
2435  Tensor virial_slow_tensor;
2436  virial_slow_tensor.xx = fullElectVirial_xx;
2437  virial_slow_tensor.xy = fullElectVirial_xy;
2438  virial_slow_tensor.xz = fullElectVirial_xz;
2439  virial_slow_tensor.yx = fullElectVirial_xy;
2440  virial_slow_tensor.yy = fullElectVirial_yy;
2441  virial_slow_tensor.yz = fullElectVirial_yz;
2442  virial_slow_tensor.zx = fullElectVirial_xz;
2443  virial_slow_tensor.zy = fullElectVirial_yz;
2444  virial_slow_tensor.zz = fullElectVirial_zz;
2445  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virial_slow_tensor);
2446  if (doEnergy) { reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += fullElectEnergy; }
2447  }
2448 
2449  // Contribute to the exclusion checksum
2450  #if MIC_EXCL_CHECKSUM_FULL != 0
2451  int exclusionSum = host__local_kernel_data->exclusionSum;
2452  if (singleKernelFlag == 0) { exclusionSum += host__remote_kernel_data->exclusionSum; }
2453  reduction->item(REDUCTION_EXCLUSION_CHECKSUM) += exclusionSum;
2454  #endif
2455 
2456  // TRACING - Using the tracing output data generated by the device, submit user events to show
2457  // performance data from the MIC device in Projections output
2458  #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2459  {
2460  double timeBase = host__device_times_start[((singleKernelFlag) ? (1) : (0))];
2461 
2462  #if MIC_DEVICE_TRACING_DETAILED != 0
2463 
2464  // Create compute user events
2465  ComputeMap *computeMap = ComputeMap::Object();
2466  PatchMap *patchMap = PatchMap::Object();
2467  int aSize = patchMap->gridsize_a();
2468  int bSize = patchMap->gridsize_b();
2469  int cSize = patchMap->gridsize_c();
2470  for (int i = 0; i < host__patch_pairs_size; i++) {
2471 
2472  // Determine the distance between the patches
2473  int pid0 = host__patch_pairs[i].p1;
2474  int pid1 = host__patch_pairs[i].p2;
2475  int dist = 0;
2476  if (pid0 != pid1) { // Is a pair, not a self
2477  int trans0 = computeMap->trans(host__patch_pairs[i].cid, 0);
2478  int trans1 = computeMap->trans(host__patch_pairs[i].cid, 1);
2479  int index_a0 = patchMap->index_a(pid0) + aSize * Lattice::offset_a(trans0);
2480  int index_b0 = patchMap->index_b(pid0) + bSize * Lattice::offset_b(trans0);
2481  int index_c0 = patchMap->index_c(pid0) + cSize * Lattice::offset_c(trans0);
2482  int index_a1 = patchMap->index_a(pid1) + aSize * Lattice::offset_a(trans1);
2483  int index_b1 = patchMap->index_b(pid1) + bSize * Lattice::offset_b(trans1);
2484  int index_c1 = patchMap->index_c(pid1) + cSize * Lattice::offset_c(trans1);
2485  int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
2486  int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
2487  int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
2488  dist = da + db + dc;
2489  }
2490 
2491  // Retrieve the start and end times for the "compute's task"
2492  double ueStart = host__device_times_computes[i * 2];
2493  double ueEnd = host__device_times_computes[i * 2 + 1];
2494  ueStart += mic_first_kernel_submit_time - timeBase;
2495  ueEnd += mic_first_kernel_submit_time - timeBase;
2496 
2497  if (dist > 7) { dist = 7; } // NOTE: Make sure that the distance is put in the "7+" category if it is >7
2498  traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTE + dist, ueStart, ueEnd);
2499  }
2500 
2501  // Create patch (force reduction) user events
2502  for (int i = 0; i < host__force_lists_size; i++) {
2503  double ueStart = host__device_times_patches[i * 2];
2504  double ueEnd = host__device_times_patches[i * 2 + 1];
2505  ueStart += mic_first_kernel_submit_time - timeBase;
2506  ueEnd += mic_first_kernel_submit_time - timeBase;
2507  traceUserBracketEvent(MIC_EVENT_DEVICE_PATCH, ueStart, ueEnd);
2508  }
2509 
2510  #endif // MIC_DEVICE_TRACING_DETAILED
2511 
2512  // Create phases
2513  double lPTime0 = host__device_times_start[3];
2514  double lPTime1 = host__device_times_start[5];
2515  double lPTime2 = host__device_times_start[7];
2516  double lPTime3 = host__device_times_start[9];
2517  lPTime0 += mic_first_kernel_submit_time - timeBase;
2518  lPTime1 += mic_first_kernel_submit_time - timeBase;
2519  lPTime2 += mic_first_kernel_submit_time - timeBase;
2520  lPTime3 += mic_first_kernel_submit_time - timeBase;
2521  traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, lPTime0, lPTime1);
2522  //traceUserBracketEvent(MIC_EVENT_DEVICE_VIRIALS, lPTime1, lPTime2);
2523  traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, lPTime2, lPTime3);
2524  if (singleKernelFlag == 0) {
2525  double rPTime0 = host__device_times_start[2];
2526  double rPTime1 = host__device_times_start[4];
2527  double rPTime2 = host__device_times_start[6];
2528  double rPTime3 = host__device_times_start[8];
2529  rPTime0 += mic_first_kernel_submit_time - timeBase;
2530  rPTime1 += mic_first_kernel_submit_time - timeBase;
2531  rPTime2 += mic_first_kernel_submit_time - timeBase;
2532  rPTime3 += mic_first_kernel_submit_time - timeBase;
2533  traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, rPTime0, rPTime1);
2534  //traceUserBracketEvent(MIC_EVENT_DEVICE_VIRIALS, rPTime1, rPTime2);
2535  traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, rPTime2, rPTime3);
2536  }
2537  }
2538  #endif // MIC_DEVICE_TRACING
2539  }
2540 
2541  if (workStarted == 1) { return 0; }
2542 
2543  if ( master != this ) { // finished
2544  GBISP("finished\n");
2545  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2546  atomsChanged = 0;
2547 
2548  //#if MIC_TRACING != 0
2549  // MIC_TRACING_RECORD(MIC_EVENT_FUNC_FINISHWORK, finishWork_start, CmiWallTimer());
2550  //#endif
2551 
2552  return 1;
2553  }
2554 
2555  mic_timer_total += kernel_time;
2556 
2557  // DMK - NOTE : If this is the end of the step for this compute
2558  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2559 
2560  // DMK - DEBUG
2561  MICP("submitting reduction...\n"); MICPF;
2562 
2563  atomsChanged = 0;
2564 
2566  //double __start = CmiWallTimer();
2567 
2568  reduction->submit();
2569 
2571  //traceUserBracketEvent(11051, __start, CmiWallTimer());
2572 
2573  #if 0
2574  mic_timer_count++;
2575  if ( simParams->outputCudaTiming &&
2576  step % simParams->outputCudaTiming == 0 ) {
2577 
2578  // int natoms = mol->numAtoms;
2579  // double wpa = wcount; wpa /= natoms;
2580 
2581  // CkPrintf("Pe %d MIC kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
2582  // kernel_time * 1.e3, time * 1.e3, wpa);
2583 
2584  #if 0
2585  float upload_ms, remote_calc_ms;
2586  float local_calc_ms, total_ms;
2587  mic_errcheck("before event timers");
2588  micEventElapsedTime(&upload_ms, start_upload, start_calc);
2589  mic_errcheck("in event timer 1");
2590  micEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
2591  mic_errcheck("in event timer 2");
2592  micEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
2593  mic_errcheck("in event timer 3");
2594  micEventElapsedTime(&total_ms, start_upload, end_local_download);
2595  mic_errcheck("in event timer 4");
2596  mic_errcheck("in event timers");
2597 
2598  CkPrintf("MIC EVENT TIMING: %d %f %f %f %f\n",
2599  CkMyPe(), upload_ms, remote_calc_ms,
2600  local_calc_ms, total_ms);
2601  #endif
2602 
2603  if ( mic_timer_count >= simParams->outputCudaTiming ) {
2604  mic_timer_total /= mic_timer_count;
2605  CkPrintf("MIC TIMING: %d %f ms/step on node %d\n",
2606  step, mic_timer_total * 1.e3, CkMyPe());
2607  }
2608  mic_timer_count = 0;
2609  mic_timer_total = 0;
2610  }
2611  #endif
2612 
2613  } // !GBISOn || gbisPhase==3
2614 
2615  // Next GBIS Phase
2616  GBISP("C.N.MIC[%d]::fnWork: incrementing phase\n", CkMyPe())
2617  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2618 
2619  GBISP("C.N.MIC[%d] finished ready for next step\n",CkMyPe());
2620 
2621  //#if MIC_TRACING != 0
2622  // MIC_TRACING_RECORD(MIC_EVENT_FUNC_FINISHWORK, finishWork_start, CmiWallTimer());
2623  //#endif
2624 
2625  return 1; // finished and ready for next step
2626 }
2627 
2628 
2629 __thread FILE* mic_output = NULL;
2630 __thread int mic_output_set = 0;
2631 
2632 
2633 void mic_initproc() {
2634  #if MIC_DEBUG > 0
2635  debugInit(NULL);
2636  #endif
2637 }
2638 
2639 
2640 void debugInit(FILE* fout) {
2641  if (mic_output != NULL) { return; }
2642  if (fout != NULL) {
2643  mic_output = fout;
2644  mic_output_set = 1;
2645  } else {
2646  char fname[256];
2647  sprintf(fname, "mic_debug.%d", CkMyPe());
2648  printf("[MIC-INFO] :: Creating MIC debug file \"%s\" for PE %d...\n", fname, CkMyPe());
2649  mic_output = fopen(fname, "w");
2650  mic_output_set = 0;
2651  }
2652 }
2653 
2654 void debugClose() {
2655  if (mic_output_set == 0) { fclose(mic_output); mic_output = NULL; }
2656 }
2657 
2658 
2659 void mic_initHostDeviceLDB() {
2660  assert(!CkMyPe()); // NOTE: Only PE 0 should call this function
2661  WorkDistrib::send_initHostDeviceLDB(); // This results in a broadcast causing all
2662 } // PEs to call mic_hostDeviceLDB() below
2663 
2664 
2665 #define COMPUTE_DISTANCE(cid) \
2666 int manDist = -1; { \
2667  if (computeMap->type(cid) == computeNonbondedSelfType) { \
2668  manDist = 0; \
2669  } else if (computeMap->type(cid) == computeNonbondedPairType) { \
2670  int aSize = patchMap->gridsize_a(); \
2671  int bSize = patchMap->gridsize_b(); \
2672  int cSize = patchMap->gridsize_c(); \
2673  int pid0 = computeMap->pid(cid, 0); \
2674  int pid1 = computeMap->pid(cid, 1); \
2675  int trans0 = computeMap->trans(cid, 0); \
2676  int trans1 = computeMap->trans(cid, 1); \
2677  int index_a0 = patchMap->index_a(pid0) + aSize * Lattice::offset_a(trans0); \
2678  int index_b0 = patchMap->index_b(pid0) + bSize * Lattice::offset_b(trans0); \
2679  int index_c0 = patchMap->index_c(pid0) + cSize * Lattice::offset_c(trans0); \
2680  int index_a1 = patchMap->index_a(pid1) + aSize * Lattice::offset_a(trans1); \
2681  int index_b1 = patchMap->index_b(pid1) + bSize * Lattice::offset_b(trans1); \
2682  int index_c1 = patchMap->index_c(pid1) + cSize * Lattice::offset_c(trans1); \
2683  int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1)); \
2684  int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1)); \
2685  int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1)); \
2686  manDist = da + db + dc; \
2687  } \
2688 }
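// NOTE: COMPUTE_DISTANCE yields the Manhattan distance between a pair
// compute's two patches, measured in patch-grid units and including
// periodic-image offsets; self computes get 0, and any other compute type
// is left at -1.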
2689 
2690 
2691 void mic_hostDeviceLDB() {
2692  // NOTE: Called by all PEs, whether they drive a MIC or not
2693 
2694  const SimParameters * simParams = Node::Object()->simParameters;
2695 
2696  // If this PE is not driving a MIC, then just return an empty contribution
2697  if (devicePe != CkMyPe()) {
2698  WorkDistrib::send_contributeHostDeviceLDB(0, NULL);
2699  return;
2700  }
2701 
2702  // Otherwise send the set of PEs contributing to this PE's MIC
2703  WorkDistrib::send_contributeHostDeviceLDB(numPesSharingDevice, pesSharingDevice);
2704 }
2705 
2706 
2707 void mic_setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2) {
2708  if (CkMyPe() != 0) {
2709  if (dt >= 0) { mic_deviceThreshold = dt; }
2710  if (hs >= 0) { mic_hostSplit = hs; }
2711  if (sp1 > 0) { mic_numParts_self_p1 = sp1; }
2712  if (pp1 > 0) { mic_numParts_pair_p1 = pp1; }
2713  if (pp2 > 0) { mic_numParts_pair_p2 = pp2; }
2714  }
2715 }
2716 
2717 
2718 void mic_contributeHostDeviceLDB(int peSetLen, int * peSet) {
2719  assert(!CkMyPe()); // NOTE: This function should only be called on PE 0
2720 
2721  static int numContribs = 0;
2722  __ASSERT(numContribs >= 0 && numContribs < CkNumPes()); // Make sure we don't get more messages than expected
2723 
2724  static double hdLDBTime = 0.0;
2725  static double hdLDBTime_phase0 = 0.0;
2726  static double hdLDBTime_phase1 = 0.0;
2727  static double hdLDBTime_phase2a = 0.0;
2728  static double hdLDBTime_phase2b = 0.0;
2729  double startTime = CmiWallTimer();
2730 
2731  const SimParameters * simParams = Node::Object()->simParameters;
2732  PatchMap *patchMap = PatchMap::Object();
2733  ComputeMap *computeMap = ComputeMap::Object();
2734  Molecule *molecule = Node::Object()->molecule;
2735  int nComputes = computeMap->numComputes();
2736 
2737  if (numContribs == 0) { // First call
2738  // Setup load balancing parameters (pulling from SimParameters if they haven't already been
2739  // set using the command line or PE 0)
2740  if (mic_deviceThreshold < 0) { mic_deviceThreshold = simParams->mic_deviceThreshold; }
2741  if (mic_hostSplit < 0) { mic_hostSplit = simParams->mic_hostSplit; }
2742  if (mic_numParts_self_p1 < 0) { mic_numParts_self_p1 = simParams->mic_numParts_self_p1; }
2743  if (mic_numParts_pair_p1 < 0) { mic_numParts_pair_p1 = simParams->mic_numParts_pair_p1; }
2744  if (mic_numParts_pair_p2 < 0) { mic_numParts_pair_p2 = simParams->mic_numParts_pair_p2; }
2745  }
2746 
2747  // If a PE has checked in with PE 0 with a non-empty peSet, perform the work
2748  // of load balancing the computes within that peSet.
2749  if (peSetLen > 0) {
2750 
2751  // DMK - DEBUG
2752  #if MIC_VERBOSE_HVD_LDB != 0
2753  CkPrintf("[MIC-HvD-LDB] :: PE %d :: offload pe list = {", CkMyPe());
2754  for (int i = 0; i < peSetLen; i++) { CkPrintf(" %d", peSet[i]); }
2755  CkPrintf(" }\n");
2756  #endif
2757 
2758  // If the deviceThreshold and/or hostSplit values are unset, default them now
2759  // NOTE: By this time, both the command line and config file parameters will have been applied if present
2760  if (mic_deviceThreshold < 0 || mic_hostSplit <= 0) {
2761 
2762  // Collect some parameters for the given problem/input
2763  const int ppn = CkNodeSize(0); // NOTE: For now, assuming all nodes have the same number of host threads
2764  const int ppm = ppn / mic_get_device_count();
2765  const float atomsPerPE = ((float)(molecule->numAtoms)) / ((float)(CkNumPes()));
2766  const int numAway = ((simParams->twoAwayX > 0) ? (1) : (0))
2767  + ((simParams->twoAwayY > 0) ? (1) : (0))
2768  + ((simParams->twoAwayZ > 0) ? (1) : (0));
2769  // If the "twoAway" options are being used, back off on the hostSplit adjustment by
2770  // pretending there are more atoms per PE (i.e. reduce the scaling adjustment below)
2771  const float atomsPerPE_adj = atomsPerPE + (atomsPerPE * 0.85f * numAway);
2772 
2773  // DMK - DEBUG
2774  #if MIC_VERBOSE_HVD_LDB != 0
2775  printf("[MIC-HvD-LDB] :: PE %d :: ppn: %d, ppm: %d, atomsPerPe: %f (%f)\n", CkMyPe(), ppn, ppm, atomsPerPE, atomsPerPE_adj);
2776  #endif
2777 
2778  // Set hostSplit
2779  // NOTE: This calculation is based on measurements performed on the Stampede cluster at
2780  // TACC (update as necessary: code changes, performance improvements on the MIC, etc.)
2781  // NOTE: The calculation has only been tested on Stampede, and breaks down at 2AwayXY because
2782  // of other dynamic load balancing effects that occur at those scales. TODO: These effects
2783  // need to be corrected/accounted for and then this balancing scheme needs to be revisited.
2784  //if (numAway < 1) {
2785 
2786  // Set deviceThreshold
2787  mic_deviceThreshold = 2;
2788 
2789  // DMK - NOTE : The constants used here are based on measurements and may need to change
2790  // as the code is modified (update as necessary)
2791  float hs_base = 0.714f * 79.0f;
2792  float hostVsDevice_adj = 0.714f * ppm / 2.0f;
2793  float scaling_adj = 0.714f * 26000.0f / atomsPerPE_adj; // NOTE: scaling as problem thins out, not hard processor/core count
2794 
2795  mic_hostSplit = (int)(hs_base - hostVsDevice_adj - scaling_adj);
2796  if (mic_hostSplit < 5) { mic_hostSplit = 5; } // Apply upper and lower bounds (5% - 95%)
2797  if (mic_hostSplit > 95) { mic_hostSplit = 95; }
2798 
2799  //} else {
2800  // mic_deviceThreshold = 2; // + numAway;
2801  // mic_hostSplit = 30;
2802  //}
2803 
2804  //printf("[MIC-HvD-LDB] :: PE %d :: Automated HvD LDB - Setting DT:%d and HS:%d...\n", CkMyPe(), mic_deviceThreshold, mic_hostSplit);
2805  iout << iINFO << "MIC-HvD-LDB - Automated HvD LDB Setting DT:" << mic_deviceThreshold << " and HS:" << mic_hostSplit << "\n" << endi;
2806 
2807  } else {
2808  if (mic_hostSplit > 100) { mic_hostSplit = 100; }
2809  }
2810  __ASSERT(mic_deviceThreshold >= 0 && mic_hostSplit >= 0);
2811 
2812  // Create "grab bags" of self and pair computes associated with the peSet that
2813  // can be potentially offloaded to the MIC associated with that peSet
2814  std::queue<int> * selfs = new std::queue<int>();
2815  std::queue<int> * pairs = new std::queue<int>();
2816  std::queue<int> * selfsTmp = new std::queue<int>();
2817  std::queue<int> * pairsTmp = new std::queue<int>();
2818  __ASSERT(selfs != NULL && pairs != NULL && selfsTmp != NULL && pairsTmp != NULL);
2819 
2820  int minPID = -1;
2821  int maxPID = -1;
2822  #define WIDEN_PID_RANGE(pid) \
2823  if (minPID < 0) { minPID = pid; maxPID = pid; } \
2824  if (pid < minPID) { minPID = pid; } \
2825  if (pid > maxPID) { maxPID = pid; }
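  // NOTE: Tracking [minPID, maxPID] lets the code below size pidFlag as a
  // dense array indexed by (pid - minPID) rather than hashing patch IDs.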
2826 
2827  double startTime_phase0 = CmiWallTimer();
2828 
2829  // NOTE: If a self or pair compute does not meet the requirements set forth by the load
2830  // balancing parameters, place it in pairsTmp ("other" selfs and pairs). It may be
2831  // required in phase1 below to ensure the one-patch-per-PE requirement in phase 1.
2832  int totalNumSelfs = 0;
2833  int totalNumPairs = 0;
2834  for (int i = 0; i < nComputes; i++) {
2835 
2836  // Skip this compute if it is not on a PE in the given set of PEs
2837  int pe = computeMap->node(i);
2838  bool peInSet = false;
2839  for (int j = 0; !peInSet && j < peSetLen; j++) {
2840  if (peSet[j] == pe) { peInSet = true; }
2841  }
2842  if (!peInSet) { continue; }
2843 
2844  // Only consider self and pair computes
2845  if (computeMap->type(i) == computeNonbondedSelfType) {
2846  totalNumSelfs++;
2847  } else if (computeMap->type(i) == computeNonbondedPairType) {
2848  totalNumPairs++;
2849  } else {
2850  continue;
2851  }
2852 
2853  // Check: distance less than device threshold
2854  COMPUTE_DISTANCE(i);
2855  if (manDist < 0 || manDist > mic_deviceThreshold) {
2856  pairsTmp->push(i); // NOTE: because mic_deviceThreshold >= 0 and the manDist
2857  continue; // of a self is 0, only pairs can be placed in pairsTmp
2858  }
2859 
2860  // Check: self or pair, place in associated "grab bag" while updating PID range
2861  if (computeMap->type(i) == computeNonbondedSelfType) {
2862  selfs->push(i);
2863  int pid0 = computeMap->pid(i, 0); WIDEN_PID_RANGE(pid0);
2864  }
2865  if (computeMap->type(i) == computeNonbondedPairType) {
2866  pairs->push(i);
2867  int pid0 = computeMap->pid(i, 0); WIDEN_PID_RANGE(pid0);
2868  int pid1 = computeMap->pid(i, 1); WIDEN_PID_RANGE(pid1);
2869  }
2870  }
2871  __ASSERT(minPID <= maxPID);
2872 
2873  hdLDBTime_phase0 += CmiWallTimer() - startTime_phase0;
2874 
2875  // Calculate the target number of compute objects to offload
2876  int total = totalNumSelfs + totalNumPairs;
2877  int target = (int)((total * mic_hostSplit) / 100.0f);
2878  if (target > total) { target = total; } // Do this check just in case a floating point round issue occurs
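  // NOTE: As used here, mic_hostSplit is the percentage of this PE set's
  // self/pair computes to offload to the device. The total includes the
  // "other" pairs that failed the distance test, even though phase 2 below
  // only draws offload candidates from the selfs and pairs queues.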
2879 
2880  // DMK - DEBUG
2881  #if MIC_VERBOSE_HVD_LDB != 0
2882  printf("[MIC-HvD-LDB] :: PE %d :: Phase 0 Result - selfs:%d, pairs:%d, other:%d, self+pair:%d, target:%d\n",
2883  CkMyPe(), (int)(selfs->size()), (int)(pairs->size()), (int)(pairsTmp->size()), total, target
2884  ); fflush(NULL);
2885  #endif
2886 
2887  // Setup an array of flags (one entry per PID) that indicates whether or not a given
2888  // PID is already being referenced by the MIC compute object (i.e. required by at
2889  // least one compute already mapped to the device). Also create a list of CIDs that
2890  // are to be mapped to the device.
2891  bool * pidFlag = new bool[maxPID - minPID + 1];
2892  std::queue<int> * offloads = new std::queue<int>();
2893  __ASSERT(offloads != NULL && pidFlag != NULL);
2894  int offloads_selfs = 0;
2895  int offloads_pairs = 0;
2896  for (int i = minPID; i <= maxPID; i++) { pidFlag[i - minPID] = false; }
2897 
2898  // Create an array of flags that indicate if a given PE contributing to this PE's MIC
2899  // has had at least one patch offloaded (required by ::doWork and ::noWork)
2900  int peLo = peSet[0]; // NOTE: Have already tested that peSetLen > 0
2901  int peHi = peSet[0];
2902  for (int i = 0; i < peSetLen; i++) {
2903  if (peSet[i] < peLo) { peLo = peSet[i]; }
2904  if (peSet[i] > peHi) { peHi = peSet[i]; }
2905  }
2906  __ASSERT(peLo <= peHi);
2907  bool * peSetFlag = new bool[peHi - peLo + 1]; // Inclusive of peLo and peHi
2908  __ASSERT(peSetFlag != NULL);
2909  for (int i = peLo; i <= peHi; i++) { peSetFlag[i - peLo] = false; }
2910 
2912 
2913  double startTime_phase1 = CmiWallTimer();
2914 
2915  // Sweep through the computes, adding at least one self compute per contributing PE to
2916  // satisfy assumptions in ::doWork() and ::noWork()... for now, at most one per PE
2917  // Start with self computes (favor them)
2918  int numSelfs = selfs->size();
2919  for (int i = 0; i < numSelfs; i++) {
2920  int cid = selfs->front(); selfs->pop();
2921  int pid = computeMap->pid(cid, 0);
2922  int pe0 = patchMap->node(pid);
2923  if (pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) {
2924  offloads->push(cid);
2925  offloads_selfs++;
2926  peSetFlag[pe0 - peLo] = true; // Mark the PE as having at least one compute offloaded
2927  pidFlag[pid - minPID] = true; // Mark the associated PID as already being required by the MIC card
2928  } else {
2929  selfs->push(cid);
2930  }
2931  }
2932  // Move on to pair computes
2933  int numPairs = pairs->size();
2934  for (int i = 0; i < numPairs; i++) {
2935  int cid = pairs->front(); pairs->pop();
2936  int pid0 = computeMap->pid(cid, 0);
2937  int pid1 = computeMap->pid(cid, 1);
2938  int pe0 = patchMap->node(pid0);
2939  int pe1 = patchMap->node(pid1);
2940  if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
2941  (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
2942  ) {
2943  offloads->push(cid);
2944  offloads_pairs++;
2945  if (pe0 >= peLo && pe0 <= peHi) {
2946  peSetFlag[pe0 - peLo] = true;
2947  pidFlag[pid0 - minPID] = true;
2948  }
2949  if (pe1 >= peLo && pe1 <= peHi) {
2950  peSetFlag[pe1 - peLo] = true;
2951  pidFlag[pid1 - minPID] = true;
2952  }
2953  } else {
2954  pairs->push(cid);
2955  }
2956  }
2957  // If need be, "other" computes (pairs that didn't meet LDB requirements, but might be required)
2958  // NOTE: By this point, it should be the case that all the peSetFlag values are set. Just in
2959  // case that is not true, look through the "other" pairs (that didn't fit LDB criteria).
2960  // However, in this case, also empty the queue since these cannot be used in phase 2 below.
2961  numPairs = pairsTmp->size();
2962  bool usedOther = false;
2963  while (!(pairsTmp->empty())) {
2964  int cid = pairsTmp->front(); pairsTmp->pop();
2965  int pid0 = computeMap->pid(cid, 0);
2966  int pid1 = computeMap->pid(cid, 1);
2967  int pe0 = patchMap->node(pid0);
2968  int pe1 = patchMap->node(pid1);
2969  if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
2970  (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
2971  ) {
2972  offloads->push(cid);
2973  offloads_pairs++; usedOther = true; // NOTE: record that a non-LDB pair was needed so the warning below fires
2974  if (pe0 >= peLo && pe0 <= peHi) {
2975  peSetFlag[pe0 - peLo] = true;
2976  pidFlag[pid0 - minPID] = true;
2977  }
2978  if (pe1 >= peLo && pe1 <= peHi) {
2979  peSetFlag[pe1 - peLo] = true;
2980  pidFlag[pid1 - minPID] = true;
2981  }
2982  }
2983  }
2984  if (usedOther) { CkPrintf("[MIC-WARNING] :: Offloaded non-bonded compute that didn't meet LDB criteria to satisfy phase 1 requirement (just FYI, not a problem)\n"); }
2985 
2986  hdLDBTime_phase1 += CmiWallTimer() - startTime_phase1;
2987 
2988  // Verify at least one self per PE was selected (assumption is one patch per
2989  // PE and self computes are mapped to same PE as their associated patch)
2990  int numPEsSet = 0;
2991  for (int i = 0; i < peSetLen; i++) {
2992  if (peSetFlag[peSet[i] - peLo]) { numPEsSet++; }
2993  }
2994  __ASSERT(numPEsSet > 0); // NOTE: Need at least one per set of PEs (one per master)
2995 
2996  // DMK - DEBUG
2997  #if MIC_VERBOSE_HVD_LDB != 0
2998  printf("[MIC-HvD-LDB] :: PE %d :: Phase 1 Result - selfs:%d, pairs:%d, offloads:%d, target:%d\n",
2999  CkMyPe(), (int)(selfs->size()), (int)(pairs->size()), (int)(offloads->size()), target
3000  ); fflush(NULL);
3001  #endif
3002 
3004 
3005  // Push compute objects to the device until the target number of computes has been reached
3006  while (offloads->size() < target) {
3007 
3009 
3010  double time_0 = CmiWallTimer();
3011 
3012  // If we haven't reached the target yet, grab a compute to add to the mix (along with its
3013  // referenced PIDs)
3014  // NOTE : As a heuristic, if the number of offloaded computes is still far away from the
3015  // overall target number, try to add many at a time (miniTarget). However, with twoAway
3016  // options enabled, many are likely to be added by part 2 below, so don't add too many
3017  // in part 1 so that part 2 below can do its job of minimizing data movement across
3018  // the PCIe bus. This is where the miniTarget divisor comes from (about half of 5*5*5
3019  // from twoAways all being enabled).
3020  int miniTarget = (target - offloads->size()) / 50;
3021  int cid = -1;
3022  if (miniTarget < 1) { miniTarget = 1; }
3023  for (int k = 0; k < miniTarget; k++) {
3024 
3025  // Select the compute (prefer selfs)
3026  if (selfs->size() > 0) { cid = selfs->front(); selfs->pop(); offloads_selfs++; }
3027  else if (pairs->size() > 0) { cid = pairs->front(); pairs->pop(); offloads_pairs++; }
3028  else { break; } // Nothing to grab, so exit
3029  __ASSERT(cid >= 0);
3030 
3031  // Add the compute to the list of offloaded computes
3032  offloads->push(cid); // Select this compute and mark its PIDs as used
3033 
3034  // Mark the PIDs associated with the compute as being referenced by the device
3035  // so computes that use the same PIDs can be selected in part 2 below
3036  int pid0 = computeMap->pid(cid, 0);
3037  pidFlag[pid0 - minPID] = true;
3038  if (computeMap->type(cid) == computeNonbondedPairType) {
3039  int pid1 = computeMap->pid(cid, 1);
3040  pidFlag[pid1 - minPID] = true;
3041  }
3042  }
3043  if (cid < 0) { break; } // No longer any computes in either list (selfs or pairs)
3044 
3046 
3047  double time_1 = CmiWallTimer();
3048 
3049  // While we haven't reached the target, add computes whose PIDs are already
3050  // referenced by computes mapped to the device
3051  while (offloads->size() < target && selfs->size() > 0) { // Start with selfs
3052  int cid = selfs->front(); selfs->pop();
3053  int pid0 = computeMap->pid(cid, 0);
3054  if (pidFlag[pid0 - minPID]) {
3055  offloads->push(cid);
3056  offloads_selfs++;
3057  } else {
3058  selfsTmp->push(cid);
3059  }
3060  }
3061  { std::queue<int> * t = selfs; selfs = selfsTmp; selfsTmp = t; }
3062 
3063  while (offloads->size() < target && pairs->size() > 0) {
3064  int cid = pairs->front(); pairs->pop();
3065  int pid0 = computeMap->pid(cid, 0);
3066  int pid1 = computeMap->pid(cid, 1);
3067  if (pidFlag[pid0 - minPID] && pidFlag[pid1 - minPID]) {
3068  offloads->push(cid);
3069  offloads_pairs++;
3070  } else {
3071  pairsTmp->push(cid);
3072  }
3073  }
3074  { std::queue<int> * t = pairs; pairs = pairsTmp; pairsTmp = t; }
3075 
3076  double time_2 = CmiWallTimer();
3077  hdLDBTime_phase2a += time_1 - time_0;
3078  hdLDBTime_phase2b += time_2 - time_1;
3079  }
3080 
3081  // DMK - DEBUG
3082  #if MIC_VERBOSE_HVD_LDB != 0
3083  printf("[MIC-HvD-LDB] :: PE %d :: Phase 2 Result [%d->%d] -- selfs:%d, pairs:%d, offloads:%d (s:%d, p:%d), target:%d\n",
3084  CkMyPe(), peLo, peHi, (int)(selfs->size() + selfsTmp->size()), (int)(pairs->size() + pairsTmp->size()),
3085  (int)(offloads->size()), offloads_selfs, offloads_pairs, target
3086  ); fflush(NULL);
3087  #endif
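Both expansion loops in phase 2 above follow the same select-or-park pattern: pop each candidate, accept it if every PID it references is already resident on the device, otherwise park it in a scratch queue, then swap the queues so the leftovers are rescanned on the next outer iteration. A generic sketch of that idiom (illustrative names, not NAMD's actual code):

#include <queue>
#include <utility>

// Drain *work, pushing accepted computes into 'accepted' and parking the
// rest in *scratch; finally swap the pointers so the parked computes become
// the work queue for the next pass. Swapping pointers instead of copying
// keeps each pass linear in the number of queue operations.
template <typename Pred>
void filterInto(std::queue<int> *&work, std::queue<int> *&scratch,
                std::queue<int> &accepted, Pred pass) {
  while (!work->empty()) {
    int cid = work->front(); work->pop();
    if (pass(cid)) accepted.push(cid);  // e.g., all referenced PIDs on device
    else scratch->push(cid);            // revisit on a later pass
  }
  std::swap(work, scratch);
}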
3088 
3089  // If the numParts parameters have not been set manually, then split the computes headed
3090  // to the device so as to (1) break up larger tasks and (2) create more tasks
3091  if (mic_numParts_self_p1 <= 0 || mic_numParts_pair_p1 <= 0 || mic_numParts_pair_p2 <= 0) {
3092 
3093  int numDeviceComputes = offloads->size();
3094  if (numDeviceComputes > 1000) { // Enough tasks, no need to break up anything
3095  mic_numParts_self_p1 = 1;
3096  mic_numParts_pair_p1 = 1;
3097  mic_numParts_pair_p2 = 1;
3098  } else if (numDeviceComputes <= 1000 && numDeviceComputes > 350) { // Enough tasks, but some might be too large (granularity)
3099  mic_numParts_self_p1 = 3;
3100  mic_numParts_pair_p1 = 3;
3101  mic_numParts_pair_p2 = 1;
3102  } else { // Starting to be too few tasks, so start splitting them up more and more
3103  // Try to get around 250 - 300 self parts and 400 - 500 pair parts (650 - 800 total parts)
3104  if (offloads_selfs <= 0) { offloads_selfs = 1; }
3105  if (offloads_pairs <= 0) { offloads_pairs = 1; }
3106  #define CONFINE_TO_RANGE(v, mi, ma) (v = (v > ma) ? (ma) : ((v < mi) ? (mi) : (v)))
3107  mic_numParts_self_p1 = 300 / offloads_selfs; CONFINE_TO_RANGE(mic_numParts_self_p1, 8, 16);
3108  mic_numParts_pair_p1 = 500 / offloads_pairs; CONFINE_TO_RANGE(mic_numParts_pair_p1, 4, 16);
3109  #undef CONFINE_TO_RANGE
3110  mic_numParts_pair_p2 = mic_numParts_pair_p1 / 2;
3111  mic_numParts_pair_p1 += mic_numParts_pair_p2; // NOTE: 'numParts = p1 - dist * p2' and 'dist > 0' for pairs
3112  }
3113 
3114  //printf("[MIC-HvD-LDB] :: PE %d :: Automated HvD LDB - Setting NumParts { %d %d %d }\n", CkMyPe(), mic_numParts_self_p1, mic_numParts_pair_p1, mic_numParts_pair_p2);
3115  iout << iINFO << "MIC-HvD-LDB - Automated HvD LDB Setting NumParts { " << mic_numParts_self_p1 << ", " << mic_numParts_pair_p1 << ", " << mic_numParts_pair_p2 << " }\n" << endi;
3116  }
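To make the auto-tuning arithmetic above concrete, here is one worked instance under the (hypothetical) assumption that 40 self and 60 pair computes were offloaded:

//   mic_numParts_self_p1 = 300 / 40 = 7  -> clamped into [8,16]  => 8
//   mic_numParts_pair_p1 = 500 / 60 = 8  -> already within [4,16] => 8
//   mic_numParts_pair_p2 = 8 / 2 = 4
//   mic_numParts_pair_p1 += 4            => 12
// Each self compute is then split 8 ways (about 320 self parts), and a pair
// compute at patch distance dist is split (12 - dist * 4) ways, so the
// nearest pairs (dist = 1) produce 8 parts each (up to about 480 pair parts).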
3117 
3118  // Update the compute map to reflect the load balancing decision
3119  while (offloads->size() > 0) {
3120  int cid = offloads->front(); offloads->pop();
3121  computeMap->setDirectToDevice(cid, 1);
3122  }
3123 
3124  // Cleanup temp data
3125  delete selfs;
3126  delete pairs;
3127  delete selfsTmp;
3128  delete pairsTmp;
3129  delete offloads;
3130  delete [] peSetFlag;
3131  delete [] pidFlag;
3132  }
3133 
3134  // Increment the count for the number of contributors (PEs). If this is the last
3135  // one, attempt to dump the host-device compute map.
3136  numContribs++;
3137  hdLDBTime += CmiWallTimer() - startTime;
3138  if (numContribs >= CkNumPes()) {
3139  mic_dumpHostDeviceComputeMap();
3140  WorkDistrib::send_setDeviceLDBParams(mic_deviceThreshold, mic_hostSplit,
3141  mic_numParts_self_p1, mic_numParts_pair_p1, mic_numParts_pair_p2);
3142  #if MIC_VERBOSE_HVD_LDB != 0
3143  CkPrintf("[MIC-HvD-LDB] :: PE %d :: Total HvD LDB Time: %lf sec = { %lf, %lf, (%lf, %lf) }\n",
3144  CkMyPe(), hdLDBTime, hdLDBTime_phase0, hdLDBTime_phase1, hdLDBTime_phase2a, hdLDBTime_phase2b
3145  );
3146  #endif
3147  }
3148 }
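The end of the function gates the one-time global actions (dumping the compute map and broadcasting the tuned parameters) on a simple contributor count. Reduced to its essentials, and assuming each PE contributes exactly once, the pattern is just:

static int ldbContribCount = 0;  // hypothetical stand-in for numContribs

// Returns true only once the final contributor out of totalPes has reported
// in, so the caller can then perform the once-only work (dump, broadcast).
bool lastLDBContributor(int totalPes) {
  return ++ldbContribCount >= totalPes;
}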
3149 
3150 
3151 void mic_dumpHostDeviceComputeMap() {
3152  #if MIC_DUMP_COMPUTE_MAPPING != 0
3153  ComputeMap *computeMap = ComputeMap::Object();
3154  int nComputes = computeMap->numComputes();
3155  char filename[256] = { 0 };
3156  sprintf(filename, "deviceComputeMap.%d", CkMyPe());
3157  FILE * fmap = fopen(filename, "w");
3158  if (fmap != NULL) {
3159  printf("[MIC-INFO] :: PE %d :: Writing device compute map to \"%s\".\n", CkMyPe(), filename);
3160  fprintf(fmap, "Device compute map\n");
3161  for (int i = 0; i < nComputes; i++) {
3162  if (i % 50 == 0) { fprintf(fmap, "\n%d:", i); }
3163  fprintf(fmap, "%s%d", ((i % 10 == 0) ? (" ") : ("")), computeMap->directToDevice(i));
3164  }
3165  fclose(fmap);
3166  } else {
3167  printf("[MIC-WARNING] :: Unable to dump device compute map to file \"%s\"... skipping.\n", filename);
3168  }
3169  #endif
3170 }
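The dump written above has a fixed layout: a header line, an "N:" row label every 50 computes, a space every 10 flags, and one digit per compute giving its directToDevice value. A minimal reader for that layout (a sketch only, not part of NAMD, and assuming the flags are 0/1 as written by setDirectToDevice above) could be:

#include <cstdio>
#include <vector>

// Parse a deviceComputeMap.<pe> file back into a flat vector of 0/1 flags.
// Digits before a ':' belong to the row label and are skipped; digits after
// it (ignoring the spacing inserted every 10 entries) are the flags.
std::vector<int> readDeviceComputeMap(const char *filename) {
  std::vector<int> flags;
  FILE *f = fopen(filename, "r");
  if (f == NULL) return flags;
  int c;
  bool afterLabel = false;
  while ((c = fgetc(f)) != EOF) {
    if (c == ':') { afterLabel = true; continue; }
    if (c == '\n') { afterLabel = false; continue; }
    if (afterLabel && (c == '0' || c == '1')) flags.push_back(c - '0');
  }
  fclose(f);
  return flags;
}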
3171 
3172 
3173 #endif // NAMD_MIC