ComputeNonbondedMIC.C

#include "common.h"
#include "charm++.h"

#include "WorkDistrib.h"
#include "ComputeMgr.h"
#include "ProxyMgr.h"
#include "ComputeNonbondedMIC.h"
#include "ComputeNonbondedMICKernel.h"
#include "LJTable.h"
#include "ObjectArena.h"
#include "SortAtoms.h"
#include <algorithm>

#include "NamdTypes.h"

// This file is only used for builds with Xeon Phi (MIC) support
#ifdef NAMD_MIC

#include <offload.h>
#include <queue>
#include <assert.h>


//#define PRINT_GBIS
#undef PRINT_GBIS

#ifdef WIN32
  #define __thread __declspec(thread)
#endif


// Declare (extern) the structures that will be used for holding kernel-specific data since
//   virial data is located within that data structure (read below).
extern __thread mic_kernel_data * host__kernel_data;
extern __thread int singleKernelFlag;

// TRACING - Declare (extern) the buffers that hold the timing data passed back from the device
#if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
  __thread double mic_first_kernel_submit_time;
  extern __thread patch_pair* host__patch_pairs;
  extern __thread int host__patch_pairs_size;
  extern __thread int host__force_lists_size;
  #if MIC_DEVICE_TRACING_DETAILED != 0
    extern __thread double* host__device_times_computes;
    extern __thread double* host__device_times_patches;
  #endif
  extern __thread double* host__device_times_start;
#endif


__thread int mic_singleKernel = -1;
__thread int mic_deviceThreshold = -1;
__thread int mic_hostSplit = -1;
__thread int mic_numParts_self_p1 = -1;
__thread int mic_numParts_pair_p1 = -1;
__thread int mic_numParts_pair_p2 = -1;


// Function used to get the number of devices
int mic_device_count = 0;
int mic_get_device_count() { return mic_device_count; }


void mic_errcheck(const char *msg) {
  // DMK - TODO : Add mechanism to detect errors here
}


void mic_die(const char *msg) {
  char host[128];
  #ifdef NOHOSTNAME
    sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
  #else
    gethostname(host, 128);  host[127] = 0;
  #endif
  char devstr[128] = "";
  int devnum = mic_device_pe();
  if (devnum == CkMyPe()) {
    sprintf(devstr, " device %d", devnum);
  }
  char errmsg[1024];
  sprintf(errmsg, "MIC error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
  NAMD_die(errmsg);
}


// Function used to process MIC-specific command-line parameters along with
//   global variables to hold the content (or flag value) of the parameters.
char *devicelist;
static __thread int usedevicelist;
void mic_getargs(char **argv) {
  devicelist = 0;
  usedevicelist = CmiGetArgStringDesc(argv, "+devices", &devicelist,
        "comma-delimited list of MIC device numbers such as 0,2,1,2");

  CmiGetArgInt(argv, "+micSK", &mic_singleKernel);
  CmiGetArgInt(argv, "+micDT", &mic_deviceThreshold);
  CmiGetArgInt(argv, "+micHS", &mic_hostSplit);
  CmiGetArgInt(argv, "+micSP1", &mic_numParts_self_p1);
  CmiGetArgInt(argv, "+micPP1", &mic_numParts_pair_p1);
  CmiGetArgInt(argv, "+micPP2", &mic_numParts_pair_p2);
}
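// Example (illustrative only; flag names taken from the calls above): a run
// that restricts NAMD to two MIC devices and forces the single-kernel path
// might be launched as
//   namd2 +devices 0,1 +micSK 1 sim.namd
// where "sim.namd" is a placeholder config file.  Flags left unspecified keep
// their default of -1, meaning "defer to SimParameters or the built-in
// heuristics."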


// Variables for tracking which MIC-device is associated with which host PE.
static __thread int shared_mic;
static __thread int first_pe_sharing_mic;
static __thread int next_pe_sharing_mic;
static __thread int devicePe;
static __thread int numPesSharingDevice;
static __thread int *pesSharingDevice;

static __thread int mic_is_mine;
static __thread int myDevice;

int mic_device_pe() { return devicePe; }

bool mic_device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

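// Comparison functor that orders two integers by the bit-reversed value of
// their binary representations (the least-significant bit is compared first),
// presumably so that numerically adjacent ranks sort far apart from one
// another.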
static inline bool sortop_bitreverse(int a, int b) {
  if ( a == b ) return false;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
  }
  return false;
}


void mic_initialize() {

  // If this is PE 0, register the MIC-specific user events with Projections
  if ( 0 == CkMyPe() ) {
    MIC_TRACING_REGISTER_EVENTS
  }

  // Get the host name for this node
  char host[128];
  #ifdef NOHOSTNAME
    sprintf(host,"physical node %d", CmiPhysicalNodeID(CkMyPe()));
  #else
    gethostname(host, 128);  host[127] = 0;
  #endif

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode,
                          &numPesOnPhysicalNode
                         );

  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
          i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  int deviceCount = _Offload_number_of_devices();
  if ( deviceCount < 1 ) {
    mic_die("No MIC devices found.");
  }

  int *devices;
  int ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    #pragma noinline
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
      #if 0
        int dev_ok = 0;
        #pragma offload target(mic: dev) in(dev)
        dev_ok = mic_check_local();
      #else
        int dev_ok = mic_check(dev);
      #endif
      if ( dev_ok ) devices[ndevices++] = dev;
      else CkPrintf("Offload to device %d on Pe %d failed.\n",dev,CkMyPe());
    }
  }

  mic_device_count = ndevices;

  if ( ! ndevices ) {
    mic_die("No usable MIC devices found.");
  }

  shared_mic = 0;
  mic_is_mine = 1;
  first_pe_sharing_mic = CkMyPe();
  next_pe_sharing_mic = CkMyPe();

  int dev;
  if ( numPesOnPhysicalNode > 1 ) {
    int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
    dev = devices[myDeviceRank];
    devicePe = CkMyPe();
    pesSharingDevice = new int[numPesOnPhysicalNode];
    devicePe = -1;
    numPesSharingDevice = 0;
    for ( int i = 0; i < numPesOnPhysicalNode; ++i ) {
      if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
        int thisPe = pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( devicePe < 1 ) devicePe = thisPe;
        if ( WorkDistrib::pe_sortop_diffuse()(thisPe,devicePe) ) devicePe = thisPe;
      }
    }
    for ( int j = 0; j < ndevices; ++j ) {
      if ( devices[j] == dev && j != myDeviceRank ) shared_mic = 1;
    }
    if ( shared_mic && devicePe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
      CkPrintf("Pe %d sharing MIC device %d\n", CkMyPe(), dev);
    }
  } else {  // in case phys node code is lying
    dev = devices[CkMyPe() % ndevices];
    devicePe = CkMyPe();
    pesSharingDevice = new int[1];
    pesSharingDevice[0] = CkMyPe();
    numPesSharingDevice = 1;
  }

  if ( devicePe != CkMyPe() ) {
    if ( CmiPhysicalNodeID(devicePe) < 2 )
    CkPrintf("Pe %d physical rank %d will use MIC device of pe %d\n",
             CkMyPe(), myRankInPhysicalNode, devicePe);
    myDevice = -1;
    return;
  }

  // disable token-passing but don't submit local until remote finished
  // if shared_mic is true, otherwise submit all work immediately
  first_pe_sharing_mic = CkMyPe();
  next_pe_sharing_mic = CkMyPe();

  mic_is_mine = ( first_pe_sharing_mic == CkMyPe() );

  if ( dev >= deviceCount ) {
    char buf[256];
    sprintf(buf,"Pe %d unable to bind to MIC device %d on %s because only %d devices are present",
                CkMyPe(), dev, host, deviceCount);
    NAMD_die(buf);
  }

  if ( CmiPhysicalNodeID(devicePe) < 2 )
  CkPrintf("Pe %d physical rank %d binding to MIC device %d on %s: %d procs %d threads \n",
             CkMyPe(), myRankInPhysicalNode, dev, host,
             omp_get_num_procs_target(TARGET_MIC,dev),
             omp_get_max_threads_target(TARGET_MIC,dev));

  myDevice = dev;

  // Initialize the MIC device (note, this should only be called once per device)
  mic_init_device(CkMyPe(), CkMyNode(), myDevice);

  int dev_ok = mic_check(dev);
  if ( ! dev_ok ) {
    char errmsg[1024];
    sprintf(errmsg,"Failed binding to device %d on Pe %d", dev, CkMyPe());
    NAMD_die(errmsg);
  }
}

void mic_free() {
  if (devicePe != CkMyPe()) { return; }
  mic_free_device(myDevice);
}

static __thread ComputeNonbondedMIC* micCompute = 0;
static __thread ComputeMgr *computeMgr = 0;

void send_build_mic_force_table() {
  computeMgr->sendBuildMICForceTable();
}

void build_mic_force_table() {
  if (devicePe != CkMyPe()) { return; }
  ComputeNonbondedMIC::bind_lj_table(myDevice);
  ComputeNonbondedMIC::bind_force_table(myDevice);
  ComputeNonbondedMIC::bind_constants(myDevice);
}

void ComputeNonbondedMIC::bind_lj_table(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  const int lj_table_dim = ljTable->get_table_dim();
  const int lj_table_size = 2 * lj_table_dim * lj_table_dim * sizeof(LJTable::TableEntry);
  const LJTable::TableEntry *lj_table_base_ptr = ljTable->get_table();
  mic_bind_lj_table(deviceNum, (char*)lj_table_base_ptr, lj_table_dim, lj_table_size);
}

void ComputeNonbondedMIC::bind_force_table(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  mic_bind_table_four(deviceNum, mic_table_base_ptr, mic_table_n, mic_table_n_16);
}

void ComputeNonbondedMIC::bind_constants(int deviceNum) {
  if (devicePe != CkMyPe()) { return; }
  mic_bind_constants(deviceNum,
                     ComputeNonbondedUtil::cutoff2,
                     ComputeNonbondedUtil::dielectric_1,
                     ComputeNonbondedUtil::scaling,
                     ComputeNonbondedUtil::scale14,
                     ComputeNonbondedUtil::r2_delta,
                     ComputeNonbondedUtil::r2_delta_exp,
                     ComputeNonbondedUtil::commOnly
                    );
}

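// Comparison functor used below to sort the unique exclusion patterns by
// their range entry (list[1] = maxDiff), placing narrower patterns first.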
struct exlist_sortop {
  bool operator() (int32 *li, int32 *lj) {
    return ( li[1] < lj[1] );
  }
};

static __thread int2 *exclusionsByAtom = NULL;

void ComputeNonbondedMIC::bind_exclusions(int deviceNum) {

  // Only do this on PEs that directly control devices (simply return for PEs that do not)
  if (devicePe != CkMyPe()) { return; }

  // DMK - Info...
  //   Each atom has a globally unique index in the molecule object.  For any
  //   given atom, the exclusion list in the molecule object contains a list of
  //   the other atoms (referenced by index) that are "excluded" for the
  //   given atom.  These exclusion lists tend to follow a set of patterns
  //   (e.g. all water molecules "look" the same).  For this reason, we will store
  //   patterns rather than lists, and associate each atom with the appropriate
  //   pattern (e.g. 'all oxygen atoms within water molecules will exclude the
  //   hydrogen atoms attached to them in the global list of atoms,' rather than
  //   both '[water_oxygen_1 excludes water_hydrogen_1 and water_hydrogen_2] and
  //   [water_oxygen_2 excludes water_hydrogen_3 and water_hydrogen_4]'). The code
  //   below creates those patterns and associates atoms with the appropriate
  //   patterns.  By using these common patterns, the memory required is reduced.
  //   To do this, the patterns use distances between the indexes of two atoms
  //   that exclude one another, rather than absolute indexes of those same atoms.
  //   A pattern is an array of bits (2 bits per entry) that ranges from negative
  //   offsets to positive offsets.  For example, for a water molecule made up of
  //   one oxygen (O1 w/ index i) and two hydrogen (H1 with index i+1 and H2 with
  //   index i+2) atoms, the pattern for the oxygen atom (O1) would have 5 entries
  //   (10 bits total) and would range from offset -2 to 2 (always -X to +X).  The
  //   pattern for the first hydrogen (H1) would have 3 entries (6 bits total)
  //   and would range from offset -1 (O1) to 1 (H2).  The value of each 2-bit
  //   entry indicates the type of interaction (NORMAL, MODIFIED, or EXCLUDED).

  // Grab a pointer to the molecule object, which contains information about the
  //   particle system being simulated.
  Molecule *mol = Node::Object()->molecule;

  // Get the number of atoms
  #ifdef MEM_OPT_VERSION
    int natoms = mol->exclSigPoolSize;
  #else
    int natoms = mol->numAtoms;
  #endif

  // A data structure that will store the offsets and ranges of the patterns
  //   created by the code below (see the comments below) for each atom,
  //   indicating which pattern should be used for the atom and which other
  //   atoms need to be checked against the pattern for this atom.
  exclusionsByAtom = new int2[natoms];

  // Declare some tmp variables/buffers used in the code below.
  ObjectArena<int32> listArena;
  ResizeArray<int32*> unique_lists;
  int32 **listsByAtom = new int32*[natoms];
  SortableResizeArray<int32> curExclList;
  SortableResizeArray<int32> curModList;
  SortableResizeArray<int32> curList;

  // For each atom...
  for (int i = 0; i < natoms; i++) {

    // Clear the current lists
    // NOTE: The values in these lists are offsets between the atom indexes
    curExclList.resize(0);  // EXCLUDED (full) list
    curModList.resize(0);   // MODIFIED list
    curList.resize(0);      // Combined list

    // Always exclude self (index offset to self is zero)
    curExclList.add(0);

    // Get the exclusions for this atom from the molecule object, adding each
    //   one to the current lists, converting absolute indexes to offsets as
    //   required.
    // NOTE: NAMD uses a macro called MEM_OPT_VERSION to adjust some of its data
    //   structures to be more memory efficient.  Whether or not MEM_OPT_VERSION
    //   is set changes how the exclusion information for atoms is stored in the
    //   molecule object.
    int n = 0;
    int min_index = 0;  // Furthest negative index offset
    int max_index = 0;  // Furthest positive index offset
    #if MEM_OPT_VERSION
      const ExclusionSignature *sig = mol->exclSigPool + i;
      n = sig->fullExclCnt;
      for (int j = 0; j < n; j++) {
        int index = sig->fullOffset[j];  // NOTE: Already an index offset
        if (index < min_index) { min_index = index; }
        if (index > max_index) { max_index = index; }
        curExclList.add(index);
      }
      n = sig->modExclCnt;
      for (int j = 0; j < n; j++) {
        int index = sig->modOffset[j];  // NOTE: Already an index offset
        if (index < min_index) { min_index = index; }
        if (index > max_index) { max_index = index; }
        curModList.add(index);
      }
    #else
      const int32 *full_excl = mol->get_full_exclusions_for_atom(i);
      n = full_excl[0] + 1;
      for (int j = 1; j < n; j++) {
        int index = full_excl[j] - i;  // NOTE: An absolute index, so make offset
        if (index < min_index) { min_index = index; }
        if (index > max_index) { max_index = index; }
        curExclList.add(index);
      }
      const int32 *mod_excl = mol->get_mod_exclusions_for_atom(i);
      n = mod_excl[0] + 1;
      for (int j = 1; j < n; j++) {
        int index = mod_excl[j] - i;
        if (index < min_index) { min_index = index; }
        if (index > max_index) { max_index = index; }
        curModList.add(index);
      }
    #endif
    int maxDiff = -1 * min_index;
    if (maxDiff < max_index) { maxDiff = max_index; }
    // NOTE: maxDiff = max(abs(min_index), max_index);
    curExclList.sort(); curExclList.add(-1 * maxDiff - 1);
    curModList.sort(); curModList.add(-1 * maxDiff - 1);
    // NOTE : curExclList and curModList now contain the list of offsets for
    //   atoms that are full and modified excluded (respectively) for this atom
    //   (i) in sorted order, with a '-1 * maxDiff - 1' value at the end (this
    //   last value is added for the sake of the "matching code" in the loop below).

    // Create a single list with the combined exclusion info of the two separate lists
    n = 2 * maxDiff + 1;  // NOTE: pattern ranges from -maxDiff to +maxDiff, inclusive
    curList.resize(n);
    int curExclListIndex = 0;
    int curModListIndex = 0;

    // For each entry in the combined list...
    for (int j = -1 * maxDiff; j <= maxDiff; j++) {

      // Assume normal
      int bitPattern = 0x00;

      // NOTE: curExclList and curModList are in sorted order and we are moving
      //   through the combined list in-order, so we only need to check one
      //   index in the separate lists (moving ahead in the list when a match
      //   is found).

      // Check if is actually excluded...
      if (curExclList[curExclListIndex] == j) {
        bitPattern |= 0x01;
        curExclListIndex++;
      }

      // Check if is actually modified...
      if (curModList[curModListIndex] == j) {
        bitPattern |= 0x02;
        curModListIndex++;
      }

      // Store the generated pattern entry
      curList[j + maxDiff] = bitPattern;
    }

    // NOTE: curList now contains the bit patterns (isModified and isExcluded)
    //   flags for this atom (i) based on the offsets of the other atoms.

    // Search through the list of patterns that have already been created, checking
    //   to see if the current pattern already exists.  If so, just use the existing
    //   pattern.  If not, add this new, unique pattern.
    // For each existing pattern...
    int j = 0;
    for (j = 0; j < unique_lists.size(); j++) {

      // Check to see if the size matches (skip if it does not, obviously not a match)
      if (n != unique_lists[j][0]) { continue; }

      // Check each entry in the current pattern vs this existing pattern, breaking
      //   from the loop if a mismatch is found (i.e. causing the final value of k
      //   to be less than the pattern length).
      int k = 0;
      for (; k < n; k++) { // For each element in the list...
        if (unique_lists[j][k+3] != curList[k]) { break; } // ... check for mismatch
      }
      if (k == n) { break; } // If full match, stop searching (NOTE: j < unique_lists.size() will be true)
    }

    // If no match was found (i.e. j == length of existing pattern list) in the search
    //   loop above, then add the current pattern to the end of the list of patterns.
    if (j == unique_lists.size()) {

      // Create a new list with size n
      int32 *list = listArena.getNewArray(n + 3);
      list[0] = n;        // Entry [0] = length of pattern (not including the three header entries)
      list[1] = maxDiff;  // Entry [1] = offset range
      // NOTE: Entry [2] will be filled in later

      // Entries [3+] contain the bit pattern entries
      for (int k = 0; k < n; k++) { list[k + 3] = curList[k]; }

      // Add this pattern to the list of patterns
      unique_lists.add(list);  // NOTE: This adds the new list at unique_lists[j]
    }

    // Note the pattern for this atom (whether it was found or added to the list of patterns)
    listsByAtom[i] = unique_lists[j];

  } // end for (i < natoms)

  // Sort the list of patterns, placing the smaller patterns first
  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());

  // Now, we create a data structure that simply stores the patterns, using 2 bits
  //   per entry in each pattern.

  // Count the total bits required to store the lists (2 bits per entry in each
  //   pattern).  At the same time, note the offsets for the "self entry" for each
  //   pattern in the final list of bits for each pattern.  We want to use the
  //   "self entry" as the "start" so that we can easily create an offset into the
  //   pattern by subtracting the indexes of the two atoms and multiplying that
  //   by 2 bits during simulation.
  long int totalBits = 0;
  int nlists = unique_lists.size();
  // For each pattern in the list of patterns...
  for (int j = 0; j < nlists; j++) {
    int32 *list = unique_lists[j];  // Get the list
    int maxDiff = list[1];  // Get the range for the pattern
    list[2] = totalBits + (2 * maxDiff);  // Note the offset for the "self entry"/"start" for this pattern
    totalBits += 2 * (2 * maxDiff + 1); // Add the total bits needed for this pattern
  }

  // For each atom, record the "start" of the list (i.e. the "self" position)
  for (int i = 0; i < natoms; i++) {
    exclusionsByAtom[i].x = (listsByAtom[i])[1]; // maxDiff or range of pattern
    exclusionsByAtom[i].y = (listsByAtom[i])[2]; // "start" (self offset in bits)
  }

  // Cleanup listsByAtom (no longer required)
  delete [] listsByAtom; listsByAtom = NULL;

  // Round up the total number of bits to a multiple of sizeof(unsigned int)
  const long int uintBitCnt = sizeof(unsigned int) * 8;
  if (totalBits & (uintBitCnt - 1)) {
    totalBits += (uintBitCnt - (totalBits & (uintBitCnt - 1)));
  }
  long int totalBytes = totalBits / 8;
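  // For example (assuming 4-byte unsigned ints, so uintBitCnt == 32): if
  //   totalBits were 70, the low bits are 70 & 31 == 6, so 32 - 6 == 26
  //   padding bits are added, rounding totalBits up to 96 (totalBytes == 12).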

  // If this PE is on physical node 0, print some info...
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
             unique_lists.size(), totalBytes);
  }

  // Allocate the memory required (using 'unsigned int' as the array's data type)
  //   and initialize the memory to zero.
  long int totalUInts = totalBits / (sizeof(unsigned int) * 8);
  unsigned int *exclusion_bits = new unsigned int[totalUInts];
  memset(exclusion_bits, 0, totalBytes);  // Zero-out the data

  // Fill in the bits
  long int offset = 0;  // Offset of current list in exclusion_bits array
  // For each of the patterns...
  for (int i = 0; i < unique_lists.size(); i++) {

    // Get the pattern
    int32 *list = unique_lists[i];

    // Sanity check: Verify that the base value matches with the stored "start"
    //   values (i.e. the start of this list is where we expect it to be)
    if (offset + (2 * list[1]) != list[2]) {
      NAMD_bug("ComputeNonbondedMIC::bind_exclusions offset mismatch");
    }

    // Pack the bits from this list into the exclusion_bits array
    const int n = list[0];
    for (int j = 0; j < n; j++) {
      const int offset_j = offset + (2 * j);  // NOTE: Each entry is 2 bits
      const int offset_major = offset_j / (sizeof(unsigned int) * 8);
      const int offset_minor = offset_j % (sizeof(unsigned int) * 8); // NOTE: Reverse indexing direction relative to offset_major
      const int entry_mask = (list[j + 3] & 0x3) << offset_minor;
      exclusion_bits[offset_major] |= entry_mask;
    }

    // Move the offset forward
    offset += 2 * (2 * list[1] + 1);
  }

  // Now that the bits for all of the patterns have been placed into the
  //   exclusion_bits array, push this array down to the device for use
  //   during simulation.
  mic_bind_exclusions(deviceNum, exclusion_bits, totalUInts);

  // Cleanup
  // NOTE: The individual lists will be freed as the arena is destroyed,
  //   along with the resize arrays (curModList, etc.).
  delete [] exclusion_bits;
}
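
// To illustrate the layout built above, here is a minimal sketch (a
// hypothetical helper, not used anywhere in NAMD) of how one 2-bit entry
// could be decoded at simulation time.  It assumes 32-bit unsigned ints,
// matching the rounding code above.  'excl' is the exclusionsByAtom entry
// for atom i and 'indexDiff' is the index of atom j minus the index of
// atom i.  The return value mirrors the bitPattern written by the packing
// loop: 0x0 = NORMAL, 0x1 = EXCLUDED, 0x2 = MODIFIED.
static inline int example_decode_exclusion(const unsigned int *bits,
                                           const int2 &excl, int indexDiff) {
  if (indexDiff < -excl.x || indexDiff > excl.x) { return 0x0; }  // outside the pattern: NORMAL
  const long int bitOffset = excl.y + 2 * indexDiff;  // excl.y is the "self entry" bit offset
  return (bits[bitOffset / 32] >> (bitOffset % 32)) & 0x3;  // extract the 2-bit entry
}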


// Register a "self compute" on the host with this MIC meta-compute object, creating a
//   compute_record for the self compute.
void register_mic_compute_self(ComputeID c, PatchID pid, int part, int numParts) {

  if ( ! micCompute ) NAMD_bug("register_self called early");

  // DMK - DEBUG
  MICP("register_mic_compute_self(c:%d, pid:%d, part:%d, numParts:%d) - Called...\n", c, pid, part, numParts); MICPF;

  // Indicate that the mic compute requires the patch information
  //   associated with the given self compute
  micCompute->requirePatch(pid);

  SimParameters *params = Node::Object()->simParameters;

  numParts = mic_numParts_self_p1;
  if (numParts < 1) { numParts = 1; }
  for (int part = 0; part < numParts; part++) {

    // Create a compute record within the mic compute that represents
    //   the given self compute
    ComputeNonbondedMIC::compute_record cr;
    cr.c = c;
    cr.pid[0] = pid;  cr.pid[1] = pid;
    cr.offset = 0.;
    cr.isSelf = 1;

    cr.part = part;
    cr.numParts = numParts;

    if (singleKernelFlag != 0 || micCompute->patchRecords[pid].isLocal) {
      micCompute->localComputeRecords.add(cr);
    } else {
      micCompute->remoteComputeRecords.add(cr);
    }

  }
}

void register_mic_compute_pair(ComputeID c, PatchID pid[], int t[], int part, int numParts) {

  if ( ! micCompute ) NAMD_bug("register_pair called early");

  // DMK - DEBUG
  MICP("register_mic_compute_pair(c:%d, pid:{%d,%d}, t:--, part:%d, numParts:%d) - Called...\n", c, pid[0], pid[1], part, numParts); MICPF;

  // Indicate that the mic compute requires the patch information
  //   associated with the given pair compute
  micCompute->requirePatch(pid[0]);
  micCompute->requirePatch(pid[1]);

  // Calculate the offset that will need to be applied to the atom positions (which are
  //   stored as offsets from the center of their respective patches)
  int t1 = t[0];
  int t2 = t[1];
  Vector offset = micCompute->patchMap->center(pid[0])
                - micCompute->patchMap->center(pid[1]);
  offset.x += (t1%3-1) - (t2%3-1);
  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
  offset.z += (t1/9-1) - (t2/9-1);

  // Calculate the Manhattan distance between the patches and use that to determine how
  //   many parts the pair compute should be broken up into
  ComputeMap *computeMap = ComputeMap::Object();
  PatchMap *patchMap = PatchMap::Object();
  SimParameters *params = Node::Object()->simParameters;

  int aSize = patchMap->gridsize_a();
  int bSize = patchMap->gridsize_b();
  int cSize = patchMap->gridsize_c();
  int trans0 = computeMap->trans(c, 0);
  int trans1 = computeMap->trans(c, 1);
  int index_a0 = patchMap->index_a(pid[0]) + aSize * Lattice::offset_a(trans0);
  int index_b0 = patchMap->index_b(pid[0]) + bSize * Lattice::offset_b(trans0);
  int index_c0 = patchMap->index_c(pid[0]) + cSize * Lattice::offset_c(trans0);
  int index_a1 = patchMap->index_a(pid[1]) + aSize * Lattice::offset_a(trans1);
  int index_b1 = patchMap->index_b(pid[1]) + bSize * Lattice::offset_b(trans1);
  int index_c1 = patchMap->index_c(pid[1]) + cSize * Lattice::offset_c(trans1);
  int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
  int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
  int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
  int manDist = da + db + dc;

  // Create each part
  numParts = mic_numParts_pair_p1 - (mic_numParts_pair_p2 * manDist);
  if (numParts < 1) { numParts = 1; }
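  // For example (hypothetical flag values): with +micPP1 4 and +micPP2 1,
  //   a pair compute whose patches are adjacent (manDist == 1) is split into
  //   4 - 1*1 = 3 parts, while pairs at manDist >= 4 are clamped to a single
  //   part by the test above.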
  for (int part = 0; part < numParts; part++) {

    // Create a compute record within the mic compute that represents
    //   the given pair compute
    ComputeNonbondedMIC::compute_record cr;
    cr.c = c;
    cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
    cr.offset = offset;
    cr.isSelf = 0;

    cr.part = part;
    cr.numParts = numParts;

    // If splitting the kernels up into "local" and "remote" kernels, place the pair compute in
    //   the "remote" kernel if either of the patches is "remote"... otherwise, mark as "local"
    if ((singleKernelFlag != 0) ||
        (micCompute->patchRecords[pid[0]].isLocal && micCompute->patchRecords[pid[1]].isLocal)
       ) {
      micCompute->localComputeRecords.add(cr);
    } else {
      micCompute->remoteComputeRecords.add(cr);
    }

  }  // end for (part < numParts)
}


void unregister_mic_compute(ComputeID c) {  // static
  NAMD_bug("unregister_compute unimplemented");
}


// DMK - DEBUG - A function that periodically prints a "heartbeat" output
void mic_heartbeat(void *arg, double t) {
  #if MIC_DEBUG != 0
    MICP("[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); MICPF;
  #else
    printf("[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); fflush(NULL);
  #endif
  CcdCallFnAfter(mic_heartbeat, NULL, 1000.0);
}


ComputeNonbondedMIC::ComputeNonbondedMIC(ComputeID c,
                                         ComputeMgr *mgr,
                                         ComputeNonbondedMIC *m,
                                         int idx
                                        ) : Compute(c), slaveIndex(idx) {

  #ifdef PRINT_GBIS
    CkPrintf("C.N.MIC[%d]::constructor cid=%d\n", CkMyPe(), c);
  #endif

  // DMK - DEBUG
  #if MIC_HEARTBEAT != 0
    mic_heartbeat(NULL, CmiWallTimer());
  #endif

  // DMK - DEBUG
  MICP("ComputeNonbondedMIC::ComputeNonbondedMIC(cid:%d) - Called...\n", c); MICPF;

  master = m ? m : this;
  micCompute = this;
  computeMgr = mgr;
  patchMap = PatchMap::Object();
  atomMap = AtomMap::Object();
  reduction = 0;

  SimParameters *params = Node::Object()->simParameters;
  if (params->pressureProfileOn) {
    NAMD_die("pressure profile not supported in MIC");
  }
  if (mic_singleKernel >= 0) {
    singleKernelFlag = ((mic_singleKernel != 0) ? (1) : (0));
  } else {
    singleKernelFlag = ((params->mic_singleKernel != 0) ? (1) : (0));
  }

  atomsChanged = 1;
  computesChanged = 1;

  pairlistsValid = 0;
  pairlistTolerance = 0.;
  usePairlists = 0;
  savePairlists = 0;
  plcutoff2 = 0.;

  workStarted = 0;
  basePriority = PROXY_DATA_PRIORITY;
  localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;

  // Master and slaves
  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
    micDevice = -1;
    exclusionsByAtom_ptr = NULL;
    atomSubmitSignals = new std::set<void*>(); __ASSERT(atomSubmitSignals != NULL);
  #endif

  // Print some info for the user
  //   NOTE: Do this before the master/slave check below (so PE 0 will reach here)
  if (CkMyPe() == 0) {
    iout << iINFO << "MIC SINGLE OFFLOAD: " << ((singleKernelFlag != 0) ? ("SET") : ("UNSET")) << "\n" << endi;
    iout << iINFO << "MIC DEVICE THRESHOLD: " << mic_deviceThreshold << "\n" << endi;
    iout << iINFO << "MIC HOST SPLIT: " << mic_hostSplit << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS SELF P1: " << mic_numParts_self_p1 << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS PAIR P1: " << mic_numParts_pair_p1 << "\n" << endi;
    iout << iINFO << "MIC NUM PARTS PAIR P2: " << mic_numParts_pair_p2 << "\n" << endi;
  }

  if ( master != this ) { // I am slave
    masterPe = master->masterPe;
    master->slaves[slaveIndex] = this;
    if ( master->slavePes[slaveIndex] != CkMyPe() ) {
      NAMD_bug("ComputeNonbondedMIC slavePes[slaveIndex] != CkMyPe");
    }
    registerPatches();
    return;
  } else {  // I am a master, identify self to ComputeMgr for load balancing data
    computeMgr->sendMICPEData(CkMyPe(), 1);
    patchRecords.resize(patchMap->numPatches());
  }
  masterPe = CkMyPe();

  if (myDevice >= 0) {
    bind_exclusions(myDevice);
  }

  // Master only
  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
    micDevice = myDevice; __ASSERT(micDevice >= 0);
    exclusionsByAtom_ptr = exclusionsByAtom; __ASSERT(exclusionsByAtom_ptr != NULL);
  #endif

  // Initialize the exclusion contribution sum
  exclContrib = 0;

  // DMK - DEBUG
  timestep = 0;
}


ComputeNonbondedMIC::~ComputeNonbondedMIC() {
  #if MIC_DEBUG > 0
    debugClose();
  #endif
}

void ComputeNonbondedMIC::requirePatch(int pid) {

  computesChanged = 1;
  patch_record &pr = patchRecords[pid];
  if ( pr.refCount == 0 ) {
    //if ( CkNumNodes() < 2 ) {
    //  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^ patchMap->index_b(pid) ^ patchMap->index_c(pid) );
    //} else {
      pr.isLocal = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
    //}

    if ( singleKernelFlag != 0 || pr.isLocal ) {
      localActivePatches.add(pid);
    } else {
      remoteActivePatches.add(pid);
    }

    activePatches.add(pid);
    pr.patchID = pid;
    pr.hostPe = -1;
    pr.x = NULL;
    pr.xExt = NULL;
    pr.r = NULL;
    pr.f = NULL;
    pr.intRad      = NULL;
    pr.psiSum      = NULL;
    pr.bornRad     = NULL;
    pr.dEdaSum     = NULL;
    pr.dHdrPrefix  = NULL;
  }
  pr.refCount += 1;
}

void ComputeNonbondedMIC::registerPatches() {

  SimParameters *simParams = Node::Object()->simParameters;
  int npatches = master->activePatches.size();
  int *pids = master->activePatches.begin();
  patch_record *recs = master->patchRecords.begin();
  const int myPE = CkMyPe();

  for ( int i=0; i<npatches; ++i ) {
    int pid = pids[i];
    patch_record &pr = recs[pid];
    if ( pr.hostPe == myPE ) {

      hostedPatches.add(pid);

      if ( singleKernelFlag != 0 || pr.isLocal ) {
        localHostedPatches.add(pid);
      } else {
        remoteHostedPatches.add(pid);
      }

      ProxyMgr::Object()->createProxy(pid);
      pr.p = patchMap->patch(pid);
      pr.positionBox = pr.p->registerPositionPickup(this);

      pr.forceBox = pr.p->registerForceDeposit(this);
      if (simParams->GBISOn) {
        pr.intRadBox      = pr.p->registerIntRadPickup(this);
        pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
        pr.bornRadBox     = pr.p->registerBornRadPickup(this);
        pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
        pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
      }
    }
  }
  if ( master == this ) setNumPatches(activePatches.size());
  else setNumPatches(hostedPatches.size());

  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
  CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
}

void ComputeNonbondedMIC::assignPatches() {

  // DMK - DEBUG
  MICP("ComputeNonbondedMIC::assignPatches() - Called...\n"); MICPF;

  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
  int numPesOnNodeSharingDevice = 0;
  int masterIndex = -1;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    int pe = pesSharingDevice[i];
    if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
    if ( CkNodeOf(pe) == CkMyNode() ) {
      pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
    }
  }

  int npatches = activePatches.size();

  if ( npatches ) {
    reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
  }

  int *count = new int[npatches];
  memset(count, 0, sizeof(int)*npatches);
  int *pcount = new int[numPesOnNodeSharingDevice];
  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
  int *rankpcount = new int[CkMyNodeSize()];
  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
  char *table = new char[npatches*numPesOnNodeSharingDevice];
  memset(table, 0, npatches*numPesOnNodeSharingDevice);

  int unassignedpatches = npatches;

  if ( 0 ) { // assign all to device pe
    for ( int i=0; i<npatches; ++i ) {
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      pr.hostPe = CkMyPe();
    }
    unassignedpatches = 0;
    pcount[masterIndex] = npatches;
  } else

  // assign if home pe and build table of natural proxies
  for ( int i=0; i<npatches; ++i ) {
    int pid = activePatches[i];
    patch_record &pr = patchRecords[pid];
    int homePe = patchMap->node(pid);

    for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
      int pe = pesOnNodeSharingDevice[j];
      if ( pe == homePe ) {
        pr.hostPe = pe;  --unassignedpatches;
        pcount[j] += 1;
      }
      if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
        table[i*numPesOnNodeSharingDevice+j] = 1;
      }
    }
    if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
      pr.hostPe = homePe;  --unassignedpatches;
      rankpcount[CkRankOf(homePe)] += 1;
    }
  }

  // assign if only one pe has a required proxy
  int assignj = 0;
  for ( int i=0; i<npatches; ++i ) {
    int pid = activePatches[i];
    patch_record &pr = patchRecords[pid];
    if ( pr.hostPe != -1 ) continue;
    int c = 0;
    int lastj;
    for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
      if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
    }
    count[i] = c;
    if ( c == 1 ) {
      pr.hostPe = pesOnNodeSharingDevice[lastj];
      --unassignedpatches;
      pcount[lastj] += 1;
    }
  }
  while ( unassignedpatches ) {
    int i;
    for ( i=0; i<npatches; ++i ) {
      if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      if ( pr.hostPe != -1 ) continue;
      pr.hostPe = pesOnNodeSharingDevice[assignj];
      --unassignedpatches;
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
      break;
    }
    if ( i<npatches ) continue;  // start search again
    for ( i=0; i<npatches; ++i ) {
      int pid = activePatches[i];
      patch_record &pr = patchRecords[pid];
      if ( pr.hostPe != -1 ) continue;
      if ( count[i] ) continue;
      pr.hostPe = pesOnNodeSharingDevice[assignj];
      --unassignedpatches;
      pcount[assignj] += 1;
      if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
      break;
    }
    if ( i<npatches ) continue;  // start search again
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
  }

  for ( int i=0; i<npatches; ++i ) {
    int pid = activePatches[i];
    patch_record &pr = patchRecords[pid];
    // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
  }

  slavePes = new int[CkMyNodeSize()];
  slaves = new ComputeNonbondedMIC*[CkMyNodeSize()];
  numSlaves = 0;
  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
    int pe = pesOnNodeSharingDevice[j];
    int rank = pe - CkNodeFirst(CkMyNode());
    // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
    //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
    if ( pe == CkMyPe() ) continue;
    if ( ! pcount[j] && ! rankpcount[rank] ) continue;
    rankpcount[rank] = 0;  // skip in rank loop below
    slavePes[numSlaves] = pe;
    computeMgr->sendCreateNonbondedMICSlave(pe,numSlaves);
    ++numSlaves;
  }
  for ( int j=0; j<CkMyNodeSize(); ++j ) {
    int pe = CkNodeFirst(CkMyNode()) + j;
    // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
    //          CkMyPe(),j,pe,rankpcount[j]);
    if ( ! rankpcount[j] ) continue;
    if ( pe == CkMyPe() ) continue;
    slavePes[numSlaves] = pe;
    computeMgr->sendCreateNonbondedMICSlave(pe,numSlaves);
    ++numSlaves;
  }
  registerPatches();

  delete [] pesOnNodeSharingDevice;
  delete [] count;
  delete [] pcount;
  delete [] rankpcount;
  delete [] table;
}

static __thread int num_atom_records_allocated;
static __thread float *energy_gbis;

//GBIS host pointers
static __thread float *intRad0H;
static __thread float *intRadSH;
//static __thread GBReal *psiSumH; //moved into class
static __thread float *bornRadH;
//static __thread GBReal *dEdaSumH; //moved into class
static __thread float *dHdrPrefixH;

static __thread int mic_timer_count;
static __thread double mic_timer_total;
static __thread double kernel_time;
static __thread double remote_submit_time;
static __thread double local_submit_time;

// TRACING
#if MIC_TRACING != 0
  __thread double mic_tracing_offload_start_remote = 0.0;
  __thread double mic_tracing_offload_start_local = 0.0;
  __thread double mic_tracing_offload_end_remote = 0.0;
  __thread double mic_tracing_offload_end_local = 0.0;
  static __thread double mic_tracing_polling_start = 0.0;
  static __thread int mic_tracing_polling_count = 0;
  #define MIC_TRACING_POLLING_SET_FREQ  ( 100 )
#endif

#define MIC_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
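// NOTE: MIC_POLL re-queues a completion check (see mic_check_remote_progress
//   and mic_check_local_progress below) through Charm++'s conditional-callback
//   service; CcdCallFnAfter takes its delay in milliseconds, so each retry is
//   scheduled 0.1 ms later.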

#ifdef PRINT_GBIS
#define GBISP(...) CkPrintf(__VA_ARGS__);
#else
#define GBISP(...)
#endif


#define count_limit 5000000  // DMK - NOTE : I have this set fairly high so that I can test executions
                             //   that use a single thread on the MIC device, which can take quite some
                             //   time if using printfs to display intermediate debug values.
static __thread int check_remote_count;
static __thread int check_local_count;

void mic_check_remote_progress(void *arg, double) {

  // If the offloaded work is finished, trigger finishWork()
  if (mic_check_remote_kernel_complete(myDevice)) {

    ComputeNonbondedMIC* compute = (ComputeNonbondedMIC*)arg;
    #if MIC_SYNC_OUTPUT != 0
      #if MIC_TRACING != 0
        double xfer_start = CmiWallTimer();
      #endif
      mic_transfer_output(myDevice,
                          1,
                          compute->num_local_atom_records,
                          compute->doSlow
                         );
      #if MIC_TRACING != 0
        double xfer_end = CmiWallTimer();
        MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA, xfer_start, xfer_end);
      #endif
    #endif
    //#if MIC_TRACING != 0
    //  double now = CmiWallTimer();
    //  mic_tracing_offload_end_remote = now;
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_REMOTE, mic_tracing_offload_start_remote, now);
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //  mic_tracing_polling_count = 0;
    //#endif
    check_remote_count = 0;
    mic_errcheck("at mic remote stream completed");
    WorkDistrib::messageFinishMIC((ComputeNonbondedMIC *) arg);

    // DMK - DEBUG
    MICP("[DEBUG:%d] :: << detected remote kernel completion >>\n", CkMyPe()); MICPF;

  // Otherwise, check to see if it has been a long time (if so, timeout with error)
  } else if (++check_remote_count >= count_limit) {
    char errmsg[256];
    sprintf(errmsg,
            "mic_check_remote_progress polled %d times over %f s on step %d",
            check_remote_count, CkWallTimer() - remote_submit_time,
            ((ComputeNonbondedMIC*)arg)->step
           );
    //mic_errcheck(errmsg);
    NAMD_die(errmsg);

  // Otherwise, queue another poll attempt in the future to try again later
  } else {

    //#if MIC_TRACING != 0
    //  if (++mic_tracing_polling_count > MIC_TRACING_POLLING_SET_FREQ) {
    //    double now = CmiWallTimer();
    //    MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //    mic_tracing_polling_start = now;
    //    mic_tracing_polling_count = 0;
    //  }
    //#endif
    MIC_POLL(mic_check_remote_progress, arg);
  }
}

void mic_check_local_progress(void *arg, double) {

  // If the offloaded work is finished, trigger finishWork()
  if (mic_check_local_kernel_complete(myDevice)) {

    ComputeNonbondedMIC* compute = (ComputeNonbondedMIC*)arg;
    #if MIC_SYNC_OUTPUT != 0
      #if MIC_TRACING != 0
        double xfer_start = CmiWallTimer();
      #endif
      mic_transfer_output(myDevice,
                          0,
                          compute->num_local_atom_records,
                          compute->doSlow
                         );
      #if MIC_TRACING != 0
        double xfer_end = CmiWallTimer();
        MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA, xfer_start, xfer_end);
      #endif
    #endif
    //#if MIC_TRACING != 0
    //  double now = CmiWallTimer();
    //  mic_tracing_offload_end_local = now;
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_LOCAL, mic_tracing_offload_start_local, now);
    //  MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_POLLSET, mic_tracing_polling_start, now);
    //  mic_tracing_polling_count = 0;
    //#endif
    check_local_count = 0;
    mic_errcheck("at mic local stream completed");
    WorkDistrib::messageFinishMIC((ComputeNonbondedMIC *) arg);

    // DMK - DEBUG
    MICP("[DEBUG:%d] :: << detected local kernel completion >>\n", CkMyPe()); MICPF;

  // Otherwise, check to see if it has been a long time (if so, timeout with error)
  } else if (++check_local_count >= count_limit) {
    char errmsg[256];
    sprintf(errmsg,
            "mic_check_local_progress polled %d times over %f s on step %d",
            check_local_count, CkWallTimer() - local_submit_time,
            ((ComputeNonbondedMIC*)arg)->step
           );
    //mic_errcheck(errmsg);
    NAMD_die(errmsg);

  // Shouldn't be polling for local complete until remote complete was already detected (check for this case)
  } else if ( check_remote_count ) {
    NAMD_bug("nonzero check_remote_count in mic_check_local_progress");

  // Otherwise, queue another poll attempt in the future to try again later
  } else {
    MIC_POLL(mic_check_local_progress, arg);
  }
}

void ComputeNonbondedMIC::atomUpdate() { atomsChanged = 1; }

static __thread int kernel_launch_state = 0;

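// Comparison functor that orders compute_records by the squared length of
// their patch-pair offset vector in the current lattice, so that patch pairs
// whose interacting periodic images are closest together sort first.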
struct cr_sortop {
  const Lattice &l;
  cr_sortop(const Lattice &lattice) : l(lattice) { }
  bool operator() (ComputeNonbondedMIC::compute_record i,
                   ComputeNonbondedMIC::compute_record j) {
    Vector a = l.a();
    Vector b = l.b();
    Vector c = l.c();
    BigReal ri = (i.offset.x * a + i.offset.y * b + i.offset.z * c).length2();
    BigReal rj = (j.offset.x * a + j.offset.y * b + j.offset.z * c).length2();
    return ( ri < rj );
  }
};


#if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
void ComputeNonbondedMIC::patchReady(PatchID patchID, int doneMigration, int seq) {

  // This function overrides the Compute::patchReady function so that some additional
  //   work can be done.  Compute::patchReady is called at the end of the function.
  //   When submitting atoms as the atom data arrives on the node, this function along
  //   with mic_submit_patch_data takes care of pushing that data to the devices that
  //   require it.  However, this function is called by each of the host cores, so
  //   some care must be taken when doing this.  There are two items that need to be
  //   accomplished each timestep.  First, the atom data needs to be packed up for
  //   transfer to the devices (once per patch).  Second, for each device that requires
  //   the data, push that atom data to the device (once per patch per device).  A lock
  //   is used to protect flags that track what portions of this work have been done
  //   as each of the host threads is notified that the patches are ready.  The flags
  //   are mic_atomData_seq for once per patch work and mic_atomData_deviceSeq[] for
  //   once per patch per device work (they use changes in the seq value to trigger work).

  // NOTE: The slave-ready mechanism makes use of patchReady to signal the master.  The master
  //   and associated slaves are all on the same node (same host address space and same MIC), so
  //   the slaves can push the data to the MIC device directly (here).  However, when the slaves
  //   are finished, calls to patchReady with a patchID of -1 will be made (master waits for all
  //   patches, slaves only wait for their assigned patches and 'pass on' patchReady notifications
  //   to the master so the master gets them all).  As such, we need to check for a '-1' here to
  //   detect that this is just the slave notifying the master (if so, nothing extra to do here).
  if (patchID >= 0) {

    #if MIC_TRACING != 0
      double atomSubmit_start = CmiWallTimer();
    #endif

    // Get the patch information
    patch_record &pr = master->patchRecords[patchID];
    Patch *p = pr.p;
    CudaAtom *ca = p->getCudaAtomList();
    CompAtom *a = pr.positionBox->open(); pr.x = a;
    CompAtomExt *aExt = p->getCompAtomExtInfo();
    int2 *exclusionsByAtom = master->exclusionsByAtom_ptr;
    int numAtoms = p->getNumAtoms();
    int numAtoms_16 = ((numAtoms + 15) & (~15));
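    // NOTE: (numAtoms + 15) & ~15 rounds the atom count up to a multiple of
    //   16 (e.g. 37 -> 48); the padded count is used as the stride between
    //   the per-patch arrays packed into atomData below, presumably to keep
    //   them aligned for the MIC's vector units.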

    // Create references to the atomData pointers on the host for this patch (host copy of data on device)
    void* &atomData = p->mic_atomData;
    int &allocSize = p->mic_atomData_allocSize_host;
    // NOTE: Since we are using a mutex (per patch) to protect the following region,
    //   only the first thread through will reallocate the memory and copy the patch
    //   data in to the host's copy of the atomData buffer (seq number as check).
    //   The remaining threads will check the size and see that everything is fine,
    //   skipping this work.

    // Note the MIC device that this call wants the data on (the device that the master object drives)
    int micDevice = master->micDevice;
    __ASSERT(micDevice < MIC_MAX_DEVICES_PER_NODE);
    int transferFlag = 0;

01336     // If either the 'once per patch' or 'once per patch per device' work needs to be done, then grab
01337     //   the lock and dow the work
01338     if (p->mic_atomData_seq != seq || p->mic_atomData_deviceSeq[micDevice] != seq) {
01339       pthread_mutex_lock(&(p->mic_atomData_mutex));
01340 
01341       // Clear the flags early so other threads are more likely to skip past the checks (and the lock)
01342       //   for the work that will be done by this thread.  If another thread does pass the check, the lock
01343       //   and flag values still keep the work from being done multiple times.  This is just an optimization.
01344       int tmp_mic_atomData_seq = p->mic_atomData_seq;
01345       int tmp_mic_atomData_deviceSeq = p->mic_atomData_deviceSeq[micDevice];
01346       p->mic_atomData_seq = seq;
01347       p->mic_atomData_deviceSeq[micDevice] = seq;
01348 
01349       // Once per patch per timestep work
01350       if (tmp_mic_atomData_seq != seq) {  // Check again in case the first check passed while another thread was in the region
01351 
01352         // Allocate the memory as needed
01353         int allocFlag = 0;
01354         if (numAtoms_16 > allocSize || atomData == NULL) {
01355           if (atomData != NULL) { _MM_FREE_WRAPPER(atomData); }
01356           int toAllocSize = (int)(numAtoms_16 * 1.1f);
01357           toAllocSize = ((toAllocSize + 15) & (~15));
01358           atomData = (char*)(_MM_MALLOC_WRAPPER((sizeof(atom) + sizeof(atom_param)) * toAllocSize, MIC_ALIGN, "atomData"));
01359           __ASSERT(atomData != NULL);
01360           allocSize = toAllocSize;
01361           allocFlag = 1;
01362         }
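              // NOTE: over-allocating by ~10% (then rounding back up to a multiple of 16)
              //   leaves headroom so small increases in the atom count do not force a
              //   reallocation on later timesteps.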
01363 
01364         // Copy the data to the buffer that will be passed to the device(s)
01365         // NOTE: the number of atoms will only change when doneMigration is set
01366         // WARNING | NOTE : The following memcpy assumes the CudaAtom and atom data structures match !!!
01367         atom* dev_a = (atom*)atomData;
01368         atom_param* dev_aExt = (atom_param*)(dev_a + numAtoms_16);
01369         memcpy(dev_a, ca, sizeof(atom) * numAtoms);  // atoms always
01370         if (doneMigration || allocFlag) { // atom_params sometimes
01371           #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01372             for (int k = 0; k < numAtoms; k++) {
01373               int j = aExt[k].sortOrder;
01374               dev_aExt[k].vdw_type = a[j].vdwType;
01375               dev_aExt[k].index = aExt[j].id;
01376               #ifdef MEM_OPT_VERSION
01377                 dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01378                 dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01379               #else
01380                 dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01381                 dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01382               #endif
01383             }
01384           #else
01385             int *dev_aExt_vdwType     = ((int*)dev_aExt) + (0 * numAtoms_16);
01386             int *dev_aExt_index       = ((int*)dev_aExt) + (1 * numAtoms_16);
01387             int *dev_aExt_exclIndex   = ((int*)dev_aExt) + (2 * numAtoms_16);
01388             int *dev_aExt_exclMaxDiff = ((int*)dev_aExt) + (3 * numAtoms_16);
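                  // SoA layout assumed here: four int fields, each padded to numAtoms_16
                  //   entries and packed back-to-back:
                  //   [vdw_type | index | excl_index | excl_maxdiff], numAtoms_16 ints each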
01389             for (int k = 0; k < numAtoms; k++) {
01390               int j = aExt[k].sortOrder;
01391               dev_aExt_vdwType[k] = a[j].vdwType;
01392               dev_aExt_index[k] = aExt[j].id;
01393               #ifdef MEM_OPT_VERSION
01394                 dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
01395                 dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
01396               #else
01397                 dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].id].y;
01398                 dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].id].x;
01399               #endif
01400             }
01401           #endif
01402         }
01403       } // end if (mic_atomData_seq != seq)
01404 
01405       // Once per patch per timestep per device work
01406       // NOTE : Within the protected region, simply flag that the transfer needs to take place
01407       //   and move on.  This will allow transfers to multiple MIC cards to occur in parallel
01408     //   without the per-patch lock serializing them.
01409       if (tmp_mic_atomData_deviceSeq != seq) { transferFlag = 1; }
01410 
01411       pthread_mutex_unlock(&(p->mic_atomData_mutex));
01412     } // end if (mic_atomData_seq != seq || mic_atomData_deviceSeq[micDevice] != seq)
01413 
01414     // Transfer the data to the given device
01415     if (transferFlag != 0) {
01416       int allocBytes = allocSize * (sizeof(atom) + sizeof(atom_param));
01417       int transferBytes = numAtoms_16 * sizeof(atom);
01418       if (doneMigration) { transferBytes += numAtoms_16 * sizeof(atom_param); }
01419       void *signal = NULL;
01420 
01421       #if MIC_TRACING != 0
01422         double atomTransfer_start = CmiWallTimer();
01423       #endif
01424 
01425       mic_submit_patch_data(micDevice,
01426                             p->mic_atomData,
01427                             p->mic_atomData_prev[micDevice],
01428                             transferBytes,
01429                             allocBytes,
01430                             p->mic_atomData_allocSize_device[micDevice],
01431                             p->mic_atomData_devicePtr[micDevice],
01432                             signal
01433                            );
01434 
01435       #if MIC_TRACING != 0
01436         double atomTransfer_finish = CmiWallTimer();
01437         MIC_TRACING_RECORD(MIC_EVENT_ATOMS_TRANSFER, atomTransfer_start, atomTransfer_finish);
01438       #endif
01439 
01440       if (signal != NULL) { atomSubmitSignals->insert(signal); }
01441     }
01442 
01443     #if MIC_TRACING != 0
01444       double atomSubmit_finish = CmiWallTimer();
01445       MIC_TRACING_RECORD(MIC_EVENT_ATOMS_SUBMIT, atomSubmit_start, atomSubmit_finish);
01446     #endif
01447 
01448   } // end if (patchID >= 0)
01449 
01450   // Call parent class patchReady function to perform the usual work
01451   Compute::patchReady(patchID, doneMigration, seq);
01452 }
01453 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
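
      // The function above gets once-per-patch and once-per-patch-per-device semantics
      //   from many host threads via a double-checked, sequence-stamped flag pattern.
      //   A minimal sketch of just that pattern (hypothetical names, not the actual
      //   NAMD structures):
      #if 0
      void submitOnce(Patch *p, int micDevice, int seq) {
        // Cheap unlocked check first; the lock plus the saved previous values make
        //   the work safe even if two threads pass this check simultaneously.
        if (p->seq_flag != seq || p->devSeq_flag[micDevice] != seq) {
          pthread_mutex_lock(&p->flag_mutex);
          int prevSeq = p->seq_flag;
          int prevDevSeq = p->devSeq_flag[micDevice];
          p->seq_flag = seq;                // claim the work early so other threads
          p->devSeq_flag[micDevice] = seq;  //   are likely to skip the unlocked check
          if (prevSeq != seq) {
            // once-per-patch work: pack atom data into the host-side buffer
          }
          pthread_mutex_unlock(&p->flag_mutex);
          if (prevDevSeq != seq) {
            // once-per-patch-per-device work: issue the transfer, done outside the
            //   lock so pushes to different devices can proceed in parallel
          }
        }
      }
      #endif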
01454 
01455 void ComputeNonbondedMIC::skip() {
01456   //SimParameters *simParams = Node::Object()->simParameters;
01457   for ( int i=0; i<hostedPatches.size(); ++i ) {
01458     patch_record &pr = master->patchRecords[hostedPatches[i]];
01459     pr.positionBox->skip();
01460     pr.forceBox->skip();
01461     //if (simParams->GBISOn) {
01462     //  pr.intRadBox->skip();
01463     //  pr.psiSumBox->skip();
01464     //  pr.bornRadBox->skip();
01465     //  pr.dEdaSumBox->skip();
01466     //  pr.dHdrPrefixBox->skip();
01467     //}
01468   }
01469 }
01470 
01471 int ComputeNonbondedMIC::noWork() {
01472 
01473   SimParameters *simParams = Node::Object()->simParameters;
01474 
01475   __ASSERT(hostedPatches.size() > 0);
01476   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01477 
01478   lattice = flags.lattice;
01479   doSlow = flags.doFullElectrostatics;
01480   doEnergy = flags.doEnergy;
01481   step = flags.step;
01482 
01483   if ( ! flags.doNonbonded ) {
01484     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01485     if ( master != this ) {
01486       computeMgr->sendNonbondedMICSlaveReady(masterPe,
01487                         hostedPatches.size(),atomsChanged,sequence());
01488     } else {
01489       for ( int i = 0; i < numSlaves; ++i ) {
01490         computeMgr->sendNonbondedMICSlaveSkip(slaves[i],slavePes[i]);
01491       }
01492       skip();
01493     }
01494     if ( reduction ) reduction->submit();
01495     return 1;
01496   }
01497 
01498   // Wait for pending input buffers here
01499   // DMK - NOTE | TODO : For now this is blocking, but set up polling at some point.  It may be possible to
01500   //   have slaves start polling here with the callback sending the ready message.  For the master, perhaps
01501   //   the polling could be started at the beginning of doWork() with the callback doing the same thing as
01502   //   the offload completion callbacks (trigger another call to doWork() and check state variables to
01503   //   figure out what to do).
01504   #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01505   {
01506     #if MIC_TRACING != 0
01507       double atomSubmit_wait_start = CmiWallTimer();
01508     #endif
01509 
01510     int micDevice = master->micDevice;
01511 
01512     std::set<void*>::iterator it;
01513     for (it = atomSubmitSignals->begin(); it != atomSubmitSignals->end(); it++) {
01514       void *sig = (*it);
01515       #if 0  // Use blocking offload_wait pragma
01516         #pragma offload_wait target(mic:micDevice) wait(sig)
01517         { }
01518       #else  // Use busy wait
01519         while (!_Offload_signaled(master->micDevice, sig)) { }
01520       #endif
01521     }
01522     atomSubmitSignals->clear();  // Remove all pending signals
01523 
01524     #if MIC_TRACING != 0
01525       double atomSubmit_wait_finished = CmiWallTimer();
01526       MIC_TRACING_RECORD(MIC_EVENT_ATOMS_WAIT, atomSubmit_wait_start, atomSubmit_wait_finished);
01527     #endif
01528   }
01529   #endif
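
        // The busy-wait above pairs with asynchronous offloads that were issued with a
        //   signal tag.  A minimal sketch of the Intel offload signal/wait pairing this
        //   relies on (hypothetical buffer and device number):
        #if 0
        {
          static char buf[1024];
          int dev = 0;
          // start an asynchronous transfer, tagged by the buffer address
          #pragma offload_transfer target(mic:dev) in(buf : length(1024)) signal(buf)
          // ... overlap host work here ...
          #pragma offload_wait target(mic:dev) wait(buf)   // blocking wait on the tag,
          while (!_Offload_signaled(dev, buf)) { }         //   or poll it as done above
        }
        #endif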
01530 
01531   for ( int i=0; i<hostedPatches.size(); ++i ) {
01532     patch_record &pr = master->patchRecords[hostedPatches[i]];
01533     if (!simParams->GBISOn || gbisPhase == 1) {
01534       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01535       // DMK - NOTE : When patches are pushed to the device individually, open called in patchReady function
01536       #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
01537         pr.x = pr.positionBox->open();
01538       #endif
01539       pr.xExt = pr.p->getCompAtomExtInfo();
01540     }
01541 
01542     //if (simParams->GBISOn) {
01543     //  if (gbisPhase == 1) {
01544     //    GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01545     //    pr.intRad     = pr.intRadBox->open();
01546     //    pr.psiSum     = pr.psiSumBox->open();
01547     //  } else if (gbisPhase == 2) {
01548     //    GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01549     //    pr.bornRad    = pr.bornRadBox->open();
01550     //    pr.dEdaSum    = pr.dEdaSumBox->open();
01551     //  } else if (gbisPhase == 3) {
01552     //    GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01553     //    pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01554     //  }
01555     //  GBISP("opened GBIS boxes");
01556     //}
01557   }
01558 
01559   if ( master == this ) return 0; //work to do, enqueue as usual
01560 
01561   // message masterPe
01562   computeMgr->sendNonbondedMICSlaveReady(masterPe,
01563                                          hostedPatches.size(),
01564                                          atomsChanged,
01565                                          sequence()
01566                                         );
01567 
01568   workStarted = ((singleKernelFlag != 0) ? (2) : (1));
01569   basePriority = COMPUTE_PROXY_PRIORITY;
01570 
01571   return 1;
01572 }
01573 
01574 void ComputeNonbondedMIC::doWork() {
01575 
01576   GBISP("C.N.MIC[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
01577         CkMyPe(), sequence(), gbisPhase, workStarted);
01578 
01579   if ( workStarted ) { //if work already started, check if finished
01580     if ( finishWork() ) {  // finished
01581       workStarted = 0;
01582       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01583 
01584       // DMK - DEBUG
01585       timestep++;
01586 
01587     } else {  // need to call again
01588 
01589       workStarted = 2;
01590       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01591       if ( master == this && kernel_launch_state > 2 ) {
01592         //#if MIC_TRACING != 0
01593         //  mic_tracing_polling_start = CmiWallTimer();
01594         //  mic_tracing_polling_count = 0;
01595         //#endif
01596         mic_check_local_progress(this,0.);  // launches polling
01597       }
01598     }
01599     return;
01600   }
01601 
01602   //#if MIC_TRACING != 0
01603   //  double doWork_start = CmiWallTimer();
01604   //#endif
01605 
01606   workStarted = ((singleKernelFlag != 0) ? (2) : (1));
01607   basePriority = COMPUTE_PROXY_PRIORITY;
01608 
01609   Molecule *mol = Node::Object()->molecule;
01610   Parameters *params = Node::Object()->parameters;
01611   SimParameters *simParams = Node::Object()->simParameters;
01612 
01613   //execute only during GBIS phase 1, or if not using GBIS
01614   if (!simParams->GBISOn || gbisPhase == 1) {
01615 
01616     // bind new patches to device
01617     if ( atomsChanged || computesChanged ) {
01618 
01619       int npatches = activePatches.size();
01620 
01621       pairlistsValid = 0;
01622       pairlistTolerance = 0.;
01623 
01624       if ( computesChanged ) {
01625 
01626         computesChanged = 0;
01627 
01628         // Merge the local and remote active patch lists into a single list
01629         num_local_patch_records = localActivePatches.size();
01630         num_remote_patch_records = remoteActivePatches.size();
01631         npatches = num_local_patch_records + num_remote_patch_records;
01632         activePatches.resize(npatches);
01633         int *ap = activePatches.begin();
01634         for ( int i=0; i<num_local_patch_records; ++i ) {
01635           *(ap++) = localActivePatches[i];
01636         }
01637         for ( int i=0; i<num_remote_patch_records; ++i ) {
01638           *(ap++) = remoteActivePatches[i];
01639         }
01640 
01641         // sort computes by distance between patches
01642         #if MIC_SORT_COMPUTES != 0
01643           cr_sortop so(lattice);
01644           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01645           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01646         #endif
01647 
01648         // Merge the sorted lists of local and remote computes into a single list of computes 
01649         num_local_compute_records = localComputeRecords.size();
01650         num_remote_compute_records = remoteComputeRecords.size();
01651         computeRecords.resize(num_local_compute_records + num_remote_compute_records);
01652         compute_record *cr = computeRecords.begin();
01653         for ( int i=0; i<num_local_compute_records; ++i ) {
01654           *(cr++) = localComputeRecords[i];
01655         }
01656         for ( int i=0; i<num_remote_compute_records; ++i ) {
01657           *(cr++) = remoteComputeRecords[i];
01658         }
01659 
01660         // Set each patch record's localIndex and initialize force_list_size for each force_list
01661         force_lists.resize(npatches);
01662         for ( int i=0; i<npatches; ++i ) {
01663           patchRecords[activePatches[i]].localIndex = i;
01664           force_lists[i].force_list_size = 0;
01665         }
01666 
01667         // Create the patch_pairs, one for each compute object...
01668         int ncomputes = computeRecords.size();
01669         patch_pairs.resize(ncomputes);
01670         for ( int i=0; i<ncomputes; ++i ) {
01671 
01672           ComputeNonbondedMIC::compute_record &cr = computeRecords[i];
01673           int lp1 = patchRecords[cr.pid[0]].localIndex;
01674           int lp2 = patchRecords[cr.pid[1]].localIndex;
01675 
01676           // Count the number of writers/force-contributors
01677           force_lists[lp1].force_list_size++;
01678           if (!cr.isSelf) {
01679             force_lists[lp2].force_list_size++;
01680           }
01681 
01682           // Initialize the offset
01683           patch_pair &pp = patch_pairs[i];
01684           pp.offset.x = cr.offset.x;
01685           pp.offset.y = cr.offset.y;
01686           pp.offset.z = cr.offset.z;
01687 
01688           // Fill in the patch object's atomData (device) pointer
01689           #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01690             pp.patch1_atomDataPtr = patchRecords[cr.pid[0]].p->mic_atomData_devicePtr[myDevice];
01691             pp.patch2_atomDataPtr = patchRecords[cr.pid[1]].p->mic_atomData_devicePtr[myDevice];
01692           #endif
01693         }
01694 
01695         // Set the force list information for each patch pair
01696         for ( int i=0; i<ncomputes; ++i ) {
01697 
01698           ComputeNonbondedMIC::compute_record &cr = computeRecords[i];
01699           patch_pair &pp = patch_pairs[i];
01700 
01701           int lp1 = patchRecords[cr.pid[0]].localIndex;
01702           pp.patch1_force_list_index = lp1;
01703           pp.patch1_force_list_size = force_lists[lp1].force_list_size;
01704 
01705           if (cr.isSelf) {
01706             pp.patch2_force_list_index = pp.patch1_force_list_index;
01707             pp.patch2_force_list_size = pp.patch1_force_list_size;
01708           } else {
01709             int lp2 = patchRecords[cr.pid[1]].localIndex;
01710             pp.patch2_force_list_index = lp2;
01711             pp.patch2_force_list_size = force_lists[lp2].force_list_size;
01712           }
01713         }
01714 
01715         if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
01716           CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01717                    CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01718                    localComputeRecords.size(), remoteComputeRecords.size()
01719                   );
01720 
01721       }  // computesChanged
01722 
01723       // Count the number of atoms (and non-fixed atoms), recording the accumulated
01724       //   values into the patch record and force list structures as we go.
01725       int istart = 0;
01726       int flstart = 0;
01727       int vlstart = 0;
01728       int max_atoms_per_patch = 0;
01729       int i;
01730       for (i = 0; i < npatches; i++) {
01731 
01732         // If we have walked off the end of the local list, then take
01733   //   note of the number of atoms so far (the local atom count).
01734         if ( i == localActivePatches.size() ) {
01735           num_local_atom_records = istart;
01736         }
01737 
01738         // Record the current offsets as the
01739         //   beginning of this force list.
01740         force_lists[i].force_list_start = flstart;
01741         force_lists[i].force_output_start = istart;
01742         force_lists[i].atom_start = istart;
01743 
01744         // Record the current offset as the patch record's starting offset.
01745         patch_record &pr = patchRecords[activePatches[i]];
01746         pr.localStart = istart;
01747 
01748         // Count the number of atoms (and non-fixed atoms), recording the actual
01749         //   counts to the patch record and then rounding up for alignment
01750         int natoms = pr.p->getNumAtoms();
01751         int nfreeatoms = natoms;  // DMK - TODO | FIXME : treat all atoms as free for now (tracking fixed atoms would save some work when not all are free, e.g. smaller pairlists during pairlist generation)
01752         if ( fixedAtomsOn ) {
01753           const CompAtomExt *aExt = pr.xExt;
01754           for ( int j=0; j<natoms; ++j ) {
01755             if ( aExt[j].atomFixed ) --nfreeatoms;
01756           }
01757         }
01758         if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
01759         pr.numAtoms = natoms;
01760         pr.numFreeAtoms = nfreeatoms;
01761         force_lists[i].patch_size = natoms; // DMK - nfreeatoms;
01762         natoms = (natoms + 15) & (~15);
01763         nfreeatoms = (nfreeatoms + 15) & (~15);
01764         force_lists[i].patch_stride = natoms; // DMK - nfreeatoms;
01765 
01766         // DMK - CHECK - Rollover check
01767         int _flstart = flstart;
01768         int _vlstart = vlstart;
01769 
01770         // Update the offsets by the atom counts for this patch record.
01771         flstart += natoms * force_lists[i].force_list_size; // DMK - nfreeatoms * force_lists[i].force_list_size;
01772         vlstart += 16 * force_lists[i].force_list_size;
01773         istart += natoms;  // already rounded up
01774 
01775         // DMK - CHECK - Rollover check
01776         __ASSERT(_flstart <= flstart && _vlstart <= vlstart);
01777 
01778         force_lists[i].force_list_size = 0;  // rebuild below
01779       }
01780       // Handle the case where all patches are local when recording the local atom count.
01781       if ( i == localActivePatches.size() ) {
01782         num_local_atom_records = istart;
01783       }
01784 
01785       // Record final offsets (i.e. lengths/counts).
01786       num_force_records = flstart;
01787       num_atom_records = istart;
01788       num_remote_atom_records = num_atom_records - num_local_atom_records;
01789 
01790       // Allocate the memory for the atom and force information
01791       if ( num_atom_records > num_atom_records_allocated ) {
01792         if ( num_atom_records_allocated ) {
01793           _MM_FREE_WRAPPER(atom_params); //delete [] atom_params;
01794           _MM_FREE_WRAPPER(atoms); //delete [] atoms;
01795           _MM_FREE_WRAPPER(forces); //delete [] forces;
01796           _MM_FREE_WRAPPER(slow_forces); //delete [] slow_forces;
01797           //if (simParams->GBISOn) {
01798           //  delete [] intRad0H;//6 GBIS arrays
01799           //  delete [] intRadSH;
01800           //  delete [] psiSumH;
01801           //  delete [] bornRadH;
01802           //  delete [] dEdaSumH;
01803           //  delete [] dHdrPrefixH;
01804           //}
01805         }
01806         num_atom_records_allocated = 1.1 * num_atom_records + 1;
01807         atom_params = (atom_param*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(atom_param), 64, "atom_params"); //new atom_param[num_atom_records_allocated];
01808         atoms = (atom*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(atom), 64, "atoms"); //new atom[num_atom_records_allocated];
01809         if (atom_params == NULL || atoms == NULL) { NAMD_die("Unable to allocate atoms in ComputeNonbondedMIC::doWork"); }
01810         forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(double4), 64, "forces"); //new double4[num_atom_records_allocated];
01811         slow_forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated * sizeof(double4), 64, "slow_forces"); //new double4[num_atom_records_allocated];
01812         //allocate GBIS memory
01813         //if (simParams->GBISOn) {
01814         //  intRad0H = new float[num_atom_records_allocated];
01815         //  intRadSH = new float[num_atom_records_allocated];
01816         //  psiSumH = new GBReal[num_atom_records_allocated];
01817         //  bornRadH = new float[num_atom_records_allocated];
01818         //  dEdaSumH = new GBReal[num_atom_records_allocated];
01819         //  dHdrPrefixH = new float[num_atom_records_allocated];
01820         //}
01821         if (forces == NULL || slow_forces == NULL || (simParams->GBISOn &&
01822             (intRad0H == NULL || intRadSH == NULL || psiSumH == NULL ||
01823              bornRadH == NULL || dEdaSumH == NULL || dHdrPrefixH == NULL))) {
01824           NAMD_die("Unable to allocate forces in ComputeNonbondedMIC::doWork");
01825         }
01826       }
01827 
01828       // DMK - NOTE - Continue filling in the patch pair records
01829       int bfstart = 0;
01830       int ncomputes = computeRecords.size();
01831       for ( int i=0; i<ncomputes; ++i ) {
01832 
01833         ComputeNonbondedMIC::compute_record &cr = computeRecords[i];
01834 
01835         int p1 = cr.pid[0];
01836         int p2 = cr.pid[1];
01837         int lp1 = patchRecords[p1].localIndex;
01838         int lp2 = patchRecords[p2].localIndex;
01839         patch_pair &pp = patch_pairs[i];
01840         pp.patch1_atom_start = patchRecords[p1].localStart;
01841         pp.patch1_force_start = force_lists[lp1].force_list_start
01842                               + (force_lists[lp1].patch_stride * force_lists[lp1].force_list_size);
01843         force_lists[lp1].force_list_size++;
01844         pp.patch1_size = patchRecords[p1].numAtoms;
01845         pp.patch1_force_size = patchRecords[p1].numAtoms; 
01846 
01847         if (cr.isSelf) {
01848           pp.patch2_atom_start = pp.patch1_atom_start;
01849           pp.patch2_force_start = pp.patch1_force_start;
01850           pp.patch2_size = pp.patch1_size;
01851           pp.patch2_force_size = pp.patch1_force_size;
01852         } else {
01853           pp.patch2_atom_start = patchRecords[p2].localStart;
01854           pp.patch2_force_start = force_lists[lp2].force_list_start
01855                                 + (force_lists[lp2].patch_stride * force_lists[lp2].force_list_size);
01856           force_lists[lp2].force_list_size++;
01857           pp.patch2_size = patchRecords[p2].numAtoms;
01858           pp.patch2_force_size = patchRecords[p2].numAtoms;
01859         }
01860 
01861         // Get a custom pairlist cutoff for each patch pair
01862         pp.plcutoff = ComputeNonbondedUtil::cutoff +
01863           patchRecords[p1].p->flags.pairlistTolerance +
01864           patchRecords[p2].p->flags.pairlistTolerance;
01865 
01866         // Record the parts
01867         pp.numParts = cr.numParts;
01868         pp.part = cr.part;
01869 
01870         // Record the patch centers
01871         Vector p1_center = micCompute->patchMap->center(p1);
01872         pp.patch1_center_x = p1_center.x;
01873         pp.patch1_center_y = p1_center.y;
01874         pp.patch1_center_z = p1_center.z;
01875         Vector p2_center = micCompute->patchMap->center(p2);
01876         pp.patch2_center_x = p2_center.x;
01877         pp.patch2_center_y = p2_center.y;
01878         pp.patch2_center_z = p2_center.z;
01879 
01880         // DMK - DEBUG
01881         pp.p1 = p1;
01882         pp.p2 = p2;
01883         pp.cid = cr.c;
01884 
01885         pp.block_flags_start = bfstart;
01886         bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
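              //   (ceil(patch1_force_size / 128) blocks via ((n + 127) >> 7), with
              //   << 5 reserving 32 block-flag entries per block)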
01887 
01888       } // for ncomputes
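
            // NOTE: force_list_size was counted once above (to size each list), zeroed,
            //   and re-counted in this loop so each compute claims a distinct slot at
            //   force_list_start + patch_stride * k, for k = 0 .. size-1.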
01889 
01890       #if 0
01891         CkPrintf("Pe %d mic_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
01892           patch_pairs.size(), force_lists.size(),
01893           num_atom_records, num_force_records,
01894           max_atoms_per_patch);
01895       #endif
01896 
01897       int totalmem = patch_pairs.size() * sizeof(patch_pair) +
01898                   force_lists.size() * sizeof(force_list) +
01899                   num_force_records * sizeof(double4) +
01900                   num_atom_records * sizeof(atom) +
01901                   num_atom_records * sizeof(atom_param) +
01902                   num_atom_records * sizeof(double4);
01903       int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(double4) );
01904       /*
01905       CkPrintf("Pe %d allocating %d MB of MIC memory, will copy %d kB per step\n",
01906                         CkMyPe(), totalmem >> 20, totalcopy >> 10);
01907       */
01908 
01909       // Push the data structures just created, such as patch pairs, to the device.
01910       mic_bind_patch_pairs_only(myDevice, patch_pairs.begin(), patch_pairs.size(), patch_pairs.bufSize());
01911       mic_bind_force_lists_only(myDevice, force_lists.begin(), force_lists.size(), force_lists.bufSize());
01912       mic_bind_atoms_only(myDevice, atoms, atom_params, forces, slow_forces, num_atom_records, num_atom_records_allocated);
01913       mic_bind_force_buffers_only(myDevice, (size_t)num_force_records);
01914 
01915     } // atomsChanged || computesChanged
01916 
01917     double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01918 
01919     __ASSERT(hostedPatches.size() > 0);
01920     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01921     float maxAtomMovement = 0.;
01922     float maxPatchTolerance = 0.;
01923 
01924     for ( int i=0; i<activePatches.size(); ++i ) {
01925 
01926       patch_record &pr = patchRecords[activePatches[i]];
01927 
01928       float maxMove = pr.p->flags.maxAtomMovement;
01929       if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01930 
01931       float maxTol = pr.p->flags.pairlistTolerance;
01932       if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01933 
01934       int start = pr.localStart;
01935       int n = pr.numAtoms;
01936       int n_16 = (n + 15) & (~15);
01937       const CompAtom *a = pr.x;
01938       const CompAtomExt *aExt = pr.xExt;
01939 
01940       #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
01941 
01942       if ( atomsChanged ) {
01943 
01944         #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01945           atom_param *ap = atom_params + start;
01946           for (int k = 0; k < n; k++) {
01947             int j = aExt[k].sortOrder;
01948             ap[k].vdw_type = a[j].vdwType;
01949             ap[k].index = aExt[j].id;
01950             #ifdef MEM_OPT_VERSION
01951               ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01952               ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01953             #else
01954               ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01955               ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01956             #endif
01957           }
01958         #else
01959           atom_param *ap = atom_params + start;
01960           int *ap_vdwType = ((int*)ap) + (0 * n_16);
01961           int *ap_index = ((int*)ap) + (1 * n_16);
01962           int *ap_exclIndex = ((int*)ap) + (2 * n_16);
01963           int *ap_exclMaxDiff = ((int*)ap) + (3 * n_16);
01964           for ( int k=0; k<n; ++k ) {
01965             int j = aExt[k].sortOrder;
01966             ap_vdwType[k] = a[j].vdwType;
01967             ap_index[k] = aExt[j].id;
01968             #ifdef MEM_OPT_VERSION
01969               ap_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
01970               ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
01971             #else
01972               ap_exclIndex[k] = exclusionsByAtom[aExt[j].id].y;
01973               ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].id].x;
01974             #endif
01975           }
01976         #endif
01977 
01978       } // end if (atomsChanged)
01979 
01980       { // start block
01981         const CudaAtom *ac = pr.p->getCudaAtomList();
01982         atom *ap = atoms + start;
01983         #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01984           memcpy(ap, ac, sizeof(atom)*n);
01985         #else
01986           memcpy(ap, ac, sizeof(atom)*n_16);
01987         #endif
01988 
01989       } // end block
01990 
01991       #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
01992 
01993     } // end for (i < activePatches.size())
01994 
01995     //GBISP("finished active patches\n")
01996 
01997     //CkPrintf("maxMovement = %f  maxTolerance = %f  save = %d  use = %d\n",
01998     //  maxAtomMovement, maxPatchTolerance,
01999     //  flags.savePairlists, flags.usePairlists);
02000 
02001     savePairlists = 0;
02002     usePairlists = 0;
02003     if ( flags.savePairlists ) {
02004       savePairlists = 1;
02005       usePairlists = 1;
02006     } else if ( flags.usePairlists ) {
02007       if ( ! pairlistsValid ||
02008            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
02009         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
02010       } else {
02011         usePairlists = 1;
02012       }
02013     }
02014     if ( ! usePairlists ) {
02015       pairlistsValid = 0;
02016     }
02017     float plcutoff = cutoff;
02018     if ( savePairlists ) {
02019       pairlistsValid = 1;
02020       pairlistTolerance = 2. * maxPatchTolerance;
02021       plcutoff += pairlistTolerance;
02022     }
02023     plcutoff2 = plcutoff * plcutoff;
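
          // e.g. with cutoff = 12 A and maxPatchTolerance = 1.5 A, a pairlist-saving
          //   step sets pairlistTolerance = 3 A and builds lists with plcutoff = 15 A;
          //   the lists then remain usable until 2 * maxAtomMovement exceeds 3 A.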
02024 
02025     //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
02026     //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
02027 
02028   } // !GBISOn || gbisPhase == 1
02029 
02031   //if (simParams->GBISOn) {
02032   //  //open GBIS boxes depending on phase
02033   //  for ( int i=0; i<activePatches.size(); ++i ) {
02034   //    patch_record &pr = master->patchRecords[activePatches[i]];
02035   //    GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
02036   //    if (gbisPhase == 1) {
02037   //      //Copy GBIS intRadius to Host
02038   //      if (atomsChanged) {
02039   //        float *intRad0 = intRad0H + pr.localStart;
02040   //        float *intRadS = intRadSH + pr.localStart;
02041   //        for ( int k=0; k<pr.numAtoms; ++k ) {
02042   //          int j = pr.xExt[k].sortOrder;
02043   //          intRad0[k] = pr.intRad[2*j+0];
02044   //          intRadS[k] = pr.intRad[2*j+1];
02045   //        }
02046   //      }
02047   //    } else if (gbisPhase == 2) {
02048   //      float *bornRad = bornRadH + pr.localStart;
02049   //      for ( int k=0; k<pr.numAtoms; ++k ) {
02050   //        int j = pr.xExt[k].sortOrder;
02051   //        bornRad[k] = pr.bornRad[j];
02052   //      }
02053   //    } else if (gbisPhase == 3) {
02054   //      float *dHdrPrefix = dHdrPrefixH + pr.localStart;
02055   //      for ( int k=0; k<pr.numAtoms; ++k ) {
02056   //        int j = pr.xExt[k].sortOrder;
02057   //        dHdrPrefix[k] = pr.dHdrPrefix[j];
02058   //      }
02059   //    } // end phases
02060   //  } // end for patches
02061   //} // if GBISOn
02062 
02063   //#if MIC_TRACING != 0
02064   //  MIC_TRACING_RECORD(MIC_EVENT_FUNC_DOWORK, doWork_start, CmiWallTimer());
02065   //#endif
02066 
02067   kernel_time = CkWallTimer();
02068   kernel_launch_state = ((singleKernelFlag != 0) ? (2) : (1));
02069   if ( mic_is_mine ) recvYieldDevice(-1);
02070 }
02071 
02072 void mic_check_remote_calc(void *arg, double) {
02073   if (mic_check_remote_kernel_complete(myDevice)) {
02074     computeMgr->sendYieldDevice(next_pe_sharing_mic);
02075   } else {
02076     MIC_POLL(mic_check_remote_calc, arg);
02077   }
02078 }
02079 
02080 void mic_check_local_calc(void *arg, double) {
02081   if (mic_check_local_kernel_complete(myDevice)) {
02082     computeMgr->sendYieldDevice(next_pe_sharing_mic);
02083   } else {
02084     MIC_POLL(mic_check_local_calc, arg);
02085   }
02086 }
02087 
02088 void ComputeNonbondedMIC::recvYieldDevice(int pe) {
02089 
02090   GBISP("C.N.MIC[%d]::recvYieldDevice: seq %d, workStarted %d, \
02091         gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
02092         workStarted, gbisPhase, kernel_launch_state, pe)
02093 
02094   mic_position3_t lata, latb, latc;
02095   lata.x = lattice.a().x;
02096   lata.y = lattice.a().y;
02097   lata.z = lattice.a().z;
02098   latb.x = lattice.b().x;
02099   latb.y = lattice.b().y;
02100   latb.z = lattice.b().z;
02101   latc.x = lattice.c().x;
02102   latc.y = lattice.c().y;
02103   latc.z = lattice.c().z;
02104   SimParameters *simParams = Node::Object()->simParameters;
02105 
02106   switch ( kernel_launch_state ) {
02107 
02109     // Remote
02110     case 1:
02111 
02112       GBISP("C.N.MIC[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
02113       ++kernel_launch_state;
02114       mic_is_mine = 0;
02115       remote_submit_time = CkWallTimer();
02116 
02117       if (!simParams->GBISOn || gbisPhase == 1) {
02118 
02119         // DMK - NOTE : GBIS is not supported in this port yet, so cause a runtime error
02120         //   if it has been enabled and execution has made it this far.
02121         if (simParams->GBISOn) {
02122           NAMD_die("Unsupported feature (DMK33949330)");
02123         }
02124 
02125         //#if MIC_TRACING != 0
02126         //  mic_tracing_offload_start_remote = CmiWallTimer();
02127         //#endif
02128 
02129         // Issue the remote kernel
02130         mic_nonbonded_forces(myDevice, 1,
02131                              num_local_atom_records,
02132                              num_local_compute_records,
02133                              num_local_patch_records,
02134                              lata, latb, latc,
02135                              doSlow, doEnergy,
02136                              usePairlists, savePairlists,
02137                              atomsChanged
02138                             );
02139         #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
02140           mic_first_kernel_submit_time = CmiWallTimer();
02141         #endif
02142 
02143         // Start the polling check for the completion of the remote kernel
02144         //#if MIC_TRACING != 0
02145         //  mic_tracing_polling_start = CmiWallTimer();
02146         //  mic_tracing_polling_count = 0;
02147         //#endif
02148         MIC_POLL(mic_check_remote_progress, this);
02149       }
02150 
02151       // NOTE : Fall through to the next case (i.e. the missing break is intentional)
02152 
02154     // Local
02155     case 2:
02156 
02157       GBISP("C.N.MIC[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
02158       ++kernel_launch_state;
02159       mic_is_mine = 0;
02160 
02161       if (!simParams->GBISOn || gbisPhase == 1) {
02162 
02163         // DMK - NOTE : GBIS is not supported in this port yet, so cause a runtime error
02164         //   if it has been enabled and execution has made it this far.
02165         if (simParams->GBISOn) {
02166           NAMD_die("Unsupported feature (DMK83620583)");
02167         }
02168 
02169         // DMK - TIMING - NOTE : Only local being submitted for now, so
02170         //   take the local time now
02171         local_submit_time = CkWallTimer();
02172 
02173         //#if MIC_TRACING != 0
02174         //  mic_tracing_offload_start_local = CmiWallTimer();
02175         //#endif
02176 
02177         // Issue the local kernel
02178         mic_nonbonded_forces(myDevice, 0,
02179                              num_local_atom_records,
02180                              num_local_compute_records,
02181                              num_local_patch_records,
02182                              lata, latb, latc,
02183                              doSlow, doEnergy,
02184                              usePairlists, savePairlists,
02185                              atomsChanged
02186                             );
02187 
02188         #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
02189           if (singleKernelFlag) { mic_first_kernel_submit_time = CmiWallTimer(); }
02190         #endif
02191 
02192         if ( workStarted == 2 ) {
02193           // Start the polling check for the completion of the local kernel
02194           //#if MIC_TRACING != 0
02195           //  mic_tracing_polling_start = CmiWallTimer();
02196           //  mic_tracing_polling_count = 0;
02197           //#endif
02198           MIC_POLL(mic_check_local_progress, this);
02199         }
02200 
02201       } // end if (!simParams->GBISOn || gbisPhase == 1)
02202 
02203     default:
02204 
02205       GBISP("C.N.MIC[%d]::recvYieldDevice: case default\n", CkMyPe())
02206       mic_is_mine = 1;
02207       break;
02208 
02209   } // switch
02210 
02211   GBISP("C.N.MIC[%d]::recvYieldDevice: DONE\n", CkMyPe())
02212 }
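
      // NOTE: kernel_launch_state drives the switch above.  doWork() seeds it with 1
      //   (separate remote and local kernels) or 2 (single kernel).  Case 1 launches the
      //   remote kernel and falls through to case 2, which launches the local kernel and
      //   falls through to the default case.  Each case increments the state, so a later
      //   call in the same step lands directly in the default case, which reclaims the
      //   device (mic_is_mine = 1).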
02213 
02214 
02215 int ComputeNonbondedMIC::finishWork() {
02216 
02217   if ( master == this ) {
02218     for ( int i = 0; i < numSlaves; ++i ) {
02219       computeMgr->sendNonbondedMICSlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
02220     }
02221   }
02222 
02223   //#if MIC_TRACING != 0
02224   //  double finishWork_start = CmiWallTimer();
02225   //#endif
02226 
02227   GBISP("C.N.MIC[%d]::fnWork: workStarted %d, phase %d\n", \
02228   CkMyPe(), workStarted, gbisPhase)
02229 
02230   Molecule *mol = Node::Object()->molecule;
02231   SimParameters *simParams = Node::Object()->simParameters;
02232 
02233   ResizeArray<int> &patches( workStarted == 1 ? remoteHostedPatches : localHostedPatches );
02234   mic_kernel_data * host__local_kernel_data = host__kernel_data;
02235   mic_kernel_data * host__remote_kernel_data = host__kernel_data + 1;
02236   mic_kernel_data* &kernel_data = (workStarted == 1) ? (host__remote_kernel_data) : (host__local_kernel_data);
02237 
02238   // DMK - NOTE : Open the force boxes for the patches so forces can be contributed
02239   if ( !simParams->GBISOn || gbisPhase == 1 ) {
02240     for ( int i=0; i<patches.size(); ++i ) {
02241       patch_record &pr = master->patchRecords[patches[i]];
02242       GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
02243       pr.r = pr.forceBox->open();
02244     }
02245   } // !GBISOn || gbisPhase==1
02246 
02247   // DMK - NOTE : For each patch...
02248   for ( int i=0; i<patches.size(); ++i ) {
02249 
02250     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
02251     patch_record &pr = master->patchRecords[patches[i]];
02252     int start = pr.localStart;
02253     const CompAtomExt *aExt = pr.xExt;
02254 
02255     // DMK - NOTE : If this is the end of the timestep for this compute
02256     if ( !simParams->GBISOn || gbisPhase == 3 ) {
02257 
02258       // DMK - NOTE : Contribute the calculated forces and slow_forces from
02259       //   this compute to the patch.
02260       //int nfree = pr.numFreeAtoms;
02261       int nAtoms = pr.numAtoms;
02262       int nAtoms_16 = (nAtoms + 15) & (~15);
02263       pr.f = pr.r->f[Results::nbond];
02264       Force *f = pr.f;
02265       Force *f_slow = pr.r->f[Results::slow];
02266       const CompAtom *a = pr.x;
02267       const CompAtomExt *aExt = pr.xExt;
02268       // int *ao = atom_order + start;
02269       //float4 *af = master->forces + start;
02270       //float4 *af_slow = master->slow_forces + start;
02271 
02272       #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
02273 
02274         double4 *af = master->forces + start;
02275         for (int k = 0; k < nAtoms; k++) {
02276           int j = aExt[k].sortOrder;
02277           f[j].x += af[k].x;
02278           f[j].y += af[k].y;
02279           f[j].z += af[k].z;
02280         }
02281         if (doSlow) {
02282           double4 *af_slow = master->slow_forces + start;
02283           for (int k = 0; k < nAtoms; k++) {
02284             int j = aExt[k].sortOrder;
02285             f_slow[j].x += af_slow[k].x;
02286             f_slow[j].y += af_slow[k].y;
02287             f_slow[j].z += af_slow[k].z;
02288           }
02289         }
02290 
02291       #else
02292 
02293         double4 *af = master->forces + start;
02294         double4 *af_slow = master->slow_forces + start;
02295         double *af_x = ((double*)af) + (0 * nAtoms_16);
02296         double *af_y = ((double*)af) + (1 * nAtoms_16);
02297         double *af_z = ((double*)af) + (2 * nAtoms_16);
02298         double *af_w = ((double*)af) + (3 * nAtoms_16);
02299         double *af_slow_x = ((double*)af_slow) + (0 * nAtoms_16);
02300         double *af_slow_y = ((double*)af_slow) + (1 * nAtoms_16);
02301         double *af_slow_z = ((double*)af_slow) + (2 * nAtoms_16);
02302         double *af_slow_w = ((double*)af_slow) + (3 * nAtoms_16);
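
              // SoA force layout mirrors the atom_param layout: four double components
              //   (x, y, z, w), each padded to nAtoms_16 entries; aExt[k].sortOrder maps
              //   device order back to the patch's atom order.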
02303 
02304         for (int k = 0; k < nAtoms; k++) {
02305           int j = aExt[k].sortOrder;
02306           f[j].x += af_x[k];
02307           f[j].y += af_y[k];
02308           f[j].z += af_z[k];
02309           if (doSlow) {
02310             f_slow[j].x += af_slow_x[k];
02311             f_slow[j].y += af_slow_y[k];
02312             f_slow[j].z += af_slow_z[k];
02313           }
02314         }
02315 
02316       #endif
02317 
02318     } // !GBISOn || gbisPhase == 3
02319 
02320     #if 0
02321       if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
02322         CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
02323                  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
02324                  af[j].w);
02325       }
02326     #endif
02327 
02329     //if (simParams->GBISOn) {
02330     //
02331     //  if (gbisPhase == 1) {
02332     //
02333     //    //Copy dEdaSum from Host to Patch Box
02334     //    GBReal *psiSumMaster = master->psiSumH + start;
02335     //    for ( int k=0; k<pr.numAtoms; ++k ) {
02336     //      int j = aExt[k].sortOrder;
02337     //      pr.psiSum[j] += psiSumMaster[k];
02338     //    }
02339     //    GBISP("C.N.MIC[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
02340     //    pr.psiSumBox->close(&(pr.psiSum));
02341     //
02342     //  } else if (gbisPhase == 2) {
02343     //
02344     //    //Copy dEdaSum from Host to Patch Box
02345     //    GBReal *dEdaSumMaster = master->dEdaSumH + start;
02346     //    for ( int k=0; k<pr.numAtoms; ++k ) {
02347     //      int j = aExt[k].sortOrder;
02348     //      pr.dEdaSum[j] += dEdaSumMaster[k];
02349     //    }
02350     //    GBISP("C.N.MIC[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
02351     //    pr.dEdaSumBox->close(&(pr.dEdaSum));
02352     //
02353     //  } else if (gbisPhase == 3) {
02354     //
02355     //    GBISP("C.N.MIC[%d]::fnWork: P3 all.close()\n", CkMyPe());
02356     //    pr.intRadBox->close(&(pr.intRad)); //box 6
02357     //    pr.bornRadBox->close(&(pr.bornRad)); //box 7
02358     //    pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
02359     //    pr.positionBox->close(&(pr.x)); //box 0
02360     //    pr.forceBox->close(&(pr.r));
02361     //
02362     //  } //end phases
02363     //
02364     //} else { //not GBIS
02365     //
02366     //  GBISP("C.N.MIC[%d]::fnWork: pos/force.close()\n", CkMyPe());
02367 
02369     //double _start = CmiWallTimer();
02370 
02371       pr.positionBox->close(&(pr.x));
02372       pr.forceBox->close(&(pr.r));
02373 
02375     //traceUserBracketEvent(11050, _start, CmiWallTimer());
02376 
02377     //}
02378 
02379   }  // end for (i<patches.size())
02380 
02381   // DMK - NOTE : Contribute virial values
02382   if ( master == this && (!simParams->GBISOn || gbisPhase == 3) && workStarted == 2 ) {
02383 
02384     double virial_xx = host__local_kernel_data->virial_xx;
02385     double virial_xy = host__local_kernel_data->virial_xy;
02386     double virial_xz = host__local_kernel_data->virial_xz;
02387     double virial_yy = host__local_kernel_data->virial_yy;
02388     double virial_yz = host__local_kernel_data->virial_yz;
02389     double virial_zz = host__local_kernel_data->virial_zz;
02390     double fullElectVirial_xx = host__local_kernel_data->fullElectVirial_xx;
02391     double fullElectVirial_xy = host__local_kernel_data->fullElectVirial_xy;
02392     double fullElectVirial_xz = host__local_kernel_data->fullElectVirial_xz;
02393     double fullElectVirial_yy = host__local_kernel_data->fullElectVirial_yy;
02394     double fullElectVirial_yz = host__local_kernel_data->fullElectVirial_yz;
02395     double fullElectVirial_zz = host__local_kernel_data->fullElectVirial_zz;
02396     double vdwEnergy = host__local_kernel_data->vdwEnergy;
02397     double electEnergy = host__local_kernel_data->electEnergy;
02398     double fullElectEnergy = host__local_kernel_data->fullElectEnergy;
02399     if (singleKernelFlag == 0) {
02400       virial_xx += host__remote_kernel_data->virial_xx;
02401       virial_xy += host__remote_kernel_data->virial_xy;
02402       virial_xz += host__remote_kernel_data->virial_xz;
02403       virial_yy += host__remote_kernel_data->virial_yy;
02404       virial_yz += host__remote_kernel_data->virial_yz;
02405       virial_zz += host__remote_kernel_data->virial_zz;
02406       fullElectVirial_xx += host__remote_kernel_data->fullElectVirial_xx;
02407       fullElectVirial_xy += host__remote_kernel_data->fullElectVirial_xy;
02408       fullElectVirial_xz += host__remote_kernel_data->fullElectVirial_xz;
02409       fullElectVirial_yy += host__remote_kernel_data->fullElectVirial_yy;
02410       fullElectVirial_yz += host__remote_kernel_data->fullElectVirial_yz;
02411       fullElectVirial_zz += host__remote_kernel_data->fullElectVirial_zz;
02412       vdwEnergy += host__remote_kernel_data->vdwEnergy;
02413       electEnergy += host__remote_kernel_data->electEnergy;
02414       fullElectEnergy += host__remote_kernel_data->fullElectEnergy;
02415     }
02416 
02417     // DMK - NOTE : Contribute virial
02418     Tensor virial_tensor;
02419     virial_tensor.xx = virial_xx;
02420     virial_tensor.xy = virial_xy;
02421     virial_tensor.xz = virial_xz;
02422     virial_tensor.yx = virial_xy;
02423     virial_tensor.yy = virial_yy;
02424     virial_tensor.yz = virial_yz;
02425     virial_tensor.zx = virial_xz;
02426     virial_tensor.zy = virial_yz;
02427     virial_tensor.zz = virial_zz;
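          // (the device returns only the six independent components; the tensor is
          //   symmetric, so xy/xz/yz are mirrored into yx/zx/zy)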
02428     // DMK - TODO | FIXME : GBIS support needed eventually
02429     ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virial_tensor);
02430     if (doEnergy) {
02431       reduction->item(REDUCTION_LJ_ENERGY) += vdwEnergy;
02432       reduction->item(REDUCTION_ELECT_ENERGY) += electEnergy;
02433     }
02434     if (doSlow) {
02435       Tensor virial_slow_tensor;
02436       virial_slow_tensor.xx = fullElectVirial_xx;
02437       virial_slow_tensor.xy = fullElectVirial_xy;
02438       virial_slow_tensor.xz = fullElectVirial_xz;
02439       virial_slow_tensor.yx = fullElectVirial_xy;
02440       virial_slow_tensor.yy = fullElectVirial_yy;
02441       virial_slow_tensor.yz = fullElectVirial_yz;
02442       virial_slow_tensor.zx = fullElectVirial_xz;
02443       virial_slow_tensor.zy = fullElectVirial_yz;
02444       virial_slow_tensor.zz = fullElectVirial_zz;
02445       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virial_slow_tensor);
02446       if (doEnergy) { reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += fullElectEnergy; }
02447     }
02448 
02449     // Contribute to the exclusion checksum
02450     #if MIC_EXCL_CHECKSUM_FULL != 0
02451       int exclusionSum = host__local_kernel_data->exclusionSum;
02452       if (singleKernelFlag == 0) { exclusionSum += host__remote_kernel_data->exclusionSum; }
02453       reduction->item(REDUCTION_EXCLUSION_CHECKSUM) += exclusionSum;
02454     #endif
02455 
02456     // TRACING - Using the tracing output data generated by the device, submit user events to show
02457     //   performance data from the MIC device in Projections output
02458     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
02459     {
02460       double timeBase = host__device_times_start[((singleKernelFlag) ? (1) : (0))];
02461 
02462       #if MIC_DEVICE_TRACING_DETAILED != 0
02463 
02464         // Create compute user events
02465         ComputeMap *computeMap = ComputeMap::Object();
02466         PatchMap *patchMap = PatchMap::Object();
02467         int aSize = patchMap->gridsize_a();
02468         int bSize = patchMap->gridsize_b();
02469         int cSize = patchMap->gridsize_c();
02470         for (int i = 0; i < host__patch_pairs_size; i++) {
02471 
02472           // Determine the distance between the patches
02473           int pid0 = host__patch_pairs[i].p1;
02474           int pid1 = host__patch_pairs[i].p2;
02475           int dist = 0;
02476           if (pid0 != pid1) { // Is a pair, not a self
02477             int trans0 = computeMap->trans(host__patch_pairs[i].cid, 0);
02478             int trans1 = computeMap->trans(host__patch_pairs[i].cid, 1);
02479             int index_a0 = patchMap->index_a(pid0) + aSize * Lattice::offset_a(trans0);
02480             int index_b0 = patchMap->index_b(pid0) + bSize * Lattice::offset_b(trans0);
02481             int index_c0 = patchMap->index_c(pid0) + cSize * Lattice::offset_c(trans0);
02482             int index_a1 = patchMap->index_a(pid1) + aSize * Lattice::offset_a(trans1);
02483             int index_b1 = patchMap->index_b(pid1) + bSize * Lattice::offset_b(trans1);
02484             int index_c1 = patchMap->index_c(pid1) + cSize * Lattice::offset_c(trans1);
02485             int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
02486             int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
02487             int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
02488             dist = da + db + dc;
02489           }
02490 
02491           // Retrieve the start and end times for the "compute's task"
02492           double ueStart = host__device_times_computes[i * 2];
02493           double ueEnd = host__device_times_computes[i * 2 + 1];
02494           ueStart += mic_first_kernel_submit_time - timeBase;
02495           ueEnd += mic_first_kernel_submit_time - timeBase;
02496 
02497           if (dist > 7) { dist = 7; }  // NOTE: Make sure that the distance is put in the "7+" category if it is >7
02498           traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTE + dist, ueStart, ueEnd);
02499         }
02500 
02501         // Create patch (force reduction) user events
02502         for (int i = 0; i < host__force_lists_size; i++) {
02503           double ueStart = host__device_times_patches[i * 2];
02504           double ueEnd = host__device_times_patches[i * 2 + 1];
02505           ueStart += mic_first_kernel_submit_time - timeBase;
02506           ueEnd += mic_first_kernel_submit_time - timeBase;
02507           traceUserBracketEvent(MIC_EVENT_DEVICE_PATCH, ueStart, ueEnd);
02508         }
02509 
02510       #endif // MIC_DEVICE_TRACING_DETAILED
02511 
02512       // Create phases
02513       double lPTime0 = host__device_times_start[3];
02514       double lPTime1 = host__device_times_start[5];
02515       double lPTime2 = host__device_times_start[7];
02516       double lPTime3 = host__device_times_start[9];
02517       lPTime0 += mic_first_kernel_submit_time - timeBase;
02518       lPTime1 += mic_first_kernel_submit_time - timeBase;
02519       lPTime2 += mic_first_kernel_submit_time - timeBase;
02520       lPTime3 += mic_first_kernel_submit_time - timeBase;
02521       traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, lPTime0, lPTime1);
02522       //traceUserBracketEvent(MIC_EVENT_DEVICE_VIRIALS, lPTime1, lPTime2);
02523       traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, lPTime2, lPTime3);
02524       if (singleKernelFlag == 0) {
02525         double rPTime0 = host__device_times_start[2];
02526         double rPTime1 = host__device_times_start[4];
02527         double rPTime2 = host__device_times_start[6];
02528         double rPTime3 = host__device_times_start[8];
02529         rPTime0 += mic_first_kernel_submit_time - timeBase;
02530         rPTime1 += mic_first_kernel_submit_time - timeBase;
02531         rPTime2 += mic_first_kernel_submit_time - timeBase;
02532         rPTime3 += mic_first_kernel_submit_time - timeBase;
02533         traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, rPTime0, rPTime1);
02534         //traceUserBracketEvent(MIC_EVENT_DEVICE_VIRIALS, rPTime1, rPTime2);
02535         traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, rPTime2, rPTime3);
02536       }
02537     }
02538     #endif  // MIC_DEVICE_TRACING
02539   }
02540 
02541   if (workStarted == 1) { return 0; }
02542 
02543   if ( master != this ) {  // finished
02544     GBISP("finished\n");
02545     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02546     atomsChanged = 0;
02547 
02548     //#if MIC_TRACING != 0
02549     //  MIC_TRACING_RECORD(MIC_EVENT_FUNC_FINISHWORK, finishWork_start, CmiWallTimer());
02550     //#endif
02551 
02552     return 1;
02553   }
02554 
02555   mic_timer_total += kernel_time;
02556 
02557   // DMK - NOTE : If this is the end of the step for this compute
02558   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02559 
02560     // DMK - DEBUG
02561     MICP("submitting reduction...\n"); MICPF;
02562 
02563     atomsChanged = 0;
02564 
02566     //double __start = CmiWallTimer();
02567 
02568     reduction->submit();
02569 
02571     //traceUserBracketEvent(11051, __start, CmiWallTimer());
02572 
02573     #if 0
02574     mic_timer_count++;
02575     if ( simParams->outputCudaTiming &&
02576           step % simParams->outputCudaTiming == 0 ) {
02577 
02578       // int natoms = mol->numAtoms; 
02579       // double wpa = wcount;  wpa /= natoms;
02580 
02581       // CkPrintf("Pe %d MIC kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02582       //          kernel_time * 1.e3, time * 1.e3, wpa);
02583 
02584       #if 0
02585         float upload_ms, remote_calc_ms;
02586         float local_calc_ms, total_ms;
02587         mic_errcheck("before event timers");
02588         micEventElapsedTime(&upload_ms, start_upload, start_calc);
02589         mic_errcheck("in event timer 1");
02590         micEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02591         mic_errcheck("in event timer 2");
02592         micEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02593         mic_errcheck("in event timer 3");
02594         micEventElapsedTime(&total_ms, start_upload, end_local_download);
02595         mic_errcheck("in event timer 4");
02596         mic_errcheck("in event timers");
02597 
02598         CkPrintf("MIC EVENT TIMING: %d %f %f %f %f\n",
02599                  CkMyPe(), upload_ms, remote_calc_ms,
02600                  local_calc_ms, total_ms);
02601       #endif
02602 
02603       if ( mic_timer_count >= simParams->outputCudaTiming ) {
02604         mic_timer_total /= mic_timer_count;
02605         CkPrintf("MIC TIMING: %d  %f ms/step on node %d\n",
02606                  step, mic_timer_total * 1.e3, CkMyPe());
02607       }
02608       mic_timer_count = 0;
02609       mic_timer_total = 0;
02610     }
02611     #endif
02612 
02613   } // !GBISOn || gbisPhase==3  
02614 
02615   // Next GBIS Phase
02616   GBISP("C.N.MIC[%d]::fnWork: incrementing phase\n", CkMyPe())
02617   if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);  // 1->2->3->1...
02618 
02619   GBISP("C.N.MIC[%d] finished ready for next step\n",CkMyPe());
02620 
02621   //#if MIC_TRACING != 0
02622   //  MIC_TRACING_RECORD(MIC_EVENT_FUNC_FINISHWORK, finishWork_start, CmiWallTimer());
02623   //#endif
02624 
02625   return 1;  // finished and ready for next step
02626 }
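// NOTE (editor's summary, inferred from the code above): this function (finishWork,
//   per the MIC_EVENT_FUNC_FINISHWORK tracing tags) returns 0 while device work is
//   still outstanding for the step (workStarted == 1) and 1 once the compute is
//   finished; slave computes (master != this) just advance their per-step GBIS
//   phase and return 1 immediately.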
02627 
02628 
02629 __thread FILE* mic_output = NULL;
02630 __thread int mic_output_set = 0;
02631 
02632 
02633 void mic_initproc() {
02634   #if MIC_DEBUG > 0
02635     debugInit(NULL);
02636   #endif
02637 }
02638 
02639 
02640 void debugInit(FILE* fout) {
02641   if (mic_output != NULL) { return; }
02642   if (fout != NULL) {
02643     mic_output = fout;
02644     mic_output_set = 1;
02645   } else {
02646     char fname[256];
02647     sprintf(fname, "mic_debug.%d", CkMyPe());
02648     printf("[MIC-INFO] :: Creating MIC debug file \"%s\" for PE %d...\n", fname, CkMyPe());
02649     mic_output = fopen(fname, "w");
02650     mic_output_set = 0;
02651   }
02652 }
02653 
02654 void debugClose() {
02655   if (mic_output != NULL && mic_output_set == 0) { fclose(mic_output); mic_output = NULL; }
02656 }
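// NOTE (editor's sketch of the intended usage, assumed from the logic above):
//
//   debugInit(NULL);             // opens a per-PE file "mic_debug.<pe>"; mic_output_set = 0
//   fprintf(mic_output, "...");  // debug output lands in the per-PE file
//   debugClose();                // closes the stream only because debugInit() created it
//
// If a caller passes its own FILE* to debugInit() instead, mic_output_set is 1 and
// debugClose() leaves that stream open for the caller to manage.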
02657 
02658 
02659 void mic_initHostDeviceLDB() {
02660   assert(!CkMyPe());  // NOTE: Only PE 0 should call this function
02661   WorkDistrib::send_initHostDeviceLDB();  // This results in a broadcast causing all
02662 }                                         //   PEs to call mic_hostDeviceLDB() below
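// NOTE (editor's summary of the handshake implemented below): PE 0 calls
//   mic_initHostDeviceLDB(), which broadcasts through WorkDistrib so that every PE
//   runs mic_hostDeviceLDB(); each PE replies with the set of PEs sharing its MIC
//   (an empty set if it drives no device).  PE 0 collects the replies in
//   mic_contributeHostDeviceLDB(), decides which computes to offload, and finally
//   broadcasts the tuned parameters back out via WorkDistrib::send_setDeviceLDBParams(),
//   which arrives at mic_setDeviceLDBParams() on every PE.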
02663 
02664 
02665 #define COMPUTE_DISTANCE(cid) \
02666 int manDist = -1; { \
02667   if (computeMap->type(cid) == computeNonbondedSelfType) { \
02668     manDist = 0; \
02669   } else if (computeMap->type(cid) == computeNonbondedPairType) { \
02670     int aSize = patchMap->gridsize_a(); \
02671     int bSize = patchMap->gridsize_b(); \
02672     int cSize = patchMap->gridsize_c(); \
02673     int pid0 = computeMap->pid(cid, 0); \
02674     int pid1 = computeMap->pid(cid, 1); \
02675     int trans0 = computeMap->trans(cid, 0); \
02676     int trans1 = computeMap->trans(cid, 1); \
02677     int index_a0 = patchMap->index_a(pid0) + aSize * Lattice::offset_a(trans0); \
02678     int index_b0 = patchMap->index_b(pid0) + bSize * Lattice::offset_b(trans0); \
02679     int index_c0 = patchMap->index_c(pid0) + cSize * Lattice::offset_c(trans0); \
02680     int index_a1 = patchMap->index_a(pid1) + aSize * Lattice::offset_a(trans1); \
02681     int index_b1 = patchMap->index_b(pid1) + bSize * Lattice::offset_b(trans1); \
02682     int index_c1 = patchMap->index_c(pid1) + cSize * Lattice::offset_c(trans1); \
02683     int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1)); \
02684     int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1)); \
02685     int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1)); \
02686     manDist = da + db + dc; \
02687   } \
02688 }
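// NOTE (editor's sketch, not in the original source): COMPUTE_DISTANCE yields the
//   Manhattan distance, in patch-grid units, between the two patches of a pair
//   compute (0 for a self compute, -1 for any other compute type).  An equivalent
//   inline-function form, assuming the same ComputeMap/PatchMap/Lattice APIs,
//   could look like:
//
//   static inline int micManhattanDistance(ComputeMap *computeMap, PatchMap *patchMap, int cid) {
//     if (computeMap->type(cid) == computeNonbondedSelfType) { return 0; }
//     if (computeMap->type(cid) != computeNonbondedPairType) { return -1; }
//     int idx[2][3];
//     for (int p = 0; p < 2; p++) {  // resolve each patch to absolute grid coordinates
//       int pid = computeMap->pid(cid, p);
//       int trans = computeMap->trans(cid, p);
//       idx[p][0] = patchMap->index_a(pid) + patchMap->gridsize_a() * Lattice::offset_a(trans);
//       idx[p][1] = patchMap->index_b(pid) + patchMap->gridsize_b() * Lattice::offset_b(trans);
//       idx[p][2] = patchMap->index_c(pid) + patchMap->gridsize_c() * Lattice::offset_c(trans);
//     }
//     return abs(idx[0][0] - idx[1][0])   // Manhattan distance: |da| + |db| + |dc|
//          + abs(idx[0][1] - idx[1][1])
//          + abs(idx[0][2] - idx[1][2]);
//   }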
02689 
02690 
02691 void mic_hostDeviceLDB() {
02692   // NOTE: Called by all PEs, whether they drive a MIC or not
02693 
02694   const SimParameters * simParams = Node::Object()->simParameters;
02695 
02696   // If this PE is not driving a MIC, then just return an empty contribution
02697   if (devicePe != CkMyPe()) {
02698     WorkDistrib::send_contributeHostDeviceLDB(0, NULL);
02699     return;
02700   }
02701 
02702   // Otherwise send the set of PEs contributing to this PE's MIC
02703   WorkDistrib::send_contributeHostDeviceLDB(numPesSharingDevice, pesSharingDevice);
02704 }
02705 
02706 
02707 void mic_setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2) {
02708   if (CkMyPe() != 0) {
02709     if (dt >= 0) { mic_deviceThreshold = dt; }
02710     if (hs >= 0) { mic_hostSplit = hs; }
02711     if (sp1 > 0) { mic_numParts_self_p1 = sp1; }
02712     if (pp1 > 0) { mic_numParts_pair_p1 = pp1; }
02713     if (pp2 > 0) { mic_numParts_pair_p2 = pp2; }
02714   }
02715 }
02716 
02717 
02718 void mic_contributeHostDeviceLDB(int peSetLen, int * peSet) {
02719   assert(!CkMyPe()); // NOTE: This function should only be called on PE 0
02720 
02721   static int numContribs = 0;
02722   __ASSERT(numContribs >= 0 && numContribs < CkNumPes());  // Make sure we don't get more messages than expected
02723 
02724   static double hdLDBTime = 0.0;
02725   static double hdLDBTime_phase0 = 0.0;
02726   static double hdLDBTime_phase1 = 0.0;
02727   static double hdLDBTime_phase2a = 0.0;
02728   static double hdLDBTime_phase2b = 0.0;
02729   double startTime = CmiWallTimer();
02730 
02731   const SimParameters * simParams = Node::Object()->simParameters;
02732   PatchMap *patchMap = PatchMap::Object();
02733   ComputeMap *computeMap = ComputeMap::Object();
02734   Molecule *molecule = Node::Object()->molecule;
02735   int nComputes = computeMap->numComputes();
02736 
02737   if (numContribs == 0) { // First call
02738     // Setup load balancing parameters (pulling from SimParameters if they haven't already been
02739     //   set using the command line or PE 0)
02740     if (mic_deviceThreshold < 0) { mic_deviceThreshold = simParams->mic_deviceThreshold; }
02741     if (mic_hostSplit < 0) { mic_hostSplit = simParams->mic_hostSplit; }
02742     if (mic_numParts_self_p1 < 0) { mic_numParts_self_p1 = simParams->mic_numParts_self_p1; }
02743     if (mic_numParts_pair_p1 < 0) { mic_numParts_pair_p1 = simParams->mic_numParts_pair_p1; }
02744     if (mic_numParts_pair_p2 < 0) { mic_numParts_pair_p2 = simParams->mic_numParts_pair_p2; }
02745   }
02746 
02747   // If a PE has checked in with PE 0 with a non-empty peSet, perform the work
02748   //   of load balancing the computes within that peSet.
02749   if (peSetLen > 0) {
02750 
02751     // DMK - DEBUG
02752     #if MIC_VERBOSE_HVD_LDB != 0
02753       CkPrintf("[MIC-HvD-LDB] :: PE %d :: offload pe list = {", CkMyPe());
02754       for (int i = 0; i < peSetLen; i++) { CkPrintf(" %d", peSet[i]); }
02755       CkPrintf(" }\n");
02756     #endif
02757 
02758     // If the deviceThreshold and/or hostSplit values are unset, default them now
02759     // NOTE: By this time, both the command line and config file parameters will have been applied if present
02760     if (mic_deviceThreshold < 0 || mic_hostSplit <= 0) {
02761 
02762       // Collect some parameters for the given problem/input
02763       const int ppn = CkNodeSize(0);  // NOTE: For now, assuming all nodes have the same number of host threads
02764       const int ppm = ppn / mic_get_device_count();
02765       const float atomsPerPE = ((float)(molecule->numAtoms)) / ((float)(CkNumPes()));
02766       const int numAway = ((simParams->twoAwayX > 0) ? (1) : (0))
02767                         + ((simParams->twoAwayY > 0) ? (1) : (0))
02768                         + ((simParams->twoAwayZ > 0) ? (1) : (0));
02769       // If the "twoAway" options are being used, back off on the hostSplit adjustment by
02770       //   pretending there are more atoms per PE (i.e. reduce the scaling adjustment below)
02771       const float atomsPerPE_adj = atomsPerPE + (atomsPerPE * 0.85f * numAway);
02772 
02773       // DMK - DEBUG
02774       #if MIC_VERBOSE_HVD_LDB != 0
02775         printf("[MIC-HvD-LDB] :: PE %d :: ppn: %d, ppm: %d, atomsPerPe: %f (%f)\n", CkMyPe(), ppn, ppm, atomsPerPE, atomsPerPE_adj);
02776       #endif
02777 
02778       // Set hostSplit
02779       // NOTE: This calculation is based on measurements performed on the Stampede cluster at
02780       //   TACC (update as necessary: code changes, performance improvements on the MIC, etc.)
02781       // NOTE: The calculation has only been tested on Stampede, and breaks down at 2AwayXY because
02782       //   of other dynamic load balancing effects that occur at those scales.  TODO: These effects
02783       //   need to be corrected/accounted for and then this balancing scheme needs to be revisited.
02784       //if (numAway < 1) {
02785 
02786         // Set deviceThreshold
02787         mic_deviceThreshold = 2;
02788 
02789         // DMK - NOTE : The constants used here are based on measurements and may need to
02790         //   change as the code is modified (change as necessary)
02791         float hs_base = 0.714f * 79.0f;
02792         float hostVsDevice_adj = 0.714f * ppm / 2.0f;
02793         float scaling_adj = 0.714f * 26000.0f / atomsPerPE_adj;  // NOTE: scaling as problem thins out, not hard processor/core count
02794 
02795         mic_hostSplit = (int)(hs_base - hostVsDevice_adj - scaling_adj);
02796         if (mic_hostSplit < 5) { mic_hostSplit = 5; }  // Apply upper and lower bounds (5% - 95%)
02797         if (mic_hostSplit > 95) { mic_hostSplit = 95; }
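        // NOTE (editor's worked example, values assumed): with ppm = 8 host PEs per
        //   MIC and atomsPerPE_adj = 1000, this reduces to
        //   mic_hostSplit = 0.714f * (79 - 8/2 - 26000/1000) = 0.714f * 49 ~ 35,
        //   i.e. roughly 35% of the candidate computes would be offloaded (see the
        //   'target' calculation below), subject to the [5, 95] clamp above.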
02798 
02799       //} else {
02800       //  mic_deviceThreshold = 2; // + numAway;
02801       //  mic_hostSplit = 30;
02802       //}
02803 
02804       //printf("[MIC-HvD-LDB] :: PE %d :: Automated HvD LDB - Setting DT:%d and HS:%d...\n", CkMyPe(), mic_deviceThreshold, mic_hostSplit);
02805       iout << iINFO << "MIC-HvD-LDB - Automated HvD LDB Setting DT:" << mic_deviceThreshold << " and HS:" << mic_hostSplit << "\n" << endi;
02806 
02807     } else {
02808       if (mic_hostSplit > 100) { mic_hostSplit = 100; }
02809     }
02810     __ASSERT(mic_deviceThreshold >= 0 && mic_hostSplit >= 0);
02811 
02812     // Create "grab bags" of self and pair computes associated with the peSet that
02813     //   can be potentially offloaded to the MIC associated with that peSet
02814     std::queue<int> * selfs = new std::queue<int>();
02815     std::queue<int> * pairs = new std::queue<int>();
02816     std::queue<int> * selfsTmp = new std::queue<int>();
02817     std::queue<int> * pairsTmp = new std::queue<int>();
02818     __ASSERT(selfs != NULL && pairs != NULL && selfsTmp != NULL && pairsTmp != NULL);
02819 
02820     int minPID = -1;
02821     int maxPID = -1;
02822     #define WIDEN_PID_RANGE(pid) \
02823       if (minPID < 0) { minPID = pid; maxPID = pid; } \
02824       if (pid < minPID) { minPID = pid; } \
02825       if (pid > maxPID) { maxPID = pid; }
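    // NOTE (editor's equivalent form): WIDEN_PID_RANGE matches the following
    //   std::min/std::max formulation (<algorithm> is already included):
    //     if (minPID < 0) { minPID = maxPID = pid; }
    //     minPID = std::min(minPID, pid);
    //     maxPID = std::max(maxPID, pid);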
02826 
02827     double startTime_phase0 = CmiWallTimer();
02828 
02829     // NOTE: If a self or pair compute does not meet the requirements set forth by the load
02830     //   balancing parameters, place it in pairsTmp (the "other" bag).  It may still be
02831     //   required in phase 1 below to satisfy the one-patch-per-PE requirement.
02832     int totalNumSelfs = 0;
02833     int totalNumPairs = 0;
02834     for (int i = 0; i < nComputes; i++) {
02835 
02836       // Skip this compute if it is not on a PE in the given set of PEs
02837       int pe = computeMap->node(i);
02838       bool peInSet = false;
02839       for (int j = 0; !peInSet && j < peSetLen; j++) {
02840         if (peSet[j] == pe) { peInSet = true; }
02841       }
02842       if (!peInSet) { continue; }
02843 
02844       // Only consider self and pair computes
02845       if (computeMap->type(i) == computeNonbondedSelfType) {
02846         totalNumSelfs++;
02847       } else if (computeMap->type(i) == computeNonbondedPairType) {
02848         totalNumPairs++;
02849       } else {
02850         continue;
02851       }
02852 
02853       // Check: distance less than device threshold
02854       COMPUTE_DISTANCE(i);
02855       if (manDist < 0 || manDist > mic_deviceThreshold) {
02856         pairsTmp->push(i);  // NOTE: because mic_deviceThreshold >= 0 and the manDist
02857         continue;           //   of a self is 0, only pairs can be placed in pairsTmp
02858       }
02859 
02860       // Check: self or pair, place in associated "grab bag" while updating PID range
02861       if (computeMap->type(i) == computeNonbondedSelfType) {
02862         selfs->push(i);
02863         int pid0 = computeMap->pid(i, 0); WIDEN_PID_RANGE(pid0);
02864       }
02865       if (computeMap->type(i) == computeNonbondedPairType) {
02866         pairs->push(i);
02867         int pid0 = computeMap->pid(i, 0); WIDEN_PID_RANGE(pid0);
02868         int pid1 = computeMap->pid(i, 1); WIDEN_PID_RANGE(pid1);
02869       }
02870     }
02871     __ASSERT(minPID <= maxPID);
02872 
02873     hdLDBTime_phase0 += CmiWallTimer() - startTime_phase0;
02874 
02875     // Calculate the target percentage of compute objects to offload
02876     int total = totalNumSelfs + totalNumPairs;
02877     int target = (int)((total * mic_hostSplit) / 100.0f);
02878     if (target > total) { target = total; } // Do this check just in case a floating-point rounding issue occurs
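    // NOTE (editor's worked example, values assumed): with total = 1200 candidate
    //   computes and mic_hostSplit = 35, target = (int)(1200 * 35 / 100.0f) = 420
    //   computes to offload.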
02879  
02880     // DMK - DEBUG
02881     #if MIC_VERBOSE_HVD_LDB != 0
02882       printf("[MIC-HvD-LDB] :: PE %d :: Phase 0 Result - selfs:%d, pairs:%d, other:%d, self+pair:%d, target:%d\n",
02883              CkMyPe(), (int)(selfs->size()), (int)(pairs->size()), (int)(pairsTmp->size()), total, target
02884             ); fflush(NULL);
02885     #endif
02886 
02887     // Set up an array of flags (one entry per PID) that indicates whether or not a given
02888     //   PID is already being referenced by a MIC compute object (i.e. required by at
02889     //   least one compute already mapped to the device).  Also create a list of CIDs that
02890     //   are to be mapped to the device.
02891     bool * pidFlag = new bool[maxPID - minPID + 1];
02892     std::queue<int> * offloads = new std::queue<int>();
02893     __ASSERT(offloads != NULL && pidFlag != NULL);
02894     int offloads_selfs = 0;
02895     int offloads_pairs = 0;
02896     for (int i = minPID; i <= maxPID; i++) { pidFlag[i - minPID] = false; }
02897 
02898     // Create an array of flags that indicate if a given PE contributing to this PE's MIC
02899     //   has had at least one patch offloaded (required by ::doWork and ::noWork)
02900     int peLo = peSet[0];  // NOTE: Have already tested that peSetLen > 0
02901     int peHi = peSet[0];
02902     for (int i = 0; i < peSetLen; i++) {
02903       if (peSet[i] < peLo) { peLo = peSet[i]; }
02904       if (peSet[i] > peHi) { peHi = peSet[i]; }
02905     }
02906     __ASSERT(peLo <= peHi);
02907     bool * peSetFlag = new bool[peHi - peLo + 1];  // Inclusive of peLo and peHi
02908     __ASSERT(peSetFlag != NULL);
02909     for (int i = peLo; i <= peHi; i++) { peSetFlag[i - peLo] = false; }
02910 
02912 
02913     double startTime_phase1 = CmiWallTimer();
02914 
02915     // Sweep through the computes, adding at least one self compute per contributing PE to
02916     //   satisfy assumptions in ::doWork() and ::noWork()... for now, at most one per PE
02917     // Start with self computes (favor them)
02918     int numSelfs = selfs->size();
02919     for (int i = 0; i < numSelfs; i++) {
02920       int cid = selfs->front(); selfs->pop();
02921       int pid = computeMap->pid(cid, 0);
02922       int pe0 = patchMap->node(pid);
02923       if (pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) {
02924         offloads->push(cid);
02925         offloads_selfs++;
02926         peSetFlag[pe0 - peLo] = true;  // Mark the PE as having at least one compute offloaded
02927         pidFlag[pid - minPID] = true;  // Mark the associated PID as already being required by the MIC card
02928       } else {
02929         selfs->push(cid);
02930       }
02931     }
02932     // Move on to pair computes
02933     int numPairs = pairs->size();
02934     for (int i = 0; i < numPairs; i++) {
02935       int cid = pairs->front(); pairs->pop();
02936       int pid0 = computeMap->pid(cid, 0);
02937       int pid1 = computeMap->pid(cid, 1);
02938       int pe0 = patchMap->node(pid0);
02939       int pe1 = patchMap->node(pid1);
02940       if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
02941           (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
02942          ) {
02943         offloads->push(cid);
02944         offloads_pairs++;
02945         if (pe0 >= peLo && pe0 <= peHi) {
02946           peSetFlag[pe0 - peLo] = true;
02947           pidFlag[pid0 - minPID] = true;
02948         }
02949         if (pe1 >= peLo && pe1 <= peHi) {
02950           peSetFlag[pe1 - peLo] = true;
02951           pidFlag[pid1 - minPID] = true;
02952         }
02953       } else {
02954         pairs->push(cid);
02955       }
02956     }
02957     // If need be, "other" computes (pairs that didn't meet LDB requirements, but might be required)
02958     // NOTE: By this point, it should be the case that all the peSetFlag values are set.  Just in
02959     //   case that is not true, look through the "other" pairs (that didn't fit LDB criteria).
02960     //   However, in this case, also empty the queue since these cannot be used in phase 2 below.
02962     bool usedOther = false;
02963     while (!(pairsTmp->empty())) {
02964       int cid = pairsTmp->front(); pairsTmp->pop();
02965       int pid0 = computeMap->pid(cid, 0);
02966       int pid1 = computeMap->pid(cid, 1);
02967       int pe0 = patchMap->node(pid0);
02968       int pe1 = patchMap->node(pid1);
02969       if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
02970           (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
02971          ) {
02972         offloads->push(cid);
02973         offloads_pairs++; usedOther = true;  // record that an "other" pair was offloaded (reported below)
02974         if (pe0 >= peLo && pe0 <= peHi) {
02975           peSetFlag[pe0 - peLo] = true;
02976           pidFlag[pid0 - minPID] = true;
02977         }
02978         if (pe1 >= peLo && pe1 <= peHi) {
02979           peSetFlag[pe1 - peLo] = true;
02980           pidFlag[pid1 - minPID] = true;
02981         }
02982       }
02983     }
02984     if (usedOther) { CkPrintf("[MIC-WARNING] :: Offloaded non-bonded compute that didn't meet LDB criteria to satisfy phase 1 requirement (just FYI, not a problem)\n"); }
02985 
02986     hdLDBTime_phase1 += CmiWallTimer() - startTime_phase1;
02987 
02988     // Verify at least one self per PE was selected (assumption is one patch per
02989     //   PE and self computes are mapped to same PE as their associated patch)
02990     int numPEsSet = 0;
02991     for (int i = 0; i < peSetLen; i++) {
02992       if (peSetFlag[peSet[i] - peLo]) { numPEsSet++; }
02993     }
02994     __ASSERT(numPEsSet > 0);  // NOTE: Need at least one per set of PEs (one per master)
02995 
02996     // DMK - DEBUG
02997     #if MIC_VERBOSE_HVD_LDB != 0
02998       printf("[MIC-HvD-LDB] :: PE %d :: Phase 1 Result - selfs:%d, pairs:%d, offloads:%d, target:%d\n",
02999              CkMyPe(), (int)(selfs->size()), (int)(pairs->size()), (int)(offloads->size()), target
03000             ); fflush(NULL);
03001     #endif
03002 
03004 
03005     // Push compute objects to the device until the target number of computes has been reached
03006     while (offloads->size() < target) {
03007 
03009 
03010       double time_0 = CmiWallTimer();
03011 
03012       // If we haven't reached the target yet, grab a compute to add to the mix (along with its
03013       //   referenced PIDs)
03014       // NOTE : As a heuristic, if the number of offloaded computes is still far away from the
03015       //   overall target number, try to add many at a time (miniTarget).  However, with twoAway
03016       //   options enabled, many are likely to be added by part 2 below, so don't add too many
03017       //   in part 1, so that part 2 below can do its job of minimizing data movement across
03018       //   the PCIe bus.  This is where the miniTarget divisor comes from (about half of 5*5*5
03019       //   from the twoAways all being enabled).
03020       int miniTarget = (target - offloads->size()) / 50;
03021       int cid = -1;
03022       if (miniTarget < 1) { miniTarget = 1; }
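      // NOTE (editor's worked example, values assumed): with target = 420 and 20
      //   computes offloaded so far, miniTarget = (420 - 20) / 50 = 8 seed computes
      //   are pulled before the PID-sharing sweep below; within 50 of the target,
      //   miniTarget clamps to 1 so the sweep can fill in neighbors one seed at a time.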
03023       for (int k = 0; k < miniTarget; k++) {
03024 
03025         // Select the compute (prefer selfs)
03026         if (selfs->size() > 0) { cid = selfs->front(); selfs->pop(); offloads_selfs++; }
03027         else if (pairs->size() > 0) { cid = pairs->front(); pairs->pop(); offloads_pairs++; }
03028         else { break; }  // Nothing to grab, so exit
03029         __ASSERT(cid >= 0);
03030 
03031         // Add the compute to the list of offloaded computes
03032         offloads->push(cid);  // Select this compute and mark its PIDs as used
03033 
03034         // Mark the PIDs associated with the compute as being referenced by the device
03035         //   so computes that use the same PIDs can be selected in part 2 below
03036         int pid0 = computeMap->pid(cid, 0);
03037         pidFlag[pid0 - minPID] = true;
03038         if (computeMap->type(cid) == computeNonbondedPairType) {
03039           int pid1 = computeMap->pid(cid, 1);
03040           pidFlag[pid1 - minPID] = true;
03041         }
03042       }
03043       if (cid < 0) { break; } // No longer any computes in either list (selfs or pairs)
03044 
03046 
03047       double time_1 = CmiWallTimer();
03048 
03049       // While we haven't reached the target, add computes that reference PIDs that are
03050       //   already referenced by computes already mapped to the device
03051       while (offloads->size() < target && selfs->size() > 0) {  // Start with selfs
03052         int cid = selfs->front(); selfs->pop();
03053         int pid0 = computeMap->pid(cid, 0);
03054         if (pidFlag[pid0 - minPID]) {
03055           offloads->push(cid);
03056           offloads_selfs++;
03057         } else {
03058           selfsTmp->push(cid);
03059         }
03060       }
03061       { std::queue<int> * t = selfs; selfs = selfsTmp; selfsTmp = t; }
03062 
03063       while (offloads->size() < target && pairs->size() > 0) {
03064         int cid = pairs->front(); pairs->pop();
03065         int pid0 = computeMap->pid(cid, 0);
03066         int pid1 = computeMap->pid(cid, 1);
03067         if (pidFlag[pid0 - minPID] && pidFlag[pid1 - minPID]) {
03068           offloads->push(cid);
03069           offloads_pairs++;
03070         } else {
03071           pairsTmp->push(cid);
03072         }
03073       }
03074       { std::queue<int> * t = pairs; pairs = pairsTmp; pairsTmp = t; }
03075 
03076       double time_2 = CmiWallTimer();
03077       hdLDBTime_phase2a += time_1 - time_0;
03078       hdLDBTime_phase2b += time_2 - time_1;
03079     }
03080 
03081     // DMK - DEBUG
03082     #if MIC_VERBOSE_HVD_LDB != 0
03083       printf("[MIC-HvD-LDB] :: PE %d :: Phase 2 Result [%d->%d] -- selfs:%d, pairs:%d, offloads:%d (s:%d, p:%d), target:%d\n",
03084              CkMyPe(), peLo, peHi, (int)(selfs->size() + selfsTmp->size()), (int)(pairs->size() + pairsTmp->size()),
03085              (int)(offloads->size()), offloads_selfs, offloads_pairs, target
03086             ); fflush(NULL);
03087     #endif
03088 
03089     // If the numParts parameters have not been set manually, then split up the computes headed
03090     //   to the device to (1) split up larger tasks and (2) create more tasks
03091     if (mic_numParts_self_p1 <= 0 || mic_numParts_pair_p1 <= 0 || mic_numParts_pair_p2 <= 0) {
03092 
03093       int numDeviceComputes = offloads->size();
03094       if (numDeviceComputes > 1000) {  // Enough tasks, no need to break up anything
03095         mic_numParts_self_p1 = 1;
03096         mic_numParts_pair_p1 = 1;
03097         mic_numParts_pair_p2 = 1;
03098       } else if (numDeviceComputes <= 1000 && numDeviceComputes > 350) {  // Enough tasks, but some might be too large (granularity)
03099         mic_numParts_self_p1 = 3;
03100         mic_numParts_pair_p1 = 3;
03101         mic_numParts_pair_p2 = 1;
03102       } else {  // Starting to be too few tasks, so start splitting them up more and more
03103         // Try to get around 250 - 300 self parts and 400 - 500 pair parts (650 - 800 total parts)
03104         if (offloads_selfs <= 0) { offloads_selfs = 1; }
03105         if (offloads_pairs <= 0) { offloads_pairs = 1; }
03106         #define CONFINE_TO_RANGE(v, mi, ma)  (v = (v > ma) ? (ma) : ((v < mi) ? (mi) : (v)))
03107         mic_numParts_self_p1 = 300 / offloads_selfs; CONFINE_TO_RANGE(mic_numParts_self_p1, 8, 16);
03108         mic_numParts_pair_p1 = 500 / offloads_pairs; CONFINE_TO_RANGE(mic_numParts_pair_p1, 4, 16);
03109         #undef CONFINE_TO_RANGE
03110         mic_numParts_pair_p2 = mic_numParts_pair_p1 / 2;
03111         mic_numParts_pair_p1 += mic_numParts_pair_p2;  // NOTE: 'numParts = p1 - dist * p2' and 'dist > 0' for pairs
03112       }
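      // NOTE (editor's worked example, values assumed): with offloads_selfs = 25 and
      //   offloads_pairs = 40 in the final branch above, mic_numParts_self_p1 = 300/25
      //   = 12 (within [8,16]) and mic_numParts_pair_p1 = 500/40 = 12 (within [4,16]);
      //   then mic_numParts_pair_p2 = 12/2 = 6 and mic_numParts_pair_p1 becomes 18, so
      //   a pair compute at Manhattan distance d is split into 18 - 6*d parts.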
03113 
03114       //printf("[MIC-HvD-LDB] :: PE %d :: Automated HvD LDB - Setting NumParts { %d %d %d }\n", CkMyPe(), mic_numParts_self_p1, mic_numParts_pair_p1, mic_numParts_pair_p2);
03115       iout << iINFO << "MIC-HvD-LDB - Automated HvD LDB Setting NumParts { " << mic_numParts_self_p1 << ", " << mic_numParts_pair_p1 << ", " << mic_numParts_pair_p2   << " }\n" << endi;
03116     }
03117 
03118     // Update the compute map to reflect the load balancing decision
03119     while (offloads->size() > 0) {
03120       int cid = offloads->front(); offloads->pop();
03121       computeMap->setDirectToDevice(cid, 1);
03122     }
03123 
03124     // Cleanup temp data
03125     delete selfs;
03126     delete pairs;
03127     delete selfsTmp;
03128     delete pairsTmp;
03129     delete offloads;
03130     delete [] peSetFlag;
03131     delete [] pidFlag;
03132   }
03133 
03134   // Increment the count for the number of contributors (PEs).  If this is the last
03135   //   one, attempt to dump the host-device compute map.
03136   numContribs++;
03137   hdLDBTime += CmiWallTimer() - startTime;
03138   if (numContribs >= CkNumPes()) {
03139     mic_dumpHostDeviceComputeMap();
03140     WorkDistrib::send_setDeviceLDBParams(mic_deviceThreshold, mic_hostSplit,
03141                                          mic_numParts_self_p1, mic_numParts_pair_p1, mic_numParts_pair_p2);
03142     #if MIC_VERBOSE_HVD_LDB != 0
03143       CkPrintf("[MIC-HvD-LDB] :: PE %d :: Total HvD LDB Time: %lf sec = { %lf, %lf, (%lf, %lf) }\n",
03144                CkMyPe(), hdLDBTime, hdLDBTime_phase0, hdLDBTime_phase1, hdLDBTime_phase2a, hdLDBTime_phase2b
03145               );
03146     #endif
03147   }
03148 }
03149 
03150 
03151 void mic_dumpHostDeviceComputeMap() {
03152   #if MIC_DUMP_COMPUTE_MAPPING != 0
03153     ComputeMap *computeMap = ComputeMap::Object();
03154     int nComputes = computeMap->numComputes();
03155     char filename[256] = { 0 };
03156     sprintf(filename, "deviceComputeMap.%d", CkMyPe());
03157     FILE * fmap = fopen(filename, "w");
03158     if (fmap != NULL) {
03159       printf("[MIC-INFO] :: PE %d :: Writing device compute map to \"%s\".\n", CkMyPe(), filename);
03160       fprintf(fmap, "Device compute map\n");
03161       for (int i = 0; i < nComputes; i++) {
03162         if (i % 50 == 0) { fprintf(fmap, "\n%d:", i); }
03163         fprintf(fmap, "%s%d", ((i % 10 == 0) ? (" ") : ("")), computeMap->directToDevice(i));
03164       }
03165       fclose(fmap);
03166     } else {
03167       printf("[MIC-WARNING] :: Unable to dump device compute map to file \"%s\"... skipping.\n", filename);
03168     }
03169   #endif
03170 }
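// NOTE (editor's illustration, assumed output): with MIC_DUMP_COMPUTE_MAPPING
//   enabled, "deviceComputeMap.<pe>" holds one directToDevice flag per compute,
//   50 per row and grouped in tens, each row prefixed by its starting compute
//   index, e.g.:
//
//     0: 1111111111 0000011111 1111100000 0000000000 1111111111
//     50: ...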
03171 
03172 
03173 #endif  // NAMD_MIC

Generated on Sat Sep 23 01:17:12 2017 for NAMD by  doxygen 1.4.7