ComputeNonbondedMICKernel.h

Go to the documentation of this file.
00001 #ifndef COMPUTE_NONBONDED_MIC_KERNEL_H
00002 #define COMPUTE_NONBONDED_MIC_KERNEL_H
00003 
00004 
00005 // If this is a MIC build for NAMD, then include the contents of this file
00006 #ifdef NAMD_MIC
00007 
00008 
00010 // General Code Description
00011 //
00012 //    The files beginning with "ComputeNonbondedMIC" are MIC-specific files that
00013 // are involved in supporting MIC within the NAMD application.  Below is a
00014 // general description of that support.
00015 //
00016 // Startup:
00017 //
00018 //    During the startup phase of the application, there are several functions
00019 // that are called, once per device.  First, the function mic_check is used to
00020 // detect if the MIC devices that are to be used available to the application
00021 // (by default, all devices are used, but the "+devices" command line parameter
00022 // can be used to select specific devices).  Then, the mic_init_device function
00023 // is then used to initialize each of the MIC devices.
00024 //
00025 //    Also during startup, some constant tables/variables that will be used
00026 // during the steady state of the application are transfered to the device.
00027 // These functions, again called once per device, include...
00028 //  - mic_bind_table_four : Nonbonded table (includes several sub-tables)
00029 //  - mic_bind_lj_table   : Lennard-Jones (LJ) table
00030 //  - mic_bind_exclusions : Table of exclusion flags
00031 //  - mic_bind_constants  : Several required constant scalar values
00032 //
00033 // Steady State (timesteps executing):
00034 //
00035 //    During the steady state of the application, when timesteps are executing
00036 // one after the next, several functions are used to transfer data to the MIC
00037 // device, including...
00038 //  - mic_bind_patch_pairs : Updates buffers that only need to be updated on
00039 //      the device(s) periodically, usually because of some major event such
00040 //      as atom migration or load balancing (computes shifting between PEs).
00041 //  - mic_nonbonded_forces : Used to initiate computation on the MIC device.
00042 //      Various buffers that are required to be transfered each timestep,
00043 //      such as atoms (input: atom position info) or forces (output: atom
00044 //      force info), along with buffers that are only periodically transfered,
00045 //      such as atom_params (input: misc. atom info).  The body of this function
00046 //      includes an offload pragma that is broken up into two stages.  The first
00047 //      is force computation where the computes (patch_pairs) are executed in
00048 //      parallel with one another.  The second is force reduction, where the
00049 //      forces are reduced in parallel (per patch), accumulating output from
00050 //      each of the computes that contributes to each of the patches.  This
00051 //      offload pragma is asynchronous so that the host can continue to work.
00052 //  - mic_check_[remote|local]_kernel_complete : A function that polls for the
00053 //      completion of the main computation offload pragma.
00054 //
00055 // Force Computation (during steady state):
00056 // 
00057 //    As part of the force computation (first stage of computation on device),
00058 // several functions are used to "process" (calculate forces for) each compute
00059 // object.  These functions are basically the same (at least conceptually, but
00060 // have slight variations from one another based on the type of compute being
00061 // "processed" or the conditions for the timestep.  These functions include...
00062 //  - Base Names:
00063 //    - calc_pair : compute with two different input patches
00064 //    - calc_self : compute with a single input patch (i.e. patch1 == patch2)
00065 //  - Qualifiers:
00066 //    - "_energy" : calculate additional energy info during energy timesteps
00067 //    - "_fullelect" : calculate additional slow_forces during PME timesteps
00068 //  - Examples:
00069 //    - calc_pair : basic caclulations for a pair compute
00070 //    - calc_self_fullelect : basic + slow_force calculations for a self compute
00071 //
00072 //    These force computation functions are generated by including the header
00073 // file "ComputeNonbondedMICKernelBase.h" into "ComputeNonbondedMICKernel.C"
00074 // multiple times, each time with different "macro flags" setup.  The general
00075 // include structure is as follows (indentation indicates inclusion)...
00076 //  - ComputeNonbondedMICKernelBase.h
00077 //    - declares compute function body, along with function setup/cleanup
00078 //    - periodically does pairlist generation
00079 //    - includes ComputeNonbondedMICKernelBase2.h (one per type of interation,
00080 //      such as NORMAL, EXCLUDED, and MODIFIED).
00081 //      - refines pairlist to only include interactions within cutoff for the
00082 //        given timestep (if pairlist refinement is enabled)
00083 //      - includes ComputeNonbondedMICKernelBase2_XXXXX.h, where XXXXX is
00084 //        either "handcode" or "scalar", depending on the build's configuration
00085 //        - force computation loop, that processes each entry in the pairlist
00086 //          for the given type of interaction
00087 //
00088 //    For the most part, the MIC code is based on the host code.  The main
00089 // difference is the restructuring of the pairlists, and thus the force
00090 // computation loops.  For the MIC code, each pairlist entry is a combination
00091 // of both an "i" (outer-loop atom list) index and "j" (inner-loop atom list)
00092 // index that represents a pair of atoms that need to be checked this timestep
00093 // for their exact distance, and if close enough, interact with one another.
00094 // Correspondingly, the force computation loop is only a single loop, rather
00095 // than two nested loops (inner + outer), that iterates over pairlist entries.
00096 // As such, each compute has only one pairlist (per interaction type), rather
00097 // than many smaller pairlists for each outer-loop atom (one per interaction
00098 // type, per outer-loop atom).  There is still one force computation loop,
00099 // setup in ComputeNonbondedMICKernelBase2.h, per interaction type.
00100 //
00101 // Cleanup:
00102 //
00103 //    The cleanup function mic_free_device should be called, once per device, to
00104 // cleanup the various data structures used by the device (and the associated
00105 // data structures on the host) at the end of the overall simulation.
00106 //
00108 
00109 
00110 #include <stdlib.h>
00111 #include <stdio.h>
00112 #include <offload.h>
00113 #include <string.h>
00114 #include <stdint.h>
00115 #ifndef __MIC__
00116   #include "charm++.h"
00117 #endif
00118 
00119 
00121 // NAMD_MIC Configuration Macros
00122 
00123 // There are three main ways in which the force computation loop can be compiled,
00124 //   including those listed in the table below.
00125 //
00126 //                VERSION   MIC_HANDCODE_FORCE   MIC_PAD_PLGEN   ALSO RECOMMENDED ("set" these, others "unset")
00127 //    -------------------   ------------------   -------------   ------------------------------------------------------------------------------
00128 //                 scalar                   0               0    
00129 //    compiler-vectorized                   0               1    MIC_HANDCODE_PLGEN(1) (until auto vectorization of pairlist generation loop is working)
00130 //        hand-vectorized                   1             n/a    MIC_HANDCODE_FORCE_CALCR2TABLE(1), MIC_HANDCODE_FORCE_USEGATHER(1), MIC_HANDCODE_FORCE_SOA_VS_AOS(1)
00131 //              --- for all versions of the code ---             MULTIPLE_THREADS(1), MIC_SORT_ATOMS(1), MIC_HANDCODE_FORCE_SINGLE(1)
00132 
00133 // Flags to enable/disable the use of handcoding
00134 #define MIC_HANDCODE_FORCE                     ( 1 ) // 1 // 0 - disable, !0 - enable hand-coded force generation loop
00135   #define MIC_HANDCODE_FORCE_PFDIST            ( 0 )  // (currently broken!) 0 - disable, >=8 - enable software prefetch (value = prefetch distance)
00136   #define MIC_HANDCODE_FORCE_USEGATHER         ( 1 ) // 1 //
00137   #define MIC_HANDCODE_FORCE_USEGATHER_NBTBL   ( 0 )
00138   #define MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT  ( 0 )  // 0 - disable, !0 - enable loading of vdw type information prior to the cutoff check within the force loops
00139   #define MIC_HANDCODE_FORCE_COMBINE_FORCES    ( 1 ) // 1 // 0 - disable, !0 - enable a single set of force updates (short-fast and full), rather than two separate updates in the force loop body (allowing indexes, etc. to be reused for both)
00140 
00141 // Flags that were originally part of handcoding, but are now more general
00142 #define MIC_HANDCODE_FORCE_SINGLE            ( 1 ) // 1 // 0 - disable, !0 - enable use of single precision to calculate force magnitude in force loops (forces still accumulated in double precision)
00143 #define MIC_HANDCODE_FORCE_CALCR2TABLE       ( 1 ) // 1 // 0 - disable, !0 - enable r2_table value inline calculation (vs r2_table value loads)
00144 #define MIC_HANDCODE_FORCE_SOA_VS_AOS        ( 1 ) // 1 // 0 - SoA, !0 - AoS
00145 
00146 #define MIC_SORT_ATOMS                    ( 1 ) // 1 // 0 - disable, !0 - enable atom sorting within each patch
00147 #define MIC_SORT_LISTS                    ( 0 )  // (currently not working, leave at 0)
00148 
00149 // Pairlist generation controls
00150 #define MIC_HANDCODE_PLGEN                ( 1 ) // 1 // 0 - disable, !0 - enable hand-coded pairlist generation
00151 #define MIC_TILE_PLGEN                    ( 0 )  // 0 - disable, !0 - enable pairlist tiling (value = number of tiles to create for each atom list)
00152 #define MIC_CONDITION_NORMAL              ( 0 )  // 0 - disable, !0 - enable normal pairlist "conditioning" (enabling this flag will cause the normal pairlist to be separated into two sets, such that pairs that are likely to be within cutoff are grouped together and pairs that are less likely to be within cutoff... so vector lanes tend to all active or all not-active)
00153 #define MIC_PAD_PLGEN                     ( 1 ) // 1 // 0 - disable, !0 - enable pairlist padding
00154 
00155 // Flag to enable/disable the use of multiple threads within the force computation
00156 #define MULTIPLE_THREADS                  ( 1 ) // 1 // 0 - disable, !0 - use multiple threads when processing compute objects (a.k.a. patch pairs)
00157 
00158 #define MIC_SORT_COMPUTES                 ( 1 )
00159 
00160 #define MIC_SYNC_INPUT                    ( 1 )
00161 #define MIC_SYNC_OUTPUT                   ( 1 )
00162 
00163 #define MIC_SUBMIT_ATOMS_ON_ARRIVAL       ( 0 )
00164 
00165 // Flags used to enable/disable and control pairlist refinement
00166 #define REFINE_PAIRLISTS                  ( 0 )  // 0 - disable, !0 - enable pairlist refinement
00167   #define REFINE_PAIRLIST_HANDCODE        ( 0 )  // 0 - disable, !0 - use hand-coded version of the pairlist refinement code
00168   #define REFINE_PAIRLISTS_XYZ            ( 0 )  // 0 - disable, !0 - save dX (delta X), dY, and dZ values along with r2 values as part of the pairlist refinement output process
00169 
00170 #define MIC_PREFETCH_DISTANCE             ( 0 )
00171   #define MIC_PREFETCH_HINT               ( 3 )
00172 
00173 // Flag used to enable/disable various checks within and related to the MIC computation code
00174 #define MIC_FULL_CHECK                    ( 1 )  // 0 - disable, !0 - enable various checks use to help verify correct operation, but that are not required to perform the calculations themselves
00175 #define MIC_EXCL_CHECKSUM_FULL            ( 1 )  // NOTE: Mutually exclusive with MIC_EXCL_CHECKSUM
00176 #define MIC_DATA_STRUCT_VERIFY            ( 0 )
00177   #define MIC_DATA_STRUCT_VERIFY_KERNELS  ( 0 )
00178 #define MIC_DEVICE_FPRINTF                ( 0 )  // Print some debug info from the MICs directly to a device file on the MICs (for debug purposes)
00179   #define MIC_DEVICE_FPRINTF_REOPEN_FREQ  ( 0 )  // If >0, will cause the code to periodically reopen (and thus truncate) the device file, helping to limit the amount of memory it takes up
00180 #define MIC_DUMP_COMPUTE_MAPPING          ( 0 )  // 0: do nothing, !0: dump directToDevice info from ComputeMap
00181 #define MIC_TRACK_DEVICE_MEM_USAGE        ( 0 )  // 0: do nothing, !0: print memusage on the device every 100 timesteps
00182 #define MIC_HEARTBEAT                     ( 0 )
00183 
00184 // Alignment used for various buffers, values, etc., in bytes
00185 #define MIC_ALIGN                         ( 64 ) //4 * 1024 )  // NOTE : Must be > 0 and must be a multiple of the cacheline size (in bytes)
00186 
00187 #define MIC_MAX_DEVICES_PER_NODE          ( 16 )
00188 
00189 #define MIC_PRINT_CONFIG                  ( 0 )  // 0 - disable, !0 - enable the printing of configuration information (macros defined in this file) at the start of a simulation (TODO | NOTE: This is meant as a reference for repeating performance experiments... not required for production use... disable at some point)
00190 
00191 #define MIC_VERBOSE_HVD_LDB               ( 0 )  // 0 - disable, !0 - print extra info about HvD LDB
00192 
00193 #define MIC_DEBUG                         ( 0 )  // 0 - disable, !0 - enable extra debugging output to be printed to files (1 file per PE) via MICP statements
00194 
00195 #define NUM_PAIRLIST_TYPES                ( 4 )  // Number of pairlist types (i.e. multiplier used for mallocs, etc.)
00196   #define PL_NORM_INDEX                   ( 0 )  //   Index for normal pairlist
00197   #define PL_EXCL_INDEX                   ( 1 )  //   Index for excluded pairlist
00198   #define PL_MOD_INDEX                    ( 2 )  //   Index for modified pairlist
00199   #define PL_NORMHI_INDEX                 ( 3 )  //   Index for the "normal high" pairlist (temporary pairlist used for entries that are far apart, used to separate the sets of entries, close and far, used in normal pairlist conditioning... see MIC_CONDITION_NORMAL)
00200 
00201 // Projections tracing
00202 #define MIC_TRACING                       ( 1 )  // 0 - disable, !0 - enable tracing information to be collected on the host
00203   #define MIC_DEVICE_TRACING              ( 1 )  // 0 - disable, !0 - enable the collection of extra tracing information on the device itself (only high-level events, like the total time for all computes, for all patches, etc.) (requires MIC_TRACING to be set)
00204   #define MIC_DEVICE_TRACING_DETAILED     ( 0 )  // 0 - disable, !0 - enable the collection of more tracing information on the device, including start and stop times for each task (requires MIC_DEVICE_TRACING to be sent)
00205   #define MIC_TRACING_EVENT_BASE          ( 10000 )
00206   #define MIC_EVENT_OFFLOAD_REMOTE        ( MIC_TRACING_EVENT_BASE +  0 )
00207   #define MIC_EVENT_OFFLOAD_LOCAL         ( MIC_TRACING_EVENT_BASE +  1 )
00208   #define MIC_EVENT_OFFLOAD_PRAGMA        ( MIC_TRACING_EVENT_BASE +  2 )
00209   #define MIC_EVENT_OFFLOAD_POLLSET       ( MIC_TRACING_EVENT_BASE +  3 )
00210   #define MIC_EVENT_FUNC_DOWORK           ( MIC_TRACING_EVENT_BASE +  4 )
00211   #define MIC_EVENT_FUNC_FINISHWORK       ( MIC_TRACING_EVENT_BASE +  5 )
00212   #define MIC_EVENT_ATOMS_SUBMIT          ( MIC_TRACING_EVENT_BASE +  6 )
00213   #define MIC_EVENT_ATOMS_TRANSFER        ( MIC_TRACING_EVENT_BASE +  7 )
00214   #define MIC_EVENT_ATOMS_WAIT            ( MIC_TRACING_EVENT_BASE +  8 )
00215   #define MIC_EVENT_SYNC_INPUT_PRAGMA     ( MIC_TRACING_EVENT_BASE +  9 )
00216   #define MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA  ( MIC_TRACING_EVENT_BASE + 10 )
00217   #define MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA   ( MIC_TRACING_EVENT_BASE + 11 )
00218   #define MIC_EVENT_DEVICE_COMPUTES       ( MIC_TRACING_EVENT_BASE + 12 )
00219   #define MIC_EVENT_DEVICE_VIRIALS        ( MIC_TRACING_EVENT_BASE + 13 )
00220   #define MIC_EVENT_DEVICE_PATCHES        ( MIC_TRACING_EVENT_BASE + 14 )
00221   #define MIC_EVENT_DEVICE_PATCH          ( MIC_TRACING_EVENT_BASE + 15 )
00222   #define MIC_EVENT_DEVICE_COMPUTE        ( MIC_TRACING_EVENT_BASE + 16 )
00223   #define MIC_TRACING_NUM_EVENTS          ( 24 )
00224 
00225 // DMK - DEBUG
00226 #define MIC_KERNEL_DATA_TRANSFER_STATS    (  0 )
00227 
00228 __declspec(target(mic)) void* _mm_malloc_withPrint(size_t s, int a, char * l);
00229 __declspec(target(mic)) void _mm_free_withPrint(void * prt);
00230 
00231 // DMK - NOTE : Leaving these macros in as hooks for now
00232 //#ifdef __MIC__
00233 //  #define _MM_MALLOC_WRAPPER(s, a, l) _mm_malloc_withPrint((s), (a), (l))
00234 //  #define _MM_FREE_WRAPPER(p) _mm_free_withPrint(p)
00235 //  #define _MM_MALLOC_VERIFY
00236 //#else
00237   #define _MM_MALLOC_WRAPPER(s, a, l) _mm_malloc((s), (a))
00238   #define _MM_FREE_WRAPPER(p) _mm_free(p)
00239   #define _MM_MALLOC_VERIFY
00240 //#endif
00241 
00242 
00244 // Configuration Checks/Overrides
00245 
00246 // If not compiling for the MIC device, disable all MIC specific code
00247 #include <stdint.h>
00248 #if defined(__MIC__) || defined(__MIC2__)
00249   #include <immintrin.h>
00250 #else
00251   #undef MIC_HANDCODE_FORCE
00252   #define MIC_HANDCODE_FORCE            (0)
00253   #undef MIC_HANDCODE_PLGEN
00254   #define MIC_HANDCODE_PLGEN            (0)
00255 #endif
00256 
00257 // If the MIC_FULL_CHECK flag is set, enable checks for both __ASSUME and
00258 //   __ASSUME_ALIGNED to ensure the cases for these macros are actually true.
00259 //   Also ensure that MIC_EXCL_CHECKSUM_FULL is enabled.
00260 #if MIC_FULL_CHECK != 0
00261   #define __FULL_CHECK(X) X
00262   #define __ASSERT(c) assert(c)
00263   #define CHECK_ASSUME          (1)
00264   #define CHECK_ASSUME_ALIGNED  (1)
00265   #if (MIC_EXCL_CHECKSUM_FULL == 0)
00266     #warning "MIC_EXCL_CHECKSUM_FULL enabled (because MIC_FULL_CHECK was enabled)"
00267   #endif
00268   #undef MIC_EXCL_CHECKSUM_FULL
00269   #define MIC_EXCL_CHECKSUM_FULL  (1)
00270 #else
00271   #define __FULL_CHECK(X)
00272   #define __ASSERT(c)
00273   #define CHECK_ASSUME          (0)
00274   #define CHECK_ASSUME_ALIGNED  (0)
00275 #endif
00276 
00277 // Create a macro that can be used within the body of offload pragmas that asserts the
00278 //   offload section was executed on a MIC device
00279 #if defined(__MIC__)
00280   #define __MUST_BE_MIC
00281 #else
00282   #define __MUST_BE_MIC  __ASSERT(0)
00283 #endif
00284 
00285 // For now, if padding the pairlist with zeros, disable pairlist tiling
00286 #if (MIC_PAD_PLGEN != 0) && (MIC_HANDCODE_FORCE != 0)
00287   #if REFINE_PAIRLISTS != 0
00288     #warning "REFINE_PAIRLISTS disabled (because MIC_PAD_PLGEN && MIC_HANDCODE_FORCE were enabled; combination not supported yet)"
00289   #endif
00290   #undef REFINE_PAIRLISTS
00291   #define REFINE_PAIRLISTS  (0)
00292 #endif
00293 
00294 // If using the handcoded force loop with mixed precision, disable the use of pairlist refinement (not supported yet)
00295 #if MIC_HANDCODE_FORCE_SINGLE != 0
00296   #if REFINE_PAIRLISTS != 0
00297     #warning "REFINE_PAIRLISTS disabled (because MIC_HANDCODE_FORCE_SINGLE was enabled)"
00298   #endif
00299   #undef REFINE_PAIRLISTS
00300   #define REFINE_PAIRLISTS  (0)
00301 #endif
00302 
00303 // If AOS has been enabled, ensure that loading VDW upfront has been disabled (not supported, doesn't
00304 //   make much sense as x,y,z,q are in a different array than vdwType)
00305 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00306   #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
00307     #warning "MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT disabled (because MIC_HANDCODE_FORCE_SOA_VS_AOS was enabled; combination not supported)"
00308   #endif
00309   #undef MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT
00310   #define MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT (0)
00311 #endif
00312 
00313 // Check for a negative value specified as the prefetch distance (macro's value is used, so require positive)
00314 #if MIC_PREFETCH_DISTANCE < 0
00315   #warning "INVALID MIC_PREFETCH_DISTANCE value (disabling)"
00316   #undef MIC_PREFETCH_DISTANCE
00317   #define MIC_PREFETCH_DISTANCE (0)
00318 #endif
00319 
00320 // If tracing has been enabled, ensure that the other required tracing macros are properly set
00321 #if MIC_DEVICE_TRACING_DETAILED != 0
00322   #if MIC_DEVICE_TRACING == 0
00323     #warning "MIC_DEVICE_TRACING enabled (because MIC_DEVICE_TRACING_DETAILED was enabled)"
00324   #endif
00325   #undef MIC_DEVICE_TRACING
00326   #define MIC_DEVICE_TRACING  (1)
00327 #endif
00328 #if MIC_DEVICE_TRACING != 0
00329   #if MIC_TRACING == 0
00330     #warning "MIC_TRACING enabled (because MIC_DEVICE_TRACING was enabled)"
00331   #endif
00332   #undef MIC_TRACING
00333   #define MIC_TRACING  (1)
00334 #endif
00335 
00336 
00338 // Tracing Macros (Projections)
00339 
00340 #if MIC_TRACING != 0
00341 
00342   #define MIC_TRACING_REGISTER_EVENT(str, id) traceRegisterUserEvent(str, id)
00343 
00344   #define MIC_TRACING_REGISTER_EVENTS \
00345     MIC_TRACING_REGISTER_EVENT("MIC device offload (remote)", MIC_EVENT_OFFLOAD_REMOTE); \
00346     MIC_TRACING_REGISTER_EVENT("MIC device offload (local)", MIC_EVENT_OFFLOAD_LOCAL); \
00347     MIC_TRACING_REGISTER_EVENT("MIC offload pragma", MIC_EVENT_OFFLOAD_PRAGMA); \
00348     MIC_TRACING_REGISTER_EVENT("MIC polling set", MIC_EVENT_OFFLOAD_POLLSET); \
00349     MIC_TRACING_REGISTER_EVENT("MIC ::doWork", MIC_EVENT_FUNC_DOWORK); \
00350     MIC_TRACING_REGISTER_EVENT("MIC ::finishWork", MIC_EVENT_FUNC_FINISHWORK); \
00351     MIC_TRACING_REGISTER_EVENT("MIC atom submit", MIC_EVENT_ATOMS_SUBMIT); \
00352     MIC_TRACING_REGISTER_EVENT("MIC atom transfer", MIC_EVENT_ATOMS_TRANSFER); \
00353     MIC_TRACING_REGISTER_EVENT("MIC atom wait", MIC_EVENT_ATOMS_WAIT); \
00354     MIC_TRACING_REGISTER_EVENT("MIC sync input pragma", MIC_EVENT_SYNC_INPUT_PRAGMA); \
00355     MIC_TRACING_REGISTER_EVENT("MIC sync output remote pragma", MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA); \
00356     MIC_TRACING_REGISTER_EVENT("MIC sync output local pragma", MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA); \
00357     MIC_TRACING_REGISTER_EVENT("[MIC] computes", MIC_EVENT_DEVICE_COMPUTES); \
00358     MIC_TRACING_REGISTER_EVENT("[MIC] virials", MIC_EVENT_DEVICE_VIRIALS); \
00359     MIC_TRACING_REGISTER_EVENT("[MIC] patches", MIC_EVENT_DEVICE_PATCHES); \
00360     MIC_TRACING_REGISTER_EVENT("[MIC] patch", MIC_EVENT_DEVICE_PATCH); \
00361     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d0)", MIC_EVENT_DEVICE_COMPUTE + 0); \
00362     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d1)", MIC_EVENT_DEVICE_COMPUTE + 1); \
00363     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d2)", MIC_EVENT_DEVICE_COMPUTE + 2); \
00364     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d3)", MIC_EVENT_DEVICE_COMPUTE + 3); \
00365     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d4)", MIC_EVENT_DEVICE_COMPUTE + 4); \
00366     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d5)", MIC_EVENT_DEVICE_COMPUTE + 5); \
00367     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d6)", MIC_EVENT_DEVICE_COMPUTE + 6); \
00368     MIC_TRACING_REGISTER_EVENT("[MIC] compute (d7+)", MIC_EVENT_DEVICE_COMPUTE + 7); \
00369     \
00370     MIC_TRACING_REGISTER_EVENT("Pair", 11000); \
00371     MIC_TRACING_REGISTER_EVENT("Self", 11001); \
00372     MIC_TRACING_REGISTER_EVENT("PME", 11002); \
00373     MIC_TRACING_REGISTER_EVENT("Tuple", 11003); \
00374     MIC_TRACING_REGISTER_EVENT("Box Closing", 11010); \
00375     MIC_TRACING_REGISTER_EVENT("Positions Ready", 11020); \
00376     MIC_TRACING_REGISTER_EVENT("Post Run Computes", 11021); \
00377     MIC_TRACING_REGISTER_EVENT("Pre Run Computes", 11022); \
00378     MIC_TRACING_REGISTER_EVENT("Reduction Submit", 11023); \
00379     MIC_TRACING_REGISTER_EVENT("MIC Atom Work", 11030); \
00380     MIC_TRACING_REGISTER_EVENT("Patch::positionsReady", 11031); \
00381     MIC_TRACING_REGISTER_EVENT("Atom send", 11032); \
00382     MIC_TRACING_REGISTER_EVENT("DEBUG 0", 11040); \
00383     MIC_TRACING_REGISTER_EVENT("DEBUG 1", 11041); \
00384     MIC_TRACING_REGISTER_EVENT("DEBUG 2", 11042); \
00385     MIC_TRACING_REGISTER_EVENT("DEBUG 3", 11043); \
00386     MIC_TRACING_REGISTER_EVENT("DEBUG 4", 11044); \
00387     MIC_TRACING_REGISTER_EVENT("recvImmediateProxyData", 11046); \
00388     MIC_TRACING_REGISTER_EVENT("msg copy send", 11047); \
00389     MIC_TRACING_REGISTER_EVENT("ProxyMgr::sendProxyData", 11048); \
00390     MIC_TRACING_REGISTER_EVENT("MIC box close", 11050); \
00391     MIC_TRACING_REGISTER_EVENT("MIC reduction submit", 11051); \
00392     MIC_TRACING_REGISTER_EVENT("WorkDistrib compute", 11070); \
00393     MIC_TRACING_REGISTER_EVENT("doWork::work", 11080); \
00394     MIC_TRACING_REGISTER_EVENT("doWork::progress", 11081); 
00395 
00396   #define MIC_TRACING_RECORD(id, start, stop) traceUserBracketEvent(id, start, stop)
00397 
00398   extern void traceUserBracketEvent(int, double, double);
00399   extern double CmiWallTimer();
00400 
00401 #else
00402 
00403   #define MIC_TRACING_REGISTER_EVENT(str, id)
00404   #define MIC_TRACING_REGISTER_EVENTS
00405   #define MIC_TRACING_RECORD(id, start, stop)
00406 
00407 #endif
00408   
00409 
00411 // Debug functions
00412 
00413 
00414 extern void debugInit(FILE* fout);
00415 extern void debugClose();
00416 #if MIC_DEBUG != 0
00417   extern __thread FILE* mic_output;
00418   extern double CmiWallTimer();
00419   extern int CmiMyPe();
00420   #define MICP(fmt, ...)  fprintf(((mic_output == NULL) ? (stdout) : (mic_output)), "[%013.6lf, %05d] " fmt, CmiWallTimer(), CmiMyPe(), ##__VA_ARGS__)
00421   #define MICPF           fflush((mic_output != NULL) ? (mic_output) : (stdout))
00422 #else
00423   #define MICP(fmt, ...)
00424   #define MICPF
00425 #endif
00426 // NOTE: The bodies are defined in ComputeNonbondMIC.C and these should only
00427 //   be used in host code.
00428 
00429 #if MIC_DEVICE_FPRINTF != 0
00430   #define DEVICE_FPRINTF(fmt, ...) if (device__fout != NULL) { fprintf(device__fout, fmt, ##__VA_ARGS__); fflush(device__fout); }
00431   #define DEVICE_FPRINTF_CLAUSE  nocopy(device__fout)
00432 #else
00433   #define DEVICE_FPRINTF(fmt, ...)
00434   #define DEVICE_FPRINTF_CLAUSE
00435 #endif
00436 
00437 
00439 // Shared Types
00440 
00441 #pragma offload_attribute (push, target(mic))
00442 
00443 //this type defined in multiple files
00444 typedef float GBReal;
00445 
00446 struct  float3 {  float x,y,z  ; };
00447 struct  float4 {  float x,y,z,w; };
00448 struct double3 { double x,y,z  ; };
00449 struct double4 { double x,y,z,w; };
00450 struct    int2 {    int x,y    ; };
00451 struct    int3 {    int x,y,z  ; };
00452 struct    int4 {    int x,y,z,w; };
00453 
00454 typedef float mic_position_t;
00455 typedef float3 mic_position3_t;
00456 typedef float4 mic_position4_t;
00457 
00458 // Data structure that represents a compute object/task on the MIC device.  An
00459 //   array of these data structures will be passed down to the device, where each
00460 //   is processed in parallel.  Note that for self computes, patch1 and patch2
00461 //   data should match.
00462 struct patch_pair {  // must be multiple of 16!
00463 
00464   mic_position4_t offset;               // Offset of patches (center to center)
00465 
00466   unsigned int patch1_size;             // Number of atoms for patch 1
00467   unsigned int patch1_force_size;       // Number of non-fixed atoms at start of list
00468   unsigned int patch1_atom_start;       // Start of patch 1 data within atoms array
00469   unsigned int patch1_force_start;      // Start of compute-specific force buffer in split-up force array (i.e. sub-buffer per compute-patch)
00470   unsigned int patch1_force_list_index; // Index for patch 1's force information in force lists array
00471   unsigned int patch1_force_list_size;  // Number of contributers to patch 1's force buffer
00472 
00473   unsigned int patch2_size;             // Number of atoms for patch 2
00474   unsigned int patch2_force_size;       // Number of non-fixed atoms at start of list
00475   unsigned int patch2_atom_start;       // Start of patch 2 data within atoms array
00476   unsigned int patch2_force_start;      // Start of compute-specific force buffer in split-up force array (i.e. sub-buffer per compute-patch)
00477   unsigned int patch2_force_list_index; // Index for patch 2's force information in force lists array
00478   unsigned int patch2_force_list_size;  // Number of contributers to patch 1's force buffer
00479 
00480   double plcutoff;                      // Pairlist cutoff distance
00481 
00482   unsigned int block_flags_start;
00483   unsigned int virial_start;            // virial output location padded to 16
00484 
00485   int numParts;
00486   int part;
00487 
00488   #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
00489     uint64_t patch1_atomDataPtr;
00490     uint64_t patch2_atomDataPtr;
00491   #endif
00492 
00493   // DMK - DEBUG
00494   mic_position_t patch1_center_x;
00495   mic_position_t patch1_center_y;
00496   mic_position_t patch1_center_z;
00497   mic_position_t patch2_center_x;
00498   mic_position_t patch2_center_y;
00499   mic_position_t patch2_center_z;
00500 
00501   // DMK - DEBUG
00502   int p1;
00503   int p2;
00504   int cid;
00505 
00506   int __padding0__[1];
00507 };
00508 
00509 // Data structure used to associate individual force outputs (per compute object)
00510 //   to the force outputs for patches (per patch object).
00511 struct force_list {  // must be multiple of 16!
00512   unsigned int force_list_start;     // beginning of compute output
00513   unsigned int force_list_size;      // number of computes for this patch
00514   unsigned int patch_size;           // real number of atoms in patch
00515   unsigned int patch_stride;         // padded number of atoms in patch
00516   unsigned int force_output_start;   // output array
00517   unsigned int atom_start;           // atom positions
00518   unsigned int virial_list_start;    // beginning of compute virial output
00519   unsigned int virial_output_start;  // virial output location padded to 16
00520 };
00521 
00522 // Data structure for the portion of the atom information that is updated on the
00523 //   device each timestep.
00524 // NOTE: This data structure must match the CudaAtom data structure.
00525 // TODO: Modify the code in the patchs to use this data structure rather than CudaAtom.
00526 struct atom {
00527   //mic_position3_t position; // x, y, z
00528   mic_position_t x;
00529   mic_position_t y;
00530   mic_position_t z;
00531   mic_position_t charge;    // q
00532 };
00533 
00534 // Data structure for the portion of the atom information that is updated
00535 //   periodically on the device (e.g. after atom migration).
00536 struct atom_param {
00537   int vdw_type; // NOTE: The handcoded version of the MIC code relies on this member ("vdw_type") being the first of four ints
00538   int index;
00539   int excl_index;
00540   int excl_maxdiff;  // NOTE: if maxdiff == 0, then only excluded from self
00541 };
00542 
00543 // Data structure containing several constants that are used by the force
00544 //   computation functions.
00545 struct mic_constants {
00546 
00547   double cutoff2;
00548   double dielectric_1;
00549   double scaling;
00550   double scale14;
00551   double modf_mod;
00552   double r2_delta;
00553 
00554   int r2_delta_exp;
00555   int r2_delta_expc;
00556 
00557   int commOnly;
00558   int singleKernelFlag;
00559 };
00560 
00561 // Data structure for each invocation of the offload pragma in mic_nonbonded_force,
00562 //   used to transfer per timestep information related to the kernels each timestep.
00563 struct mic_kernel_data {
00564 
00565   // Inputs
00566   int isRemote;
00567   int numLocalAtoms;
00568   int numLocalComputes;
00569   int numLocalPatches;
00570   int doSlow;
00571   int doEnergy;
00572   int usePairlists;
00573   int savePairlists;
00574 
00575   mic_position3_t lata;
00576   mic_position3_t latb;
00577   mic_position3_t latc;
00578   int numAtoms;
00579   int numPatchPairs;
00580   int numForceLists;
00581   size_t forceBuffersReqSize;
00582 
00583   // Outputs
00584   double virial_xx;
00585   double virial_xy;
00586   double virial_xz;
00587   double virial_yy;
00588   double virial_yz;
00589   double virial_zz;
00590   double fullElectVirial_xx;
00591   double fullElectVirial_xy;
00592   double fullElectVirial_xz;
00593   double fullElectVirial_yy;
00594   double fullElectVirial_yz;
00595   double fullElectVirial_zz;
00596   double vdwEnergy;
00597   double electEnergy;
00598   double fullElectEnergy;
00599 
00600   int exclusionSum;
00601 
00602   int __padding_0__;
00603 };
00604 
00605 // Data structure that is setup each time a compute object is about to be processed
00606 //   on the device and passed into the force computation function (e.g. calc_pair)
00607 //   that is being called for the given compute on the given timestep.  NOTE: When
00608 //   the computes are being processed on the device, there is a one-to-one relationship
00609 //   between patch_pairs and mic_params (one per iteration of the parallel for
00610 //   processing the computes).
00611 struct mic_params {
00612   // DMK - TODO | FIXME : Reorder fields to pack data into less memory (if needed)
00613 
00614   patch_pair *pp;            // Pointer to the associated patch_pair structure
00615   force_list *fl[2];         // Pointers to force list structures (one per input patch)
00616   mic_constants *constants;  // Pointer to the mic_constants structure used to process the compute
00617   int **pairlists_ptr;       // Pointer to short array of pointers, one entry per type of pairlist
00618   int usePairlists;          // Flag if using pairlist (should be the case for MIC)
00619   int savePairlists;         // Flag if need to save (generate) pairlists this timestep
00620 
00621   // If refining pairlists is enabled, these are pointers to other pointers/scalars so
00622   //   that they can be maintained across calls/timesteps
00623   #if REFINE_PAIRLISTS != 0
00624     int** plArray_ptr;     // Pointer to the scratch buffer pointer for pairlist entries (so can be maintained across calls)
00625     int* plSize_ptr;       // Pointer to the length of the plArray scratch buffer
00626     double** r2Array_ptr;  // Pointer to the scratch buffer pointer for r2 (radius squared) values
00627   #endif
00628 
00629   void *table_four_base_ptr;     // Base pointer for nonbonded force table
00630   void *lj_table_base_ptr;       // Base pointer for LJ table data
00631   int table_four_n_16;           // Sub-array length for nonbonded force table
00632   int lj_table_dim;              // Dimention of the LJ table
00633   unsigned int *exclusion_bits;  // Pointer to the exclusion table
00634 
00635   mic_position4_t offset;  // Offset between the patches for pair computes
00636   int numAtoms[2];         // Atom counts for each patch
00637   int numAtoms_16[2];      // Atom counts for each patch, rounded up to nearest multiple of 16
00638 
00639   // Atom information arrays (input), formatted as SOA, with one pointer per input patch (1 for self, 2 for pair)
00640   #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
00641 
00642     atom* p[2];
00643     atom_param* pExt[2];
00644     double4* ff[2];
00645     double4* fullf[2];
00646 
00647   #else
00648 
00649     mic_position_t *p_x[2];    // Atom's position.x information
00650     mic_position_t *p_y[2];    // Atom's position.y information
00651     mic_position_t *p_z[2];    // Atom's position.z information
00652     mic_position_t *p_q[2];    // Atom's charge information
00653     int *pExt_vdwType[2];      // Atom's Van Der Waal information
00654     int *pExt_index[2];        // Atom's global index (i.e. vs the index in this array)
00655     int *pExt_exclIndex[2];    // Atom's exclusion table offset
00656     int *pExt_exclMaxDiff[2];  // Atom's exclusion table entry size (# entries = 2 * maxDiff + 1)
00657   
00658     // Atom force arrays (output), formatted as SOA, with one pointer per input patch (1 for self, 2 for pair).
00659     //   The "ff" arrays are forces for each timestep, while "fullf" arrays are slow_forces calculated during
00660     //   PME timesteps.  Otherwise, just x, y, z, and w (exclusion accumulators) fields.
00661     double *ff_x[2];
00662     double *ff_y[2];
00663     double *ff_z[2];
00664     double *ff_w[2];
00665     double *fullf_x[2];
00666     double *fullf_y[2];
00667     double *fullf_z[2];
00668     double *fullf_w[2];
00669 
00670   #endif
00671   
00672   // Virial and energy output for a given compute function ("virial" = every timestep,
00673   //   "fullElectVirial" = PME timesteps, "Energy" = energy timesteps) that is added
00674   //   to a global reduction.
00675   double virial_xx;
00676   double virial_xy;
00677   double virial_xz;
00678   double virial_yy;
00679   double virial_yz;
00680   double virial_zz;
00681   double fullElectVirial_xx;
00682   double fullElectVirial_xy;
00683   double fullElectVirial_xz;
00684   double fullElectVirial_yy;
00685   double fullElectVirial_yz;
00686   double fullElectVirial_zz;
00687   double vdwEnergy;
00688   double electEnergy;
00689   double fullElectEnergy;
00690 
00691   int doSlow;  // Flag that indicates if slow_forces should be calculated this timestep or not
00692 
00693   // DMK - DEBUG - Record the center of each patch
00694   mic_position_t patch1_center_x;
00695   mic_position_t patch1_center_y;
00696   mic_position_t patch1_center_z;
00697   mic_position_t patch2_center_x;
00698   mic_position_t patch2_center_y;
00699   mic_position_t patch2_center_z;
00700 
00701   // DMK - DEBUG
00702   int ppI;
00703   int p1;
00704   int p2;
00705   int pe;
00706   int timestep;
00707 
00708   int exclusionSum;
00709 
00710   int __padding0__[1];
00711 };
00712 
00713 #pragma offload_attribute (pop)
00714 
00715 
00717 // Functions
00718 
00719 // NOTE: Also see comments for these functions in ComputeNonbondedMICKernel.C.
00720 
00721 // A function to initialize variables, etc. related to the device's operation.
00722 void mic_init_device(const int pe, const int node, const int deviceNum);
00723 
00724 // A functions to free variables, etc. related to the device's operation.
00725 void mic_free_device(const int deviceNum);
00726 
00727 // A function to test if the function's execution is actually offloaded to
00728 //   a device (i.e. test if offloading is working correctly).
00729 //   Returns 1 if offloading is working for device 'dev', and 0 if not.
00730 int mic_check(int deviceNum);
00731 
00732 // A function that transfers the nonbonded force table to the device
00733 void mic_bind_table_four(const int deviceNum,
00734                          const double * table_four,
00735                          const int table_four_n,
00736                          const int table_four_n_16
00737                         );
00738 
00739 // A function that transfers the Lennard-Jones table to the device
00740 void mic_bind_lj_table(const int deviceNum,
00741                        const char * lj_table,
00742                        const int lj_table_dim,
00743                        const int lj_table_size
00744                       );
00745 
00746 // A function that transfers the exclusion table to the device
00747 void mic_bind_exclusions(const int deviceNum,
00748                          unsigned int *exclusion_bits,
00749                          const long int exclusion_bits_size
00750                         );
00751 
00752 // A function that transfers various constants to the device
00753 void mic_bind_constants(const int deviceNum,
00754                         const double cutoff2,
00755                         const double dielectric_1,
00756                         const double scaling,
00757                         const double scale14,
00758                         const double r2_delta,
00759                         const int r2_delta_exp,
00760                         const int commOnly
00761                        );
00762 
00763 
00764 
00765 void mic_bind_patch_pairs_only(const int deviceNum,
00766                                const patch_pair *patch_pairs,
00767                                const int patch_pairs_size,
00768                                const int patch_pairs_bufSize
00769                               );
00770 
00771 void mic_bind_force_lists_only(const int deviceNum,
00772                                force_list *force_lists,
00773                                const int force_lists_size,
00774                                const int force_lists_bufSize
00775                               );
00776 
00777 void mic_bind_atoms_only(const int deviceNum,
00778                          atom *atoms,
00779                          atom_param *atom_params,
00780                          double4 *forces,
00781                          double4 *slow_forces,
00782                          const int atoms_size,
00783                          const int atoms_bufSize
00784                         );
00785 
00786 void mic_bind_force_buffers_only(const int deviceNum, const size_t force_buffers_size);
00787 
00788 
00789 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
00790 void mic_submit_patch_data(const int deviceNum,
00791                            void * const hostPtr,
00792                            void* &hostPtr_prev,
00793                            const int transferBytes,
00794                            const int allocBytes_host,
00795                            int &allocBytes_device,
00796                            uint64_t &devicePtr,
00797                            void* &signal
00798                           );
00799 #endif  // MIC_SUBMIT_ATOMS_ON_ARRIVAL
00800 
00801 
00802 // A function that initiates the force computation that is done on the device
00803 //   each timestep, resulting in forces being transferred from the device
00804 //   to the host.
00805 void mic_nonbonded_forces(const int deviceNum,
00806                           const int isRemote,
00807                           const int numLocalAtoms,
00808                           const int numLocalComputes,
00809                           const int numLocalPatches,
00810                           const mic_position3_t lata,
00811                           const mic_position3_t latb,
00812                           const mic_position3_t latc,
00813                           const int doSlow,
00814                           const int doEnergy,
00815                           const int usePairlists,
00816                           const int savePairlists,
00817                           const int atomsChanged
00818                          );
00819 
00820 void mic_transfer_output(const int deviceNum,
00821                          const int isRemote,
00822                          const int numLocalAtoms,
00823                          const int doSlow
00824                         );
00825 
00826 // Functions that are used to poll the for the completion of the asynchronous
00827 //   calculation initiated by mic_nonboned_forces.
00828 int mic_check_remote_kernel_complete(const int deviceNum);
00829 int mic_check_local_kernel_complete(const int deviceNum);
00830 
00831 // DMK - DEBUG
00832 void _mic_print_stats(const int deviceNum);
00833 void mic_dumpHostDeviceComputeMap();
00834 
00835 
00836 // DMK - DEBUG
00837 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
00838 
00839 __declspec(target(mic))
00840 typedef struct _mem_info {
00841   size_t memTotal;
00842   size_t memFree;
00843   size_t cached;
00844   size_t active;
00845   size_t inactive;
00846   size_t vmSize;
00847   size_t vmPeak;
00848 } MemInfo;
00849 
00850 __declspec(target(mic))
00851 void printMemInfo(int device__pe, int device__timestep, MemInfo * mi);
00852 
00853 __declspec(target(mic))
00854 void readMemInfo_processLine(MemInfo * memInfo, char * n, char * v, char * u);
00855 
00856 __declspec(target(mic))
00857 bool readMemInfo(MemInfo * memInfo);
00858 
00859 #endif  // MIC_TRACK_DEVICE_MEM_USAGE != 0
00860 
00861 
00862 #endif  // NAMD_MIC
00863 #endif  // COMPUTE_NONBONDED_MIC_KERNEL_H

Generated on Mon Nov 20 01:17:11 2017 for NAMD by  doxygen 1.4.7