NAMD
ComputeNonbondedMICKernel.h
Go to the documentation of this file.
1 #ifndef COMPUTE_NONBONDED_MIC_KERNEL_H
2 #define COMPUTE_NONBONDED_MIC_KERNEL_H
3 
4 
5 // If this is a MIC build for NAMD, then include the contents of this file
6 #ifdef NAMD_MIC
7 
8 
10 // General Code Description
11 //
12 // The files beginning with "ComputeNonbondedMIC" are MIC-specific files that
13 // are involved in supporting MIC within the NAMD application. Below is a
14 // general description of that support.
15 //
16 // Startup:
17 //
18 // During the startup phase of the application, there are several functions
19 // that are called, once per device. First, the function mic_check is used to
20 // detect if the MIC devices that are to be used available to the application
21 // (by default, all devices are used, but the "+devices" command line parameter
22 // can be used to select specific devices). Then, the mic_init_device function
23 // is then used to initialize each of the MIC devices.
24 //
25 // Also during startup, some constant tables/variables that will be used
26 // during the steady state of the application are transfered to the device.
27 // These functions, again called once per device, include...
28 // - mic_bind_table_four : Nonbonded table (includes several sub-tables)
29 // - mic_bind_lj_table : Lennard-Jones (LJ) table
30 // - mic_bind_exclusions : Table of exclusion flags
31 // - mic_bind_constants : Several required constant scalar values
32 //
33 // Steady State (timesteps executing):
34 //
35 // During the steady state of the application, when timesteps are executing
36 // one after the next, several functions are used to transfer data to the MIC
37 // device, including...
38 // - mic_bind_patch_pairs : Updates buffers that only need to be updated on
39 // the device(s) periodically, usually because of some major event such
40 // as atom migration or load balancing (computes shifting between PEs).
41 // - mic_nonbonded_forces : Used to initiate computation on the MIC device.
42 // Various buffers that are required to be transfered each timestep,
43 // such as atoms (input: atom position info) or forces (output: atom
44 // force info), along with buffers that are only periodically transfered,
45 // such as atom_params (input: misc. atom info). The body of this function
46 // includes an offload pragma that is broken up into two stages. The first
47 // is force computation where the computes (patch_pairs) are executed in
48 // parallel with one another. The second is force reduction, where the
49 // forces are reduced in parallel (per patch), accumulating output from
50 // each of the computes that contributes to each of the patches. This
51 // offload pragma is asynchronous so that the host can continue to work.
52 // - mic_check_[remote|local]_kernel_complete : A function that polls for the
53 // completion of the main computation offload pragma.
54 //
55 // Force Computation (during steady state):
56 //
57 // As part of the force computation (first stage of computation on device),
58 // several functions are used to "process" (calculate forces for) each compute
59 // object. These functions are basically the same (at least conceptually, but
60 // have slight variations from one another based on the type of compute being
61 // "processed" or the conditions for the timestep. These functions include...
62 // - Base Names:
63 // - calc_pair : compute with two different input patches
64 // - calc_self : compute with a single input patch (i.e. patch1 == patch2)
65 // - Qualifiers:
66 // - "_energy" : calculate additional energy info during energy timesteps
67 // - "_fullelect" : calculate additional slow_forces during PME timesteps
68 // - Examples:
69 // - calc_pair : basic caclulations for a pair compute
70 // - calc_self_fullelect : basic + slow_force calculations for a self compute
71 //
72 // These force computation functions are generated by including the header
73 // file "ComputeNonbondedMICKernelBase.h" into "ComputeNonbondedMICKernel.C"
74 // multiple times, each time with different "macro flags" setup. The general
75 // include structure is as follows (indentation indicates inclusion)...
76 // - ComputeNonbondedMICKernelBase.h
77 // - declares compute function body, along with function setup/cleanup
78 // - periodically does pairlist generation
79 // - includes ComputeNonbondedMICKernelBase2.h (one per type of interation,
80 // such as NORMAL, EXCLUDED, and MODIFIED).
81 // - refines pairlist to only include interactions within cutoff for the
82 // given timestep (if pairlist refinement is enabled)
83 // - includes ComputeNonbondedMICKernelBase2_XXXXX.h, where XXXXX is
84 // either "handcode" or "scalar", depending on the build's configuration
85 // - force computation loop, that processes each entry in the pairlist
86 // for the given type of interaction
87 //
88 // For the most part, the MIC code is based on the host code. The main
89 // difference is the restructuring of the pairlists, and thus the force
90 // computation loops. For the MIC code, each pairlist entry is a combination
91 // of both an "i" (outer-loop atom list) index and "j" (inner-loop atom list)
92 // index that represents a pair of atoms that need to be checked this timestep
93 // for their exact distance, and if close enough, interact with one another.
94 // Correspondingly, the force computation loop is only a single loop, rather
95 // than two nested loops (inner + outer), that iterates over pairlist entries.
96 // As such, each compute has only one pairlist (per interaction type), rather
97 // than many smaller pairlists for each outer-loop atom (one per interaction
98 // type, per outer-loop atom). There is still one force computation loop,
99 // setup in ComputeNonbondedMICKernelBase2.h, per interaction type.
100 //
101 // Cleanup:
102 //
103 // The cleanup function mic_free_device should be called, once per device, to
104 // cleanup the various data structures used by the device (and the associated
105 // data structures on the host) at the end of the overall simulation.
106 //
108 
109 
110 #include <stdlib.h>
111 #include <stdio.h>
112 #include <offload.h>
113 #include <string.h>
114 #include <stdint.h>
115 #ifndef __MIC__
116  #include "charm++.h"
117 #endif
118 
119 
121 // NAMD_MIC Configuration Macros
122 
123 // There are three main ways in which the force computation loop can be compiled,
124 // including those listed in the table below.
125 //
126 // VERSION MIC_HANDCODE_FORCE MIC_PAD_PLGEN ALSO RECOMMENDED ("set" these, others "unset")
127 // ------------------- ------------------ ------------- ------------------------------------------------------------------------------
128 // scalar 0 0
129 // compiler-vectorized 0 1 MIC_HANDCODE_PLGEN(1) (until auto vectorization of pairlist generation loop is working)
130 // hand-vectorized 1 n/a MIC_HANDCODE_FORCE_CALCR2TABLE(1), MIC_HANDCODE_FORCE_USEGATHER(1), MIC_HANDCODE_FORCE_SOA_VS_AOS(1)
131 // --- for all versions of the code --- MULTIPLE_THREADS(1), MIC_SORT_ATOMS(1), MIC_HANDCODE_FORCE_SINGLE(1)
132 
133 // Flags to enable/disable the use of handcoding
134 #define MIC_HANDCODE_FORCE ( 1 ) // 1 // 0 - disable, !0 - enable hand-coded force generation loop
135  #define MIC_HANDCODE_FORCE_PFDIST ( 0 ) // (currently broken!) 0 - disable, >=8 - enable software prefetch (value = prefetch distance)
136  #define MIC_HANDCODE_FORCE_USEGATHER ( 1 ) // 1 //
137  #define MIC_HANDCODE_FORCE_USEGATHER_NBTBL ( 0 )
138  #define MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT ( 0 ) // 0 - disable, !0 - enable loading of vdw type information prior to the cutoff check within the force loops
139  #define MIC_HANDCODE_FORCE_COMBINE_FORCES ( 1 ) // 1 // 0 - disable, !0 - enable a single set of force updates (short-fast and full), rather than two separate updates in the force loop body (allowing indexes, etc. to be reused for both)
140 
141 // Flags that were originally part of handcoding, but are now more general
142 #define MIC_HANDCODE_FORCE_SINGLE ( 1 ) // 1 // 0 - disable, !0 - enable use of single precision to calculate force magnitude in force loops (forces still accumulated in double precision)
143 #define MIC_HANDCODE_FORCE_CALCR2TABLE ( 1 ) // 1 // 0 - disable, !0 - enable r2_table value inline calculation (vs r2_table value loads)
144 #define MIC_HANDCODE_FORCE_SOA_VS_AOS ( 1 ) // 1 // 0 - SoA, !0 - AoS
145 
146 #define MIC_SORT_ATOMS ( 1 ) // 1 // 0 - disable, !0 - enable atom sorting within each patch
147 #define MIC_SORT_LISTS ( 0 ) // (currently not working, leave at 0)
148 
149 // Pairlist generation controls
150 #define MIC_HANDCODE_PLGEN ( 1 ) // 1 // 0 - disable, !0 - enable hand-coded pairlist generation
151 #define MIC_TILE_PLGEN ( 0 ) // 0 - disable, !0 - enable pairlist tiling (value = number of tiles to create for each atom list)
152 #define MIC_CONDITION_NORMAL ( 0 ) // 0 - disable, !0 - enable normal pairlist "conditioning" (enabling this flag will cause the normal pairlist to be separated into two sets, such that pairs that are likely to be within cutoff are grouped together and pairs that are less likely to be within cutoff... so vector lanes tend to all active or all not-active)
153 #define MIC_PAD_PLGEN ( 1 ) // 1 // 0 - disable, !0 - enable pairlist padding
154 
155 // Flag to enable/disable the use of multiple threads within the force computation
156 #define MULTIPLE_THREADS ( 1 ) // 1 // 0 - disable, !0 - use multiple threads when processing compute objects (a.k.a. patch pairs)
157 
158 #define MIC_SORT_COMPUTES ( 1 )
159 
160 #define MIC_SYNC_INPUT ( 1 )
161 #define MIC_SYNC_OUTPUT ( 1 )
162 
163 #define MIC_SUBMIT_ATOMS_ON_ARRIVAL ( 0 )
164 
165 // Flags used to enable/disable and control pairlist refinement
166 #define REFINE_PAIRLISTS ( 0 ) // 0 - disable, !0 - enable pairlist refinement
167  #define REFINE_PAIRLIST_HANDCODE ( 0 ) // 0 - disable, !0 - use hand-coded version of the pairlist refinement code
168  #define REFINE_PAIRLISTS_XYZ ( 0 ) // 0 - disable, !0 - save dX (delta X), dY, and dZ values along with r2 values as part of the pairlist refinement output process
169 
170 #define MIC_PREFETCH_DISTANCE ( 0 )
171  #define MIC_PREFETCH_HINT ( 3 )
172 
173 // Flag used to enable/disable various checks within and related to the MIC computation code
174 #define MIC_FULL_CHECK ( 1 ) // 0 - disable, !0 - enable various checks use to help verify correct operation, but that are not required to perform the calculations themselves
175 #define MIC_EXCL_CHECKSUM_FULL ( 1 ) // NOTE: Mutually exclusive with MIC_EXCL_CHECKSUM
176 #define MIC_DATA_STRUCT_VERIFY ( 0 )
177  #define MIC_DATA_STRUCT_VERIFY_KERNELS ( 0 )
178 #define MIC_DEVICE_FPRINTF ( 0 ) // Print some debug info from the MICs directly to a device file on the MICs (for debug purposes)
179  #define MIC_DEVICE_FPRINTF_REOPEN_FREQ ( 0 ) // If >0, will cause the code to periodically reopen (and thus truncate) the device file, helping to limit the amount of memory it takes up
180 #define MIC_DUMP_COMPUTE_MAPPING ( 0 ) // 0: do nothing, !0: dump directToDevice info from ComputeMap
181 #define MIC_TRACK_DEVICE_MEM_USAGE ( 0 ) // 0: do nothing, !0: print memusage on the device every 100 timesteps
182 #define MIC_HEARTBEAT ( 0 )
183 
184 // Alignment used for various buffers, values, etc., in bytes
185 #define MIC_ALIGN ( 64 ) //4 * 1024 ) // NOTE : Must be > 0 and must be a multiple of the cacheline size (in bytes)
186 
187 #define MIC_MAX_DEVICES_PER_NODE ( 16 )
188 
189 #define MIC_PRINT_CONFIG ( 0 ) // 0 - disable, !0 - enable the printing of configuration information (macros defined in this file) at the start of a simulation (TODO | NOTE: This is meant as a reference for repeating performance experiments... not required for production use... disable at some point)
190 
191 #define MIC_VERBOSE_HVD_LDB ( 0 ) // 0 - disable, !0 - print extra info about HvD LDB
192 
193 #define MIC_DEBUG ( 0 ) // 0 - disable, !0 - enable extra debugging output to be printed to files (1 file per PE) via MICP statements
194 
195 #define NUM_PAIRLIST_TYPES ( 4 ) // Number of pairlist types (i.e. multiplier used for mallocs, etc.)
196  #define PL_NORM_INDEX ( 0 ) // Index for normal pairlist
197  #define PL_EXCL_INDEX ( 1 ) // Index for excluded pairlist
198  #define PL_MOD_INDEX ( 2 ) // Index for modified pairlist
199  #define PL_NORMHI_INDEX ( 3 ) // Index for the "normal high" pairlist (temporary pairlist used for entries that are far apart, used to separate the sets of entries, close and far, used in normal pairlist conditioning... see MIC_CONDITION_NORMAL)
200 
201 // Projections tracing
202 #define MIC_TRACING ( 1 ) // 0 - disable, !0 - enable tracing information to be collected on the host
203  #define MIC_DEVICE_TRACING ( 1 ) // 0 - disable, !0 - enable the collection of extra tracing information on the device itself (only high-level events, like the total time for all computes, for all patches, etc.) (requires MIC_TRACING to be set)
204  #define MIC_DEVICE_TRACING_DETAILED ( 0 ) // 0 - disable, !0 - enable the collection of more tracing information on the device, including start and stop times for each task (requires MIC_DEVICE_TRACING to be sent)
205  #define MIC_TRACING_EVENT_BASE ( 10000 )
206  #define MIC_EVENT_OFFLOAD_REMOTE ( MIC_TRACING_EVENT_BASE + 0 )
207  #define MIC_EVENT_OFFLOAD_LOCAL ( MIC_TRACING_EVENT_BASE + 1 )
208  #define MIC_EVENT_OFFLOAD_PRAGMA ( MIC_TRACING_EVENT_BASE + 2 )
209  #define MIC_EVENT_OFFLOAD_POLLSET ( MIC_TRACING_EVENT_BASE + 3 )
210  #define MIC_EVENT_FUNC_DOWORK ( MIC_TRACING_EVENT_BASE + 4 )
211  #define MIC_EVENT_FUNC_FINISHWORK ( MIC_TRACING_EVENT_BASE + 5 )
212  #define MIC_EVENT_ATOMS_SUBMIT ( MIC_TRACING_EVENT_BASE + 6 )
213  #define MIC_EVENT_ATOMS_TRANSFER ( MIC_TRACING_EVENT_BASE + 7 )
214  #define MIC_EVENT_ATOMS_WAIT ( MIC_TRACING_EVENT_BASE + 8 )
215  #define MIC_EVENT_SYNC_INPUT_PRAGMA ( MIC_TRACING_EVENT_BASE + 9 )
216  #define MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA ( MIC_TRACING_EVENT_BASE + 10 )
217  #define MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA ( MIC_TRACING_EVENT_BASE + 11 )
218  #define MIC_EVENT_DEVICE_COMPUTES ( MIC_TRACING_EVENT_BASE + 12 )
219  #define MIC_EVENT_DEVICE_VIRIALS ( MIC_TRACING_EVENT_BASE + 13 )
220  #define MIC_EVENT_DEVICE_PATCHES ( MIC_TRACING_EVENT_BASE + 14 )
221  #define MIC_EVENT_DEVICE_PATCH ( MIC_TRACING_EVENT_BASE + 15 )
222  #define MIC_EVENT_DEVICE_COMPUTE ( MIC_TRACING_EVENT_BASE + 16 )
223  #define MIC_TRACING_NUM_EVENTS ( 24 )
224 
225 // DMK - DEBUG
226 #define MIC_KERNEL_DATA_TRANSFER_STATS ( 0 )
227 
228 __declspec(target(mic)) void* _mm_malloc_withPrint(size_t s, int a, char * l);
229 __declspec(target(mic)) void _mm_free_withPrint(void * prt);
230 
231 // DMK - NOTE : Leaving these macros in as hooks for now
232 //#ifdef __MIC__
233 // #define _MM_MALLOC_WRAPPER(s, a, l) _mm_malloc_withPrint((s), (a), (l))
234 // #define _MM_FREE_WRAPPER(p) _mm_free_withPrint(p)
235 // #define _MM_MALLOC_VERIFY
236 //#else
237  #define _MM_MALLOC_WRAPPER(s, a, l) _mm_malloc((s), (a))
238  #define _MM_FREE_WRAPPER(p) _mm_free(p)
239  #define _MM_MALLOC_VERIFY
240 //#endif
241 
242 
244 // Configuration Checks/Overrides
245 
246 // If not compiling for the MIC device, disable all MIC specific code
247 #include <stdint.h>
248 #if defined(__MIC__) || defined(__MIC2__)
249  #include <immintrin.h>
250 #else
251  #undef MIC_HANDCODE_FORCE
252  #define MIC_HANDCODE_FORCE (0)
253  #undef MIC_HANDCODE_PLGEN
254  #define MIC_HANDCODE_PLGEN (0)
255 #endif
256 
257 // If the MIC_FULL_CHECK flag is set, enable checks for both __ASSUME and
258 // __ASSUME_ALIGNED to ensure the cases for these macros are actually true.
259 // Also ensure that MIC_EXCL_CHECKSUM_FULL is enabled.
260 #if MIC_FULL_CHECK != 0
261  #define __FULL_CHECK(X) X
262  #define __ASSERT(c) assert(c)
263  #define CHECK_ASSUME (1)
264  #define CHECK_ASSUME_ALIGNED (1)
265  #if (MIC_EXCL_CHECKSUM_FULL == 0)
266  #warning "MIC_EXCL_CHECKSUM_FULL enabled (because MIC_FULL_CHECK was enabled)"
267  #endif
268  #undef MIC_EXCL_CHECKSUM_FULL
269  #define MIC_EXCL_CHECKSUM_FULL (1)
270 #else
271  #define __FULL_CHECK(X)
272  #define __ASSERT(c)
273  #define CHECK_ASSUME (0)
274  #define CHECK_ASSUME_ALIGNED (0)
275 #endif
276 
277 // Create a macro that can be used within the body of offload pragmas that asserts the
278 // offload section was executed on a MIC device
279 #if defined(__MIC__)
280  #define __MUST_BE_MIC
281 #else
282  #define __MUST_BE_MIC __ASSERT(0)
283 #endif
284 
285 // For now, if padding the pairlist with zeros, disable pairlist tiling
286 #if (MIC_PAD_PLGEN != 0) && (MIC_HANDCODE_FORCE != 0)
287  #if REFINE_PAIRLISTS != 0
288  #warning "REFINE_PAIRLISTS disabled (because MIC_PAD_PLGEN && MIC_HANDCODE_FORCE were enabled; combination not supported yet)"
289  #endif
290  #undef REFINE_PAIRLISTS
291  #define REFINE_PAIRLISTS (0)
292 #endif
293 
294 // If using the handcoded force loop with mixed precision, disable the use of pairlist refinement (not supported yet)
295 #if MIC_HANDCODE_FORCE_SINGLE != 0
296  #if REFINE_PAIRLISTS != 0
297  #warning "REFINE_PAIRLISTS disabled (because MIC_HANDCODE_FORCE_SINGLE was enabled)"
298  #endif
299  #undef REFINE_PAIRLISTS
300  #define REFINE_PAIRLISTS (0)
301 #endif
302 
303 // If AOS has been enabled, ensure that loading VDW upfront has been disabled (not supported, doesn't
304 // make much sense as x,y,z,q are in a different array than vdwType)
305 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
306  #if MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT != 0
307  #warning "MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT disabled (because MIC_HANDCODE_FORCE_SOA_VS_AOS was enabled; combination not supported)"
308  #endif
309  #undef MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT
310  #define MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT (0)
311 #endif
312 
313 // Check for a negative value specified as the prefetch distance (macro's value is used, so require positive)
314 #if MIC_PREFETCH_DISTANCE < 0
315  #warning "INVALID MIC_PREFETCH_DISTANCE value (disabling)"
316  #undef MIC_PREFETCH_DISTANCE
317  #define MIC_PREFETCH_DISTANCE (0)
318 #endif
319 
320 // If tracing has been enabled, ensure that the other required tracing macros are properly set
321 #if MIC_DEVICE_TRACING_DETAILED != 0
322  #if MIC_DEVICE_TRACING == 0
323  #warning "MIC_DEVICE_TRACING enabled (because MIC_DEVICE_TRACING_DETAILED was enabled)"
324  #endif
325  #undef MIC_DEVICE_TRACING
326  #define MIC_DEVICE_TRACING (1)
327 #endif
328 #if MIC_DEVICE_TRACING != 0
329  #if MIC_TRACING == 0
330  #warning "MIC_TRACING enabled (because MIC_DEVICE_TRACING was enabled)"
331  #endif
332  #undef MIC_TRACING
333  #define MIC_TRACING (1)
334 #endif
335 
336 
338 // Tracing Macros (Projections)
339 
340 #if MIC_TRACING != 0
341 
342  #define MIC_TRACING_REGISTER_EVENT(str, id) traceRegisterUserEvent(str, id)
343 
344  #define MIC_TRACING_REGISTER_EVENTS \
345  MIC_TRACING_REGISTER_EVENT("MIC device offload (remote)", MIC_EVENT_OFFLOAD_REMOTE); \
346  MIC_TRACING_REGISTER_EVENT("MIC device offload (local)", MIC_EVENT_OFFLOAD_LOCAL); \
347  MIC_TRACING_REGISTER_EVENT("MIC offload pragma", MIC_EVENT_OFFLOAD_PRAGMA); \
348  MIC_TRACING_REGISTER_EVENT("MIC polling set", MIC_EVENT_OFFLOAD_POLLSET); \
349  MIC_TRACING_REGISTER_EVENT("MIC ::doWork", MIC_EVENT_FUNC_DOWORK); \
350  MIC_TRACING_REGISTER_EVENT("MIC ::finishWork", MIC_EVENT_FUNC_FINISHWORK); \
351  MIC_TRACING_REGISTER_EVENT("MIC atom submit", MIC_EVENT_ATOMS_SUBMIT); \
352  MIC_TRACING_REGISTER_EVENT("MIC atom transfer", MIC_EVENT_ATOMS_TRANSFER); \
353  MIC_TRACING_REGISTER_EVENT("MIC atom wait", MIC_EVENT_ATOMS_WAIT); \
354  MIC_TRACING_REGISTER_EVENT("MIC sync input pragma", MIC_EVENT_SYNC_INPUT_PRAGMA); \
355  MIC_TRACING_REGISTER_EVENT("MIC sync output remote pragma", MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA); \
356  MIC_TRACING_REGISTER_EVENT("MIC sync output local pragma", MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA); \
357  MIC_TRACING_REGISTER_EVENT("[MIC] computes", MIC_EVENT_DEVICE_COMPUTES); \
358  MIC_TRACING_REGISTER_EVENT("[MIC] virials", MIC_EVENT_DEVICE_VIRIALS); \
359  MIC_TRACING_REGISTER_EVENT("[MIC] patches", MIC_EVENT_DEVICE_PATCHES); \
360  MIC_TRACING_REGISTER_EVENT("[MIC] patch", MIC_EVENT_DEVICE_PATCH); \
361  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d0)", MIC_EVENT_DEVICE_COMPUTE + 0); \
362  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d1)", MIC_EVENT_DEVICE_COMPUTE + 1); \
363  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d2)", MIC_EVENT_DEVICE_COMPUTE + 2); \
364  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d3)", MIC_EVENT_DEVICE_COMPUTE + 3); \
365  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d4)", MIC_EVENT_DEVICE_COMPUTE + 4); \
366  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d5)", MIC_EVENT_DEVICE_COMPUTE + 5); \
367  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d6)", MIC_EVENT_DEVICE_COMPUTE + 6); \
368  MIC_TRACING_REGISTER_EVENT("[MIC] compute (d7+)", MIC_EVENT_DEVICE_COMPUTE + 7); \
369  \
370  MIC_TRACING_REGISTER_EVENT("Pair", 11000); \
371  MIC_TRACING_REGISTER_EVENT("Self", 11001); \
372  MIC_TRACING_REGISTER_EVENT("PME", 11002); \
373  MIC_TRACING_REGISTER_EVENT("Tuple", 11003); \
374  MIC_TRACING_REGISTER_EVENT("Box Closing", 11010); \
375  MIC_TRACING_REGISTER_EVENT("Positions Ready", 11020); \
376  MIC_TRACING_REGISTER_EVENT("Post Run Computes", 11021); \
377  MIC_TRACING_REGISTER_EVENT("Pre Run Computes", 11022); \
378  MIC_TRACING_REGISTER_EVENT("Reduction Submit", 11023); \
379  MIC_TRACING_REGISTER_EVENT("MIC Atom Work", 11030); \
380  MIC_TRACING_REGISTER_EVENT("Patch::positionsReady", 11031); \
381  MIC_TRACING_REGISTER_EVENT("Atom send", 11032); \
382  MIC_TRACING_REGISTER_EVENT("DEBUG 0", 11040); \
383  MIC_TRACING_REGISTER_EVENT("DEBUG 1", 11041); \
384  MIC_TRACING_REGISTER_EVENT("DEBUG 2", 11042); \
385  MIC_TRACING_REGISTER_EVENT("DEBUG 3", 11043); \
386  MIC_TRACING_REGISTER_EVENT("DEBUG 4", 11044); \
387  MIC_TRACING_REGISTER_EVENT("recvImmediateProxyData", 11046); \
388  MIC_TRACING_REGISTER_EVENT("msg copy send", 11047); \
389  MIC_TRACING_REGISTER_EVENT("ProxyMgr::sendProxyData", 11048); \
390  MIC_TRACING_REGISTER_EVENT("MIC box close", 11050); \
391  MIC_TRACING_REGISTER_EVENT("MIC reduction submit", 11051); \
392  MIC_TRACING_REGISTER_EVENT("WorkDistrib compute", 11070); \
393  MIC_TRACING_REGISTER_EVENT("doWork::work", 11080); \
394  MIC_TRACING_REGISTER_EVENT("doWork::progress", 11081);
395 
396  #define MIC_TRACING_RECORD(id, start, stop) traceUserBracketEvent(id, start, stop)
397 
398  extern void traceUserBracketEvent(int, double, double);
399  extern double CmiWallTimer();
400 
401 #else
402 
403  #define MIC_TRACING_REGISTER_EVENT(str, id)
404  #define MIC_TRACING_REGISTER_EVENTS
405  #define MIC_TRACING_RECORD(id, start, stop)
406 
407 #endif
408 
409 
411 // Debug functions
412 
413 
414 extern void debugInit(FILE* fout);
415 extern void debugClose();
416 #if MIC_DEBUG != 0
417  extern __thread FILE* mic_output;
418  extern double CmiWallTimer();
419  extern int CmiMyPe();
420  #define MICP(fmt, ...) fprintf(((mic_output == NULL) ? (stdout) : (mic_output)), "[%013.6lf, %05d] " fmt, CmiWallTimer(), CmiMyPe(), ##__VA_ARGS__)
421  #define MICPF fflush((mic_output != NULL) ? (mic_output) : (stdout))
422 #else
423  #define MICP(fmt, ...)
424  #define MICPF
425 #endif
426 // NOTE: The bodies are defined in ComputeNonbondMIC.C and these should only
427 // be used in host code.
428 
429 #if MIC_DEVICE_FPRINTF != 0
430  #define DEVICE_FPRINTF(fmt, ...) if (device__fout != NULL) { fprintf(device__fout, fmt, ##__VA_ARGS__); fflush(device__fout); }
431  #define DEVICE_FPRINTF_CLAUSE nocopy(device__fout)
432 #else
433  #define DEVICE_FPRINTF(fmt, ...)
434  #define DEVICE_FPRINTF_CLAUSE
435 #endif
436 
437 
439 // Shared Types
440 
441 #pragma offload_attribute (push, target(mic))
442 
443 //this type defined in multiple files
444 typedef float GBReal;
445 
446 struct float3 { float x,y,z ; };
447 struct float4 { float x,y,z,w; };
448 struct double3 { double x,y,z ; };
449 struct double4 { double x,y,z,w; };
450 struct int2 { int x,y ; };
451 struct int3 { int x,y,z ; };
452 struct int4 { int x,y,z,w; };
453 
454 typedef float mic_position_t;
455 typedef float3 mic_position3_t;
456 typedef float4 mic_position4_t;
457 
458 // Data structure that represents a compute object/task on the MIC device. An
459 // array of these data structures will be passed down to the device, where each
460 // is processed in parallel. Note that for self computes, patch1 and patch2
461 // data should match.
462 struct patch_pair { // must be multiple of 16!
463 
464  mic_position4_t offset; // Offset of patches (center to center)
465 
466  unsigned int patch1_size; // Number of atoms for patch 1
467  unsigned int patch1_force_size; // Number of non-fixed atoms at start of list
468  unsigned int patch1_atom_start; // Start of patch 1 data within atoms array
469  unsigned int patch1_force_start; // Start of compute-specific force buffer in split-up force array (i.e. sub-buffer per compute-patch)
470  unsigned int patch1_force_list_index; // Index for patch 1's force information in force lists array
471  unsigned int patch1_force_list_size; // Number of contributers to patch 1's force buffer
472 
473  unsigned int patch2_size; // Number of atoms for patch 2
474  unsigned int patch2_force_size; // Number of non-fixed atoms at start of list
475  unsigned int patch2_atom_start; // Start of patch 2 data within atoms array
476  unsigned int patch2_force_start; // Start of compute-specific force buffer in split-up force array (i.e. sub-buffer per compute-patch)
477  unsigned int patch2_force_list_index; // Index for patch 2's force information in force lists array
478  unsigned int patch2_force_list_size; // Number of contributers to patch 1's force buffer
479 
480  double plcutoff; // Pairlist cutoff distance
481 
482  unsigned int block_flags_start;
483  unsigned int virial_start; // virial output location padded to 16
484 
485  int numParts;
486  int part;
487 
488  #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
489  uint64_t patch1_atomDataPtr;
490  uint64_t patch2_atomDataPtr;
491  #endif
492 
493  // DMK - DEBUG
494  mic_position_t patch1_center_x;
495  mic_position_t patch1_center_y;
496  mic_position_t patch1_center_z;
497  mic_position_t patch2_center_x;
498  mic_position_t patch2_center_y;
499  mic_position_t patch2_center_z;
500 
501  // DMK - DEBUG
502  int p1;
503  int p2;
504  int cid;
505 
506  int __padding0__[1];
507 };
508 
509 // Data structure used to associate individual force outputs (per compute object)
510 // to the force outputs for patches (per patch object).
511 struct force_list { // must be multiple of 16!
512  unsigned int force_list_start; // beginning of compute output
513  unsigned int force_list_size; // number of computes for this patch
514  unsigned int patch_size; // real number of atoms in patch
515  unsigned int patch_stride; // padded number of atoms in patch
516  unsigned int force_output_start; // output array
517  unsigned int atom_start; // atom positions
518  unsigned int virial_list_start; // beginning of compute virial output
519  unsigned int virial_output_start; // virial output location padded to 16
520 };
521 
522 // Data structure for the portion of the atom information that is updated on the
523 // device each timestep.
524 // NOTE: This data structure must match the CudaAtom data structure.
525 // TODO: Modify the code in the patchs to use this data structure rather than CudaAtom.
526 struct atom {
527  //mic_position3_t position; // x, y, z
528  mic_position_t x;
529  mic_position_t y;
530  mic_position_t z;
531  mic_position_t charge; // q
532 };
533 
534 // Data structure for the portion of the atom information that is updated
535 // periodically on the device (e.g. after atom migration).
536 struct atom_param {
537  int vdw_type; // NOTE: The handcoded version of the MIC code relies on this member ("vdw_type") being the first of four ints
538  int index;
539  int excl_index;
540  int excl_maxdiff; // NOTE: if maxdiff == 0, then only excluded from self
541 };
542 
543 // Data structure containing several constants that are used by the force
544 // computation functions.
545 struct mic_constants {
546 
547  double cutoff2;
548  double dielectric_1;
549  double scaling;
550  double scale14;
551  double modf_mod;
552  double r2_delta;
553 
554  int r2_delta_exp;
555  int r2_delta_expc;
556 
557  int commOnly;
558  int singleKernelFlag;
559 };
560 
561 // Data structure for each invocation of the offload pragma in mic_nonbonded_force,
562 // used to transfer per timestep information related to the kernels each timestep.
563 struct mic_kernel_data {
564 
565  // Inputs
566  int isRemote;
567  int numLocalAtoms;
568  int numLocalComputes;
569  int numLocalPatches;
570  int doSlow;
571  int doEnergy;
572  int usePairlists;
573  int savePairlists;
574 
575  mic_position3_t lata;
576  mic_position3_t latb;
577  mic_position3_t latc;
578  int numAtoms;
579  int numPatchPairs;
580  int numForceLists;
581  size_t forceBuffersReqSize;
582 
583  // Outputs
584  double virial_xx;
585  double virial_xy;
586  double virial_xz;
587  double virial_yy;
588  double virial_yz;
589  double virial_zz;
590  double fullElectVirial_xx;
591  double fullElectVirial_xy;
592  double fullElectVirial_xz;
593  double fullElectVirial_yy;
594  double fullElectVirial_yz;
595  double fullElectVirial_zz;
596  double vdwEnergy;
597  double electEnergy;
598  double fullElectEnergy;
599 
600  int exclusionSum;
601 
602  int __padding_0__;
603 };
604 
605 // Data structure that is setup each time a compute object is about to be processed
606 // on the device and passed into the force computation function (e.g. calc_pair)
607 // that is being called for the given compute on the given timestep. NOTE: When
608 // the computes are being processed on the device, there is a one-to-one relationship
609 // between patch_pairs and mic_params (one per iteration of the parallel for
610 // processing the computes).
611 struct mic_params {
612  // DMK - TODO | FIXME : Reorder fields to pack data into less memory (if needed)
613 
614  patch_pair *pp; // Pointer to the associated patch_pair structure
615  force_list *fl[2]; // Pointers to force list structures (one per input patch)
616  mic_constants *constants; // Pointer to the mic_constants structure used to process the compute
617  int **pairlists_ptr; // Pointer to short array of pointers, one entry per type of pairlist
618  int usePairlists; // Flag if using pairlist (should be the case for MIC)
619  int savePairlists; // Flag if need to save (generate) pairlists this timestep
620 
621  // If refining pairlists is enabled, these are pointers to other pointers/scalars so
622  // that they can be maintained across calls/timesteps
623  #if REFINE_PAIRLISTS != 0
624  int** plArray_ptr; // Pointer to the scratch buffer pointer for pairlist entries (so can be maintained across calls)
625  int* plSize_ptr; // Pointer to the length of the plArray scratch buffer
626  double** r2Array_ptr; // Pointer to the scratch buffer pointer for r2 (radius squared) values
627  #endif
628 
629  void *table_four_base_ptr; // Base pointer for nonbonded force table
630  void *lj_table_base_ptr; // Base pointer for LJ table data
631  int table_four_n_16; // Sub-array length for nonbonded force table
632  int lj_table_dim; // Dimention of the LJ table
633  unsigned int *exclusion_bits; // Pointer to the exclusion table
634 
635  mic_position4_t offset; // Offset between the patches for pair computes
636  int numAtoms[2]; // Atom counts for each patch
637  int numAtoms_16[2]; // Atom counts for each patch, rounded up to nearest multiple of 16
638 
639  // Atom information arrays (input), formatted as SOA, with one pointer per input patch (1 for self, 2 for pair)
640  #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
641 
642  atom* p[2];
643  atom_param* pExt[2];
644  double4* ff[2];
645  double4* fullf[2];
646 
647  #else
648 
649  mic_position_t *p_x[2]; // Atom's position.x information
650  mic_position_t *p_y[2]; // Atom's position.y information
651  mic_position_t *p_z[2]; // Atom's position.z information
652  mic_position_t *p_q[2]; // Atom's charge information
653  int *pExt_vdwType[2]; // Atom's Van Der Waal information
654  int *pExt_index[2]; // Atom's global index (i.e. vs the index in this array)
655  int *pExt_exclIndex[2]; // Atom's exclusion table offset
656  int *pExt_exclMaxDiff[2]; // Atom's exclusion table entry size (# entries = 2 * maxDiff + 1)
657 
658  // Atom force arrays (output), formatted as SOA, with one pointer per input patch (1 for self, 2 for pair).
659  // The "ff" arrays are forces for each timestep, while "fullf" arrays are slow_forces calculated during
660  // PME timesteps. Otherwise, just x, y, z, and w (exclusion accumulators) fields.
661  double *ff_x[2];
662  double *ff_y[2];
663  double *ff_z[2];
664  double *ff_w[2];
665  double *fullf_x[2];
666  double *fullf_y[2];
667  double *fullf_z[2];
668  double *fullf_w[2];
669 
670  #endif
671 
672  // Virial and energy output for a given compute function ("virial" = every timestep,
673  // "fullElectVirial" = PME timesteps, "Energy" = energy timesteps) that is added
674  // to a global reduction.
675  double virial_xx;
676  double virial_xy;
677  double virial_xz;
678  double virial_yy;
679  double virial_yz;
680  double virial_zz;
681  double fullElectVirial_xx;
682  double fullElectVirial_xy;
683  double fullElectVirial_xz;
684  double fullElectVirial_yy;
685  double fullElectVirial_yz;
686  double fullElectVirial_zz;
687  double vdwEnergy;
688  double electEnergy;
689  double fullElectEnergy;
690 
691  int doSlow; // Flag that indicates if slow_forces should be calculated this timestep or not
692 
693  // DMK - DEBUG - Record the center of each patch
694  mic_position_t patch1_center_x;
695  mic_position_t patch1_center_y;
696  mic_position_t patch1_center_z;
697  mic_position_t patch2_center_x;
698  mic_position_t patch2_center_y;
699  mic_position_t patch2_center_z;
700 
701  // DMK - DEBUG
702  int ppI;
703  int p1;
704  int p2;
705  int pe;
706  int timestep;
707 
708  int exclusionSum;
709 
710  int __padding0__[1];
711 };
712 
713 #pragma offload_attribute (pop)
714 
715 
717 // Functions
718 
719 // NOTE: Also see comments for these functions in ComputeNonbondedMICKernel.C.
720 
721 // A function to initialize variables, etc. related to the device's operation.
722 void mic_init_device(const int pe, const int node, const int deviceNum);
723 
724 // A functions to free variables, etc. related to the device's operation.
725 void mic_free_device(const int deviceNum);
726 
727 // A function to test if the function's execution is actually offloaded to
728 // a device (i.e. test if offloading is working correctly).
729 // Returns 1 if offloading is working for device 'dev', and 0 if not.
730 int mic_check(int deviceNum);
731 
732 // A function that transfers the nonbonded force table to the device
733 void mic_bind_table_four(const int deviceNum,
734  const double * table_four,
735  const int table_four_n,
736  const int table_four_n_16
737  );
738 
739 // A function that transfers the Lennard-Jones table to the device
740 void mic_bind_lj_table(const int deviceNum,
741  const char * lj_table,
742  const int lj_table_dim,
743  const int lj_table_size
744  );
745 
746 // A function that transfers the exclusion table to the device
747 void mic_bind_exclusions(const int deviceNum,
748  unsigned int *exclusion_bits,
749  const long int exclusion_bits_size
750  );
751 
752 // A function that transfers various constants to the device
753 void mic_bind_constants(const int deviceNum,
754  const double cutoff2,
755  const double dielectric_1,
756  const double scaling,
757  const double scale14,
758  const double r2_delta,
759  const int r2_delta_exp,
760  const int commOnly
761  );
762 
763 
764 
765 void mic_bind_patch_pairs_only(const int deviceNum,
766  const patch_pair *patch_pairs,
767  const int patch_pairs_size,
768  const int patch_pairs_bufSize
769  );
770 
771 void mic_bind_force_lists_only(const int deviceNum,
772  force_list *force_lists,
773  const int force_lists_size,
774  const int force_lists_bufSize
775  );
776 
777 void mic_bind_atoms_only(const int deviceNum,
778  atom *atoms,
779  atom_param *atom_params,
780  double4 *forces,
781  double4 *slow_forces,
782  const int atoms_size,
783  const int atoms_bufSize
784  );
785 
786 void mic_bind_force_buffers_only(const int deviceNum, const size_t force_buffers_size);
787 
788 
789 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
790 void mic_submit_patch_data(const int deviceNum,
791  void * const hostPtr,
792  void* &hostPtr_prev,
793  const int transferBytes,
794  const int allocBytes_host,
795  int &allocBytes_device,
796  uint64_t &devicePtr,
797  void* &signal
798  );
799 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL
800 
801 
802 // A function that initiates the force computation that is done on the device
803 // each timestep, resulting in forces being transferred from the device
804 // to the host.
805 void mic_nonbonded_forces(const int deviceNum,
806  const int isRemote,
807  const int numLocalAtoms,
808  const int numLocalComputes,
809  const int numLocalPatches,
810  const mic_position3_t lata,
811  const mic_position3_t latb,
812  const mic_position3_t latc,
813  const int doSlow,
814  const int doEnergy,
815  const int usePairlists,
816  const int savePairlists,
817  const int atomsChanged
818  );
819 
820 void mic_transfer_output(const int deviceNum,
821  const int isRemote,
822  const int numLocalAtoms,
823  const int doSlow
824  );
825 
826 // Functions that are used to poll the for the completion of the asynchronous
827 // calculation initiated by mic_nonboned_forces.
828 int mic_check_remote_kernel_complete(const int deviceNum);
829 int mic_check_local_kernel_complete(const int deviceNum);
830 
831 // DMK - DEBUG
832 void _mic_print_stats(const int deviceNum);
833 void mic_dumpHostDeviceComputeMap();
834 
835 
836 // DMK - DEBUG
837 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
838 
839 __declspec(target(mic))
840 typedef struct _mem_info {
841  size_t memTotal;
842  size_t memFree;
843  size_t cached;
844  size_t active;
845  size_t inactive;
846  size_t vmSize;
847  size_t vmPeak;
848 } MemInfo;
849 
850 __declspec(target(mic))
851 void printMemInfo(int device__pe, int device__timestep, MemInfo * mi);
852 
853 __declspec(target(mic))
854 void readMemInfo_processLine(MemInfo * memInfo, char * n, char * v, char * u);
855 
856 __declspec(target(mic))
857 bool readMemInfo(MemInfo * memInfo);
858 
859 #endif // MIC_TRACK_DEVICE_MEM_USAGE != 0
860 
861 
862 #endif // NAMD_MIC
863 #endif // COMPUTE_NONBONDED_MIC_KERNEL_H
register BigReal virial_xy
register BigReal virial_xz
register BigReal virial_yz
static __global__ void(const patch_pair *patch_pairs, const atom *atoms, const atom_param *atom_params, const int *vdw_types, unsigned int *plist, float4 *tmpforces, float4 *slow_tmpforces, float4 *forces, float4 *slow_forces, float *tmpvirials, float *slow_tmpvirials, float *virials, float *slow_virials, unsigned int *global_counters, int *force_ready_queue, const unsigned int *overflow_exclusions, const int npatches, const int block_begin, const int total_block_count, int *block_order, exclmask *exclmasks, const int lj_table_size, const float3 lata, const float3 latb, const float3 latc, const float cutoff2, const float plcutoff2, const int doSlow)
static __thread int patch_pairs_size
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread int lj_table_size
register BigReal electEnergy
static __thread atom * atoms
static __thread float4 * forces
static __thread float4 * slow_forces
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
register BigReal virial_yy
static __thread patch_pair * patch_pairs
texture< float2, 1, cudaReadModeElementType > lj_table
gridSize z
register BigReal virial_zz
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cutoff2
register BigReal virial_xx
static __thread atom_param * atom_params
k< npairi;++k){TABENERGY(const int numtypes=simParams->tableNumTypes;const float table_spacing=simParams->tableSpacing;const int npertype=(int)(namdnearbyint(simParams->tableMaxDist/simParams->tableSpacing)+1);) int table_i=(r2iilist[2 *k] >> 14)+r2_delta_expc;const int j=pairlisti[k];#define p_j BigReal diffa=r2list[k]-r2_table[table_i];#define table_four_i TABENERGY(register const int tabtype=-1-(lj_pars->A< 0?lj_pars->A:0);) BigReal kqq=kq_i *p_j-> charge
gridSize y
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
gridSize x
float GBReal
Definition: ComputeGBIS.inl:17
static __thread int atoms_size