ComputeNonbondedMICKernel.C

Go to the documentation of this file.
00001 // NOTE: See ComputeNonbondedMICKernel.h for a general description of NAMD on MIC.
00002 
00003 // Only compile the contents of this file if this build supports MIC.
00004 #ifdef NAMD_MIC
00005 
00006 
00007 #include <stdlib.h>
00008 #include <stdio.h>
00009 #include <offload.h>
00010 #include "ComputeNonbondedMICKernel.h"
00011 #include <assert.h>
00012 #include <math.h>
00013 
00014 // DMK - DEBUG
00015 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
00016   #include <sys/types.h>
00017   #include <unistd.h>
00018 #endif
00019 
00020 // Setup __ASSUME_ALIGNED macro to take the appropriate action based on macro flags
00021 #if (CHECK_ASSUME_ALIGNED != 0)
00022   #define __ASSUME_ALIGNED(v) assert(((unsigned long long int)(v)) % 64 == 0); __assume_aligned((v), MIC_ALIGN);
00023 #else
00024   #define __ASSUME_ALIGNED(v) __assume_aligned((v), MIC_ALIGN);
00025 #endif
00026 
00027 // Setup __ASSUME macro to take the appropriate action based on macro flags
00028 #if (CHECK_ASSUME != 0)
00029   #define __ASSUME(s) assert(s); __assume(s);
00030 #else
00031   #define __ASSUME(s) __assume(s);
00032 #endif
00033 
00034 // Setup RESTRICT macro
00035 #define RESTRICT __restrict
00036 
00037 #ifdef WIN32
00038   #define __thread __declspec(thread)
00039 #endif
00040 
00041 
00043 // Global data used during simulation (tables, constants, etc. that are setup
00044 //   during startup but essentially read-only and constant throughout the
00045 //   steady-state simulation.
00046 
00047 __attribute__((target(mic))) double * device__table_four = NULL;
00048 __attribute__((target(mic))) float * device__table_four_float = NULL;
00049 __attribute__((target(mic))) int device__table_four_n_16 = 0;
00050 
00051 __attribute__((target(mic))) double * device__lj_table = NULL;
00052 __attribute__((target(mic))) float * device__lj_table_float = NULL;
00053 __attribute__((target(mic))) int device__lj_table_dim = 0;
00054 __attribute__((target(mic))) int device__lj_table_size = 0;
00055 
00056 __attribute__((target(mic))) unsigned int * device__exclusion_bits = NULL;
00057 __attribute__((target(mic))) long int device__exclusion_bits_size = 0;
00058 
00059 __attribute__((target(mic))) mic_constants * device__constants = NULL;
00060 
00061 __attribute__((target(mic))) double * device__table_four_copy = NULL;
00062 __attribute__((target(mic))) float * device__table_four_float_copy = NULL;
00063 __attribute__((target(mic))) double * device__lj_table_copy = NULL;
00064 __attribute__((target(mic))) float * device__lj_table_float_copy = NULL;
00065 __attribute__((target(mic))) unsigned int * device__exclusion_bits_copy = NULL;
00066 __attribute__((target(mic))) mic_constants * device__constants_copy = NULL;
00067 
00068 __attribute__((target(mic))) patch_pair* device__patch_pairs_copy = NULL;
00069 __attribute__((target(mic))) force_list* device__force_lists_copy = NULL;
00070 __attribute__((target(mic))) atom* device__atoms_copy = NULL;
00071 __attribute__((target(mic))) atom_param* device__atom_params_copy = NULL;
00072 __attribute__((target(mic))) int device__patch_pairs_copy_size = 0;
00073 __attribute__((target(mic))) int device__force_lists_copy_size = 0;
00074 __attribute__((target(mic))) int device__atoms_copy_size = 0;
00075 __attribute__((target(mic))) int device__atom_params_copy_size = 0;
00076 
00077 
00079 // Device variables which exist both on the host and the MIC device and/or are
00080 //   used to manage moving data betwen the host and the MIC device.
00081 
00082 // DMK - DEBUG - PE and node info for printing debug output
00083 __thread int host__pe = -1;
00084 __thread int host__node = -1;
00085 __attribute__((target(mic))) int device__pe = -1;
00086 __attribute__((target(mic))) int device__node = -1;
00087 
00088 __thread int singleKernelFlag = 0;
00089 
00090 __thread patch_pair * host__patch_pairs = NULL;
00091 __thread int host__patch_pairs_size = 0;
00092 __thread int host__patch_pairs_bufSize = 0;
00093 __attribute__((target(mic))) const patch_pair * device__patch_pairs = NULL;
00094 __attribute__((target(mic))) int device__patch_pairs_size = 0;
00095 
00096 __thread force_list * host__force_lists = NULL;
00097 __thread int host__force_lists_size = 0;
00098 __thread int host__force_lists_bufSize = 0;
00099 __attribute__((target(mic))) force_list * device__force_lists = NULL;
00100 __attribute__((target(mic))) int device__force_lists_size = 0;
00101 
00102 __attribute__((target(mic))) uintptr_t device__pairlists = 0;  // NOTE: Don't use a global in case the device is shared between multiple threads, so each thread's patch pair lists has its own pairlist pointer
00103 __attribute__((target(mic))) int device__pairlists_alloc_size = 0;
00104 
00105 __attribute__((target(mic))) int** device__pl_array = NULL;
00106 __attribute__((target(mic))) int* device__pl_size = NULL;
00107 __attribute__((target(mic))) double** device__r2_array = NULL;
00108 
00109 __thread atom * host__atoms = NULL;
00110 __thread atom_param * host__atom_params = NULL;
00111 __thread double4 * host__forces = NULL;
00112 __thread double4 * host__slow_forces = NULL;
00113 __thread int host__atoms_size = 0;
00114 __thread int host__atoms_bufSize = 0;
00115 __attribute__((target(mic))) atom * device__atoms = NULL;
00116 __attribute__((target(mic))) atom_param * device__atom_params = NULL;
00117 __attribute__((target(mic))) double4 * device__forces = NULL;
00118 __attribute__((target(mic))) double4 * device__slow_forces = NULL;
00119 __attribute__((target(mic))) int device__atoms_size = 0;
00120 
00121 __thread size_t host__force_buffers_req_size = 0;
00122 __attribute__((target(mic))) size_t device__force_buffers_req_size = 0;
00123 __attribute__((target(mic))) size_t device__force_buffers_alloc_size = 0;
00124 __attribute__((target(mic))) double4 * device__force_buffers = NULL;
00125 __attribute__((target(mic))) double4 * device__slow_force_buffers = NULL;
00126 
00127 __attribute__((target(mic))) mic_position3_t device__lata;
00128 __attribute__((target(mic))) mic_position3_t device__latb;
00129 __attribute__((target(mic))) mic_position3_t device__latc;
00130 
00131 __thread mic_kernel_data * host__kernel_data = NULL;
00132 
00133 __attribute__((target(mic))) int device__numOMPThreads = -1;
00134 
00135 __thread int tag_atom_params;
00136 __thread int tag_remote_kernel;
00137 __thread int tag_local_kernel;
00138 
00139 __thread int patch_pairs_copySize;
00140 __thread int force_lists_copySize;
00141 __thread int atom_params_copySize;
00142 
00143 #if MIC_DEVICE_FPRINTF != 0
00144 __attribute__((target(mic))) FILE * device__fout = NULL;
00145 #endif
00146 
00147 // DMK - DEBUG - Based on the number of kernels invoked, track the timestep number.
00148 //__thread int host__timestep = 0;
00149 __attribute__((target(mic))) int device__timestep = 0;
00150 
00151 
00152 // DMK - TRACING / TIMING
00153 #include <sys/time.h>
00154 __declspec(target(mic))
00155 double getCurrentTime() {
00156   timeval now;
00157   gettimeofday(&now, NULL);
00158   return (double)(now.tv_sec + (now.tv_usec * 1.0e-6));
00159 }
00160 
00161 // DMK - TRACING
00162 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
00163   #if MIC_DEVICE_TRACING_DETAILED != 0
00164     __thread double * host__device_times_computes = NULL;
00165     __thread double * host__device_times_patches = NULL;
00166     __attribute__((target(mic))) double * device__device_times_computes = NULL;
00167     __attribute__((target(mic))) double * device__device_times_patches = NULL;
00168   #endif
00169   __thread double * host__device_times_start = NULL;
00170   __attribute__((target(mic))) double * device__device_times_start = NULL;
00171 #endif
00172 
00173 
00174 // DMK - TODO : A device version of the die function, which does not have access
00175 //   to the host info.  Only to be used within the kernel itself.  Perhaps add
00176 //   some startup code to send host/PE# and device# to device and then target
00177 //   mic_die to the device as well, removing the need for this separate function.
00178 __attribute__((target(mic)))
00179 void mic_dev_die(const char * const str) {
00180   #ifdef __MIC__
00181     const char * const loc = "on device";
00182   #else
00183     const char * const loc = "on host";
00184   #endif
00185   if (str != NULL) {
00186     printf("[MIC_DIE] :: \"%s\" (%s)\n", str, loc);
00187   } else {
00188     printf("[MIC_DIE] :: mic_dev_die called (%s)\n", loc);
00189   }
00190   fflush(NULL);
00191   abort();
00192 }
00193 
00194 
00195 __attribute__((target(mic)))
00196 void mic_print_config() {
00197 
00198   #if MIC_PRINT_CONFIG != 0
00199 
00200     // DMK - TODO | FIXME : Create a mechanism, so that it is only printed once if
00201     //   there are multiple MIC devices being used.
00202   
00203     printf("device :: MULTIPLE_THREADS  (%d)\n", MULTIPLE_THREADS);
00204     printf("device :: # OpenMP Threads : %d\n", device__numOMPThreads);
00205 
00206     printf("device :: MIC_HANDCODE_FORCE                     (%d)\n", MIC_HANDCODE_FORCE);
00207     printf("device ::   MIC_HANDCODE_FORCE_PFDIST            (%d)\n", MIC_HANDCODE_FORCE_PFDIST);
00208     printf("device ::   MIC_HANDCODE_FORCE_USEGATHER_NBTBL   (%d)\n", MIC_HANDCODE_FORCE_USEGATHER_NBTBL);
00209     printf("device ::   MIC_HANDCODE_FORCE_USEGATHER         (%d)\n", MIC_HANDCODE_FORCE_USEGATHER);
00210     printf("device ::   MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT  (%d)\n", MIC_HANDCODE_FORCE_LOAD_VDW_UPFRONT);
00211     printf("device ::   MIC_HANDCODE_FORCE_COMBINE_FORCES    (%d)\n", MIC_HANDCODE_FORCE_COMBINE_FORCES);
00212 
00213     printf("device :: MIC_HANDCODE_FORCE_SINGLE            (%d)\n", MIC_HANDCODE_FORCE_SINGLE);
00214     printf("device :: MIC_HANDCODE_FORCE_CALCR2TABLE       (%d)\n", MIC_HANDCODE_FORCE_CALCR2TABLE);
00215     printf("device :: MIC_HANDCODE_FORCE_SOA_VS_AOS        (%d)\n", MIC_HANDCODE_FORCE_SOA_VS_AOS);
00216 
00217     printf("device :: MIC_SORT_ATOMS  (%d)\n", MIC_SORT_ATOMS);
00218     printf("device :: MIC_SORT_LISTS  (%d)\n", MIC_SORT_LISTS);
00219 
00220     printf("device :: MIC_HANDCODE_PLGEN    (%d)\n", MIC_HANDCODE_PLGEN);
00221     printf("device :: MIC_TILE_PLGEN        (%d)\n", MIC_TILE_PLGEN);
00222     printf("device :: MIC_CONDITION_NORMAL  (%d)\n", MIC_CONDITION_NORMAL);
00223     printf("device :: MIC_PAD_PLGEN         (%d)\n", MIC_PAD_PLGEN);
00224 
00225     printf("device :: MIC_SORT_COMPUTES  (%d)\n", MIC_SORT_COMPUTES);
00226 
00227     printf("device :: MIC_SYNC_INPUT  (%d)\n", MIC_SYNC_INPUT);
00228 
00229     printf("device :: REFINE_PAIRLISTS            (%d)\n", REFINE_PAIRLISTS);
00230     printf("device ::   REFINE_PAIRLIST_HANDCODE  (%d)\n", REFINE_PAIRLIST_HANDCODE);
00231     printf("device ::   REFINE_PAIRLISTS_XYZ      (%d)\n", REFINE_PAIRLISTS_XYZ);
00232 
00233     printf("device :: MIC_FULL_CHECK          (%d)\n", MIC_FULL_CHECK);
00234     printf("device :: MIC_EXCL_CHECKSUM_FULL  (%d)\n", MIC_EXCL_CHECKSUM_FULL);
00235 
00236     printf("device :: MIC_ALIGN  (%d)\n", MIC_ALIGN);
00237 
00238     fflush(NULL);
00239 
00240   #endif // MIC_PRINT_CONFIG
00241 }
00242 
00243 
00244 // Function to initialize a given MIC device by initializing various device
00245 //   variables, arrays, etc.
00246 // Input:
00247 //  - pe : the PE number for the host core associated with the device
00248 //  - deviceNum : the device number to be initialized
00249 void mic_init_device(const int pe, const int node, const int deviceNum) {
00250 
00251   // Record the PE and node info associated with the given device
00252   host__pe = pe;
00253   host__node = node;
00254 
00255   // Initialize kernel data structures
00256   host__kernel_data = (mic_kernel_data*)(_MM_MALLOC_WRAPPER(2 * sizeof(mic_kernel_data), 64, "mic_kernel_data"));
00257   __ASSERT(host__kernel_data != NULL);
00258   mic_kernel_data * kernel_data = host__kernel_data;
00259 
00260   // Initialize flags for offload buffers that are not copied every timestep
00261   patch_pairs_copySize = 0;
00262   force_lists_copySize = 0;
00263   atom_params_copySize = 0;
00264 
00265   // Allocate data for the host__device_times_start buffer (constant size throughout simulation)
00266   #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
00267     host__device_times_start = (double*)_MM_MALLOC_WRAPPER(10 * sizeof(double), 64, "host__device_times_start");
00268     __ASSERT(host__device_times_start != NULL);
00269     double * device_times_start = host__device_times_start;
00270     #define DEVICE_TIMES_CLAUSE  nocopy(device_times_start[0:10] : alloc_if(1) free_if(0) align(64)) \
00271                                  nocopy(device__device_times_start)
00272   #else
00273     #define DEVICE_TIMES_CLAUSE
00274   #endif
00275 
00276   // Initialize the device itself via an offload pragma section
00277   #pragma offload target(mic:deviceNum) \
00278     in(pe) in(node) nocopy(device__pe) nocopy(device__node) \
00279     in(kernel_data[0:2] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
00280     nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
00281     nocopy(device__numOMPThreads) \
00282     DEVICE_TIMES_CLAUSE \
00283     DEVICE_FPRINTF_CLAUSE
00284   {
00285     __MUST_BE_MIC;
00286     __FULL_CHECK(__ASSERT(node >= 0 && pe >= 0));
00287 
00288     device__pe = pe;
00289     device__node = node;
00290 
00291     #if MIC_DEVICE_FPRINTF != 0
00292     {
00293       char filename[128] = { 0 };
00294       sprintf(filename, "/tmp/namd_deviceDebugInfo.%d", device__pe);
00295       if (device__node <= 0) {
00296         printf("[MIC-DEBUG] :: Generating debug output to device file for MICs.\n");
00297         fflush(NULL);
00298       }
00299       device__fout = fopen(filename, "w");
00300     }
00301     #endif
00302 
00303     DEVICE_FPRINTF("Device on PE %d (node: %d) initializing...\n", device__pe, device__node);
00304 
00305     // Get the number of threads available to the code
00306     #if MULTIPLE_THREADS != 0
00307       device__numOMPThreads = omp_get_max_threads();
00308     #else
00309       device__numOMPThreads = 1;
00310     #endif
00311 
00312     // Initialize the r2 arrays (scratch buffers used by computes to refine pairlists
00313     //   each timestep, when pairlist refinement is enabled)
00314     #if REFINE_PAIRLISTS != 0
00315       device__pl_array = (int**)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(int*), 64, "device__pl_array"));       __ASSERT(device__pl_array != NULL);
00316       device__pl_size = (int*)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(int), 64, "device__pl_size"));           __ASSERT(device__pl_size != NULL);
00317       device__r2_array = (double**)(_MM_MALLOC_WRAPPER(device__numOMPThreads * sizeof(double*), 64, "device__r2_array");  __ASSERT(device__r2_array != NULL);
00318       for (int i = 0; i < device__numOMPThreads; i++) {
00319         device__pl_array[i] = NULL;
00320         device__pl_size[i] = 0;
00321         device__r2_array[i] = NULL;
00322       }
00323     #endif
00324 
00325     // Copy the pointer to the device_times_start buffer
00326     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
00327       device__device_times_start = device_times_start;
00328     #endif
00329     
00330     // DMK : NOTE | TODO | FIXME - This will print the configuration of the MICs, but the condition
00331     //   assumes that there will be at least one MIC on the first node (node 0).  Further, it assumes
00332     //   that all MICs will be the same (i.e. only prints on one device).  If there are cases where
00333     //   these assumptions do not hold, we need to expand this code to cover those cases.  For now,
00334     //   leaving it this way because it reduces the amount of output during the run (espeically when
00335     //   scaling to multiple nodes and/or multiple MICs per node) and is likely the case (for now).
00336     if (node == 0 && deviceNum == 0) { mic_print_config(); }
00337 
00338   } // end pragma offload
00339 
00340   #undef DEVICE_TIMES_CLAUSE
00341 }
00342 
00343 
00344 // Function that returns 0 if executed on the host, 1 if executed on the MIC device.  Used to
00345 //   indicate if a target is available or not during application startup.
00346 // Input: N/A
00347 // Output:
00348 //  - 0 if executed on host, 1 if executed on device
00349 __attribute__((target(mic))) int mic_check_internal() {
00350     int retval;
00351     #ifdef __MIC__
00352         retval = 1;
00353     #else
00354         retval = 0;
00355     #endif
00356     return retval;
00357 }
00358 
00359 
00360 // Function to check that the given device is available for offloading.
00361 // Input:
00362 //  - dev : The device number (0+) to check
00363 // Output:
00364 //  - 1 if device is available (will call mic_dev_die if not available to prevent further application progress)
00365 int mic_check(int deviceNum) {
00366   int dev_ok = 0;
00367   #pragma offload target(mic:deviceNum) inout(dev_ok)
00368   {
00369     #if !defined(__MIC__)
00370       mic_dev_die("mic_check :: Attempt to execute offload on host (not supported yet)... exiting."); fflush(NULL);
00371     #endif
00372     dev_ok = mic_check_internal();
00373   }
00374   return dev_ok;
00375 }
00376 
00377 
00378 // DMK - NOTE : The following is debug code used for verifying data structures.  Should not be used in
00379 //   production runs.
00380 #if MIC_DATA_STRUCT_VERIFY != 0
00381 
00382 
00383 __declspec(target(mic))
00384 void _verify_parallel_memcpy(void * const dst,
00385                              const void * const src,
00386                              const size_t size
00387                             ) {
00388   const char* const src_c = (const char* const)src;
00389   char* const dst_c = (char* const)dst;
00390   #pragma omp parallel for schedule(static, 4096)
00391   for (int i = 0; i < size; i++) { dst_c[i] = src_c[i]; }
00392 }
00393 
00394 template<class T>
00395 __declspec(target(mic))
00396 void* _verify_remalloc(int &copySize, const int size, T* &ptr) {
00397   if (copySize < size) {
00398     copySize = (int)(1.2f * size);  // Add some extra buffer room
00399     copySize = (copySize + 1023) & (~1023);  // Round up to multiple of 1024
00400     _MM_FREE_WRAPPER(ptr);
00401     ptr = (T*)_MM_MALLOC_WRAPPER(copySize * sizeof(T), 64, "_verify_remalloc");
00402     __FULL_CHECK(assert(ptr != NULL));
00403   }
00404   return ptr;
00405 }
00406 
00407 
00408 __declspec(target(mic))
00409 int _verify_pairlist(const int pe, const int timestep, const int isRemote, const int phase,
00410                      const int i_upper, const int j_upper,
00411                      const int * const pairlist_base,
00412                      const int ppI, const char * const plStr,
00413                      const patch_pair &pp, const int plTypeIndex
00414                     ) {
00415 
00416   const atom_param * const pExt_0 = device__atom_params + pp.patch1_atom_start;
00417   const atom_param * const pExt_1 = device__atom_params + pp.patch2_atom_start;
00418 
00419   if (pairlist_base == NULL) {
00420     if (timestep > 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_base == NULL !!!\n", pe, timestep, isRemote, phase); return 1; }
00421   } else if (i_upper <= 0) {
00422     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i_upper <= 0 !!!\n", pe, timestep, isRemote, phase); return 1;
00423   } else if (j_upper <= 0) {
00424     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: j_upper <= 0 !!!\n", pe, timestep, isRemote, phase); return 1;
00425   } else {
00426 
00427     int currentI = 0;
00428     int inPadding = 0;
00429     int pairlist_size = pairlist_base[1] - 2;
00430     int pairlist_allocSize = pairlist_base[0];
00431     const int * const pairlist = pairlist_base + 2;
00432 
00433     if (pairlist_size + 2 > pairlist_allocSize) {
00434       printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: pairlist_size + 2 > pairlist_allocSize !!!\n", pe, timestep, isRemote, phase); return 1;
00435     }
00436 
00437     for (int k = 0; k < pairlist_size; k++) {
00438 
00439       int somethingWrong = 0;
00440       int i = (pairlist[k] >> 16) & 0xFFFF;
00441       int j =  pairlist[k]        & 0xFFFF;
00442 
00443       // Verify the interaction is in the correct list
00444       if (j != 0xFFFF) { // If this is a valid entry, not padding
00445         const int maxDiff = pExt_1[j].excl_maxdiff;
00446         const int indexDiff = pExt_0[i].index - pExt_1[j].index;
00447         if (indexDiff < -1 * maxDiff || indexDiff > maxDiff) { // Not in patten, so must be NORM
00448           if (plTypeIndex != PL_NORM_INDEX) {
00449             printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: interaction outside maxDiff range in non-NORM pairlist !!!"
00450                    " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
00451                    pe, timestep, isRemote, phase,
00452                    ppI, i, j, indexDiff, maxDiff, plTypeIndex
00453                   );
00454             somethingWrong = 1;
00455           }
00456         } else { // In pattern, so verify against bits
00457           const int offset = (2 * indexDiff) + pExt_1[j].excl_index;
00458           const int offset_major = offset / (sizeof(unsigned int) * 8);
00459           const int offset_minor = offset % (sizeof(unsigned int) * 8);
00460           const int exclFlags = ((device__exclusion_bits[offset_major]) >> offset_minor) & 0x03;
00461           if (exclFlags == 0x3) { 
00462             printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid exclusion bit pattern detected !!!"
00463                    " - ppI:%d, i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d\n",
00464                    pe, timestep, isRemote, phase,
00465                    ppI, i, j, indexDiff, maxDiff, plTypeIndex
00466                   );
00467             somethingWrong = 1;
00468           } else if (exclFlags != plTypeIndex) { 
00469             printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: plTypeIndex/exclution_bits mismatch !!!"
00470                    " - i:%d, j:%d, indexDiff:%d, maxDiff:%d, plTypeIndex:%d, exclFlags:%d\n",
00471                    pe, timestep, isRemote, phase,
00472                    i, j, indexDiff, maxDiff, plTypeIndex, exclFlags
00473                   );
00474             somethingWrong = 1;
00475           }
00476         }
00477       }
00478 
00479       // Check for a new "i" value
00480       if (i != currentI) {
00481         if (i < currentI) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i < currentI !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00482         if (k % 16 != 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: i changed when k % 16 != 0 !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00483         currentI = i;
00484         inPadding = 0;  // New i means we are transitioning into a non-padding region
00485       }
00486 
00487       // If we are in a padding region...
00488       if (inPadding) {
00489         if (i < 0 || i >= i_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected in padded region!!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00490         if (j != 0xFFFF) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected in padded region !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00491 
00492       // Otherwise, we are in a non-padding region...
00493       } else {
00494         if (i < 0 || i >= i_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid i value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00495         if (j == 0xFFFF) {  // Detect transition into a padded region
00496           inPadding = 1;
00497         } else {
00498           if (j < 0 || j >= j_upper) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: invalid j value detected !!!\n", pe, timestep, isRemote, phase); somethingWrong = 1; }
00499         }
00500       }
00501 
00502       if (somethingWrong) {
00503         char buf[2048];
00504         int lo = k - 3; if (lo < 0) { lo = 0; }
00505         int hi = k + 3; if (hi >= pairlist_size) { hi = pairlist_size; }
00506         char *str = buf;
00507         str += sprintf(str, "[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d ::   pl.%s[%d:%d->%d,%d:%d] = { ... ", pe, timestep, isRemote, phase, plStr, ppI, lo, hi, k, pairlist_size);
00508         for (int _k = lo; _k < hi; _k++) { str += sprintf(str, "%s0x%08x%s ", ((_k==k)?("<<"):("")), pairlist[_k], ((_k==k)?(">>"):(""))); }
00509         str += sprintf(str, "}, i_upper:%d, j_upper:%d\n", i_upper, j_upper);
00510         printf(buf);
00511       }
00512     }
00513   }
00514 
00515   return 0;
00516 }
00517 
00518 
00519 __declspec(target(mic))
00520 int _verify_forces(const int pe, const int timestep, const int isRemote, const int phase,
00521                    const int index, const double4 * const forceBuffers, const double4 * const forces,
00522                    const char * const typeStr
00523                   ) {
00524 
00525   force_list &fl = device__force_lists[index];
00526 
00527   // Verify the indexes / sizes
00528   if (fl.force_list_start < 0 || fl.force_list_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_start, device__force_buffers_req_size); return 1; }
00529   if (fl.force_output_start < 0 || fl.force_output_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_output_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, index, fl.force_output_start, device__atoms_size); return 1; }
00530   if (fl.patch_stride < 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].patch_stride(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.patch_stride); return 1; }
00531   if (fl.force_list_size < 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists[%d].force_list_size(%d) < 0 !!!\n", pe, timestep, isRemote, phase, index, fl.force_list_size); return 1; }
00532   if (fl.force_list_start + (4 * fl.patch_stride * fl.force_list_size) > device__force_buffers_req_size) {
00533     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: fl[%d].force_list_start(%d) + (4 * fl.patch_stride(%d) * fl.force_list_size(%d)) > device__force_buffers_req_size(%lu) !!!\n",
00534            pe, timestep, isRemote, phase, index, fl.force_list_start, fl.patch_stride, fl.force_list_size, device__force_buffers_req_size);
00535     return 1;
00536   }
00537 
00538   // Check sub-lists for large forces
00539   double* f = (double*)(forceBuffers + fl.force_list_start);
00540   for (int i = 0; i < 4 * fl.patch_stride * fl.force_list_size; i++) {
00541     if (fabsf(f[i]) > 2.5e2) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (1) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); } // NOTE: Want to print them all, so don't return here
00542   }
00543 
00544   // Check final list for large forces
00545   f = (double*)(forces + fl.force_output_start);
00546   for (int i = 0; i < 4 * fl.patch_stride; i++) {
00547     if (fabsf(f[i]) > 2.5e2) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: Large %sforce detected (2) !!! f[%d]:%le\n", pe, timestep, isRemote, phase, typeStr, i, f[i]); } // NOTE: Want to print them all, so don't return here
00548   }
00549 
00550   return 0;
00551 }
00552 
00553 __declspec(target(mic))
00554 int _verify_buffers_match(const int pe, const int timestep, const int isRemote, const int phase,
00555                           const char * const buf, const char * const golden, const unsigned int bufSize,
00556                           const char * const str
00557                          ) {
00558   if (buf == NULL) {
00559     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
00560   } else if (golden == NULL) {
00561     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: golden == NULL (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
00562   } else if (bufSize < 0) {
00563     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: bufSize <= 0 (%s) !!!\n", pe, timestep, isRemote, phase, str); return 1;
00564   } else {
00565     int mismatchCount = 0;
00566     #pragma omp parallel for reduction(+:mismatchCount)
00567     for (int i = 0; i < bufSize; i++) {
00568       #if 1
00569         if (buf[i] != golden[i]) {
00570           printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
00571                  pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
00572                 );
00573           mismatchCount++;
00574         }
00575       #else
00576         if (start < 0) {
00577           if (buf[i] != golden[i]) {
00578             printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
00579                    pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
00580                   );
00581             start = i;
00582             mismatchFound = 1;
00583           }
00584         } else {
00585           if (buf[i] == golden[i] || i == bufSize - 1) {
00586             if (i < bufSize - 1 && buf[i+1] != golden[i+1]) { // NOTE: Ignore single byte matches found within consecutive mismatches
00587               printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
00588                      pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
00589                     );
00590             } else {
00591               printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf mismatch range %d -> %d (%s) !!!\n",
00592                      pe, timestep, isRemote, phase, start, ((buf[i] != golden[i] && i == bufSize - 1) ? (i+1) : (i)), str
00593                     );
00594               start = -1;
00595             }
00596           } else if (buf[i] != golden[i] && i - start < 16) {
00597             printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: buf[%d/%d]:0x%02x != golden[%d]:0x%02x (%s) !!!\n",
00598                    pe, timestep, isRemote, phase, i, bufSize, buf[i], i, golden[i], str
00599                   );
00600           }
00601         }
00602       #endif
00603     }
00604     if (mismatchCount > 0) { return 1; }
00605   }
00606   return 0;
00607 }
00608 
00609 __declspec(target(mic))
00610 void _verify_tables(const int pe, const int timestep, const int isRemote, const int phase) {
00611 
00612   // Verify that certain tables/buffers haven't been modified
00613   if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__exclusion_bits), (char*)(device__exclusion_bits_copy), device__exclusion_bits_size * sizeof(unsigned int), "exclusion_bits")) { return; }
00614   if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__constants), (char*)(device__constants_copy), sizeof(mic_constants), "constants")) { return; }
00615   #if MIC_HANDCODE_FORCE_SINGLE != 0
00616     if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__table_four_float), (char*)(device__table_four_float_copy), 61 * device__table_four_n_16 * sizeof(float), "table_four(float)")) { return; }
00617     if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__lj_table_float), (char*)(device__lj_table_float_copy), device__lj_table_size / sizeof(double) * sizeof(float), "lj_table(float)")) { return; }
00618   #else
00619     if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__table_four), (char*)(device__table_four_copy), 61 * device__table_four_n_16 * sizeof(double), "table_four(double)")) { return; }
00620     if (_verify_buffers_match(pe, timestep, isRemote, phase, (char*)(device__lj_table), (char*)(device__lj_table_copy), device__lj_table_size * sizeof(double), "lj_table(double)")) { return; }
00621   #endif
00622 
00623   // Check to see if the constants struct has been placed inside the exclusion_bits buffer
00624   // NOTE: This is a specific check related to a specific error that was occuring (leaving in for now)
00625   char *p1 = (char*)(device__constants);
00626   char *p2 = (char*)(device__exclusion_bits);
00627   int pDiff = p1 - p2;
00628   if (p1 >= p2 && p1 < p2 + (device__exclusion_bits_size * sizeof(unsigned int))) {
00629     printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: !!!! OVERLAP OF EXCLUSIONS AND CONSTANTS DETECTED !!!! - constants:%p(%ld), exclusions:%p(%ld)\n",
00630            pe, timestep, isRemote, phase, device__constants, sizeof(device__constants), device__exclusion_bits, device__exclusion_bits_size * sizeof(unsigned int)
00631           );
00632   }
00633 
00634   fflush(NULL);
00635 }
00636 
00637 
00638 __declspec(target(mic))
00639 void _verify_data_structures(const int pe, const int timestep, const int isRemote, const int phase,
00640                              const int check_lo, const int check_hi, const int doSlow
00641                             ) {
00642 
00643   // Verify that the basic arrays are allocated
00644   if (device__atoms == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atoms == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00645   if (device__atom_params == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__atom_params == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00646   if (device__forces == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__forces == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00647   if (device__slow_forces == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__slow_forces == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00648   if (device__patch_pairs == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00649   if (device__force_lists == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00650   if (device__pairlists == NULL) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__pairlists == NULL !!!\n", pe, timestep, isRemote, phase); return; }
00651 
00652   // Verify sizes (as much as possible)
00653   if (device__patch_pairs_size <= 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs_size <= 0 !!!\n", pe, timestep, isRemote, phase); return; }
00654   if (device__force_lists_size <= 0) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__force_lists_size <= 0 !!!\n", pe, timestep, isRemote, phase); return; }
00655 
00656   // Verify the tables
00657   _verify_tables(pe, timestep, isRemote, phase);
00658 
00659   // For each patch pair
00660   #if MULTIPLE_THREADS != 0
00661     #pragma omp parallel for
00662   #endif
00663   for (int k = check_lo; k < check_hi; k++) {
00664 
00665     const patch_pair &pp = device__patch_pairs[k];
00666 
00667     // Check indexing
00668     if (pp.patch1_atom_start < 0 || pp.patch1_atom_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_atom_start, device__atoms_size); }
00669     if (pp.patch2_atom_start < 0 || pp.patch2_atom_start >= device__atoms_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_atom_start invalid (%d, size:%d) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_atom_start, device__atoms_size); }
00670     if (pp.patch1_force_start < 0 || pp.patch1_force_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch1_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch1_force_start, device__force_buffers_req_size); }
00671     if (pp.patch2_force_start < 0 || pp.patch2_force_start >= device__force_buffers_req_size) { printf("[VERIFICATION-ERROR] :: PE:%d.%d.%d.%d :: device__patch_pairs[%d].patch2_force_start invalid (%d, size:%lu) !!!\n", pe, timestep, isRemote, phase, k, pp.patch2_force_start, device__force_buffers_req_size); }
00672 
00673     // Verify the pairlists
00674     int i_upper = pp.patch1_size;
00675     int j_upper = pp.patch2_size;
00676     const int ** const pairlists = (const int ** const)device__pairlists;
00677     _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_NORM_INDEX], k, "NORM", pp, PL_NORM_INDEX);
00678     _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k + PL_EXCL_INDEX], k, "EXCL", pp, PL_EXCL_INDEX);
00679     _verify_pairlist(pe, timestep, isRemote, phase, i_upper, j_upper, pairlists[NUM_PAIRLIST_TYPES * k +  PL_MOD_INDEX], k,  "MOD", pp,  PL_MOD_INDEX);
00680   }
00681 
00682   // Verify forces if this is the local kernel (end of timestep)
00683   if (isRemote == 0) {
00684     #if MULTIPLE_THREADS != 0
00685       #pragma omp parallel for
00686     #endif
00687     for (int k = 0; k < device__force_lists_size; k++) {
00688       _verify_forces(pe, timestep, isRemote, phase, k, device__force_buffers, device__forces, "");
00689       if (doSlow) { _verify_forces(pe, timestep, isRemote, phase, k, device__slow_force_buffers, device__slow_forces, "slow "); }
00690     }
00691   }
00692 
00693   fflush(NULL);
00694 }
00695 
00696 #endif // MIC_DATA_STRUCT_VERIFY
00697 
00698 
00699 
00700 // Function called during startup to push the table_four data (lookup table used
00701 //   during force computation) that has been calculated on the host down to the
00702 //   given device.
00703 // Input:
00704 //   - deviceNum : the device to push the data to (number within node)
00705 //   - table_four : pointer to the table data itself
00706 //   - table_four_n : dimension of the table_four table (used for indexing and
00707 //       size calculations)
00708 // Output: N/A
00709 void mic_bind_table_four(const int deviceNum,
00710                          const double *table_four,
00711                          const int table_four_n,
00712                          const int table_four_n_16
00713                         ) {
00714 
00715   // Verify parameters
00716   __FULL_CHECK(__ASSERT(deviceNum >= 0));
00717   __FULL_CHECK(__ASSERT(table_four != NULL));
00718   __FULL_CHECK(__ASSERT(table_four_n > 0));
00719   __FULL_CHECK(__ASSERT(table_four_n_16 > 0 && table_four_n_16 >= table_four_n));
00720   __FULL_CHECK(__ASSERT(table_four_n_16 % 16 == 0));
00721 
00722   // Copy the table pointer and dimension information into the device variables.
00723   //   Note that there are actually several sub-tables within the table_four
00724   //   table, each a multiple of table_four_n elements in length.
00725   int numTableFourElems = 61 * table_four_n_16;  // WARNING !!! Must match ComputeNonbondedUtil.C (const 61) !!!
00726 
00727   // Transfer the table_four data and dimension data to the given device
00728   #pragma offload target(mic:deviceNum) \
00729     in(table_four_n_16) nocopy(device__table_four_n_16) \
00730     in(numTableFourElems) \
00731     in(table_four[0:numTableFourElems] : alloc_if(1) free_if(1)) \
00732     nocopy(device__table_four) nocopy(device__table_four_float) \
00733     nocopy(device__table_four_copy) nocopy(device__table_four_float_copy)
00734   {
00735     __MUST_BE_MIC;
00736 
00737     // Copy and verify the table_four_n_16 value
00738     device__table_four_n_16 = table_four_n_16;
00739     __FULL_CHECK(__ASSERT(device__table_four_n_16 > 0));
00740 
00741     // Allocate memory on the device for the table_four table.  Depending on the precision being used
00742     //   for the table, either create a 64-bit or 32-bit copy of the data on the device.
00743     __FULL_CHECK(__ASSERT(device__table_four == NULL && device__table_four_float == NULL));  // Check for double allocation
00744     #if MIC_HANDCODE_FORCE_SINGLE != 0
00745 
00746       // Allocate and copy data into a 32-bit version of the table
00747       device__table_four_float = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numTableFourElems, 64, "device__table_four_float"));
00748       __ASSERT(device__table_four_float != NULL);
00749       for (int i = 0; i < numTableFourElems; i++) {
00750         device__table_four_float[i] = (float)(table_four[i]);
00751       }
00752 
00753       // Create a second copy for verification purposes
00754       #if MIC_DATA_STRUCT_VERIFY != 0
00755         device__table_four_float_copy = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numTableFourElems, 64, "device__table_four_float_copy"));
00756         __ASSERT(device__table_four_float_copy != NULL);
00757         memcpy(device__table_four_float_copy, device__table_four_float, numTableFourElems * sizeof(float));
00758       #endif
00759 
00760     #else // MIC_HANDCODE_FORCE_SINGLE != 0
00761 
00762       // Allocate and copy data into a 64-bit version of the table
00763       device__table_four = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numTableFourElems, 64, "device__table_four"));
00764       __ASSERT(device__table_four != NULL);
00765       for (int i = 0; i < numTableFourElems; i++) {
00766         device__table_four[i] = table_four[i];
00767       }
00768       #if MIC_DATA_STRUCT_VERIFY != 0
00769         device__table_four_copy = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numTableFourElems, 64, "device__table_four_copy"));
00770         __ASSERT(device__table_four_copy != NULL);
00771         memcpy(device__table_four_copy, device__table_four, numTableFourElems * sizeof(double));
00772       #endif
00773 
00774     #endif // MIC_HANDCODE_FORCE_SINGLE != 0
00775 
00776   } // end pragma offload
00777 }
00778 
00779 
00780 // Function called during startup to push the lj_table data (lookup table used
00781 //   during force computation) that has been calculated on the host down to the
00782 //   given device.
00783 // Input:
00784 //   - deviceNum : the device to push the data to (number within node)
00785 //   - lj_table : pointer to the table data itself (64-bit version on the host)
00786 //   - lj_table_dim : dimension of the lj_table (used for indexing)
00787 //   - lj_table_size : size of the lj_table
00788 // Output: N/A
00789 void mic_bind_lj_table(const int deviceNum,
00790                        const char * lj_table,
00791                        const int lj_table_dim,
00792                        const int lj_table_size
00793                       ) {
00794 
00795   // Verify Parameters
00796   __FULL_CHECK(__ASSERT(deviceNum >= 0));
00797   __FULL_CHECK(__ASSERT(lj_table != NULL));
00798   __FULL_CHECK(__ASSERT(lj_table_dim > 0));
00799   __FULL_CHECK(__ASSERT(lj_table_size > 0));
00800 
00801   // Transfer the table data, dimension, and size information to the given device.
00802   #pragma offload target(mic:deviceNum) \
00803     in(lj_table_dim) nocopy(device__lj_table_dim) \
00804     in(lj_table_size) nocopy(device__lj_table_size) \
00805     in(lj_table[0:lj_table_size] : alloc_if(1) free_if(1)) \
00806     nocopy(device__lj_table) nocopy(device__lj_table_float) \
00807     nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
00808     nocopy(device__pe)
00809   {
00810     __MUST_BE_MIC;
00811 
00812     // Copy and verify the LJ table size and dimension info
00813     device__lj_table_dim = lj_table_dim;
00814     device__lj_table_size = lj_table_size;
00815     __FULL_CHECK(__ASSERT(device__lj_table_dim > 0 && device__lj_table_size > 0));
00816 
00817     // Allocate memory on the device for the LJ table.  Depending on the precision being used
00818     //   for the table, either create a 64-bit or 32-bit copy of the data on the device.
00819     __FULL_CHECK(__ASSERT(device__lj_table == NULL && device__lj_table_float == NULL));  // Check for double allocation
00820     int numElements = device__lj_table_size * sizeof(char) / sizeof(double);
00821     #if MIC_HANDCODE_FORCE_SINGLE != 0
00822 
00823       // Allocate and copy the data into a 32-bit version of the table
00824       device__lj_table_float = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numElements, 64, "device__lj_table_float"));
00825       __ASSERT(device__lj_table_float != NULL);
00826       for (int i = 0; i < numElements; i++) {
00827         device__lj_table_float[i] = (float)(((double*)lj_table)[i]);
00828       }
00829 
00830       // Create a copy of the table for verification purposes later
00831       #if MIC_DATA_STRUCT_VERIFY != 0
00832         device__lj_table_float_copy = (float*)(_MM_MALLOC_WRAPPER(sizeof(float) * numElements, 64, "device__lj_table_float_copy"));
00833         __ASSERT(device__lj_table_float_copy != NULL);
00834         memcpy(device__lj_table_float_copy, device__lj_table_float, sizeof(float) * numElements);
00835       #endif
00836 
00837     #else // MIC_HANDCODE_FORCE_SINGLE != 0
00838 
00839       // Allocate and copy the data into a 64-bit version of the table
00840       device__lj_table = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numElements, 64, "device__lj_table"));
00841       __ASSERT(device__lj_table_float != NULL);
00842       for (int i = 0; i < numElements; i++) {
00843         device__lj_table[i] = ((double*)lj_table)[i];
00844       }
00845 
00846       // Create a copy of the table for verificiation purposes later
00847       #if MIC_DATA_STRUCT_VERIFY != 0
00848         device__lj_table_copy = (double*)(_MM_MALLOC_WRAPPER(sizeof(double) * numElements, 64, "device__lj_table_copy"));
00849         __ASSERT(device__lj_table_copy != NULL);
00850         memcpy(device__lj_table_copy, device__lj_table, sizeof(double) * numElements);
00851       #endif
00852 
00853     #endif // MIC_HANDCODE_FORCE_SINGLE != 0
00854 
00855   } // end pragma offload
00856 }
00857 
00858 
00859 // Function called during startup to push the exclusion data (lookup table used
00860 //   during pairlist generation) that has been calculated on the host down to the
00861 //   given device.
00862 // Input:
00863 //   - deviceNum : the device to push the data to (number within node)
00864 //   - exclusion_bits : pointer to the exclusion data itself
00865 //   - exclusion_bits_size : size of the exclsion data
00866 // Output: N/A
00867 void mic_bind_exclusions(const int deviceNum,
00868                          unsigned int *exclusion_bits,
00869                          const long int exclusion_bits_size
00870                         ) {
00871 
00872   // Verify Parameters
00873   __FULL_CHECK(__ASSERT(deviceNum >= 0));
00874   __FULL_CHECK(__ASSERT(exclusion_bits != NULL));
00875   __FULL_CHECK(__ASSERT(exclusion_bits_size > 0));
00876 
00877   // Transfer the exclusion data and size information down to the given device.
00878   #pragma offload target(mic:deviceNum) \
00879     in(exclusion_bits_size) nocopy(device__exclusion_bits_size) \
00880     in(exclusion_bits[0:exclusion_bits_size] : alloc_if(1) free_if(1)) \
00881     nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy)
00882   {
00883     __MUST_BE_MIC;
00884 
00885     // Copy and verify the table size info
00886     device__exclusion_bits_size = exclusion_bits_size;
00887     __FULL_CHECK(__ASSERT(device__exclusion_bits_size > 0));
00888 
00889     // Create a copy of the exclusion bits buffer on the device
00890     __ASSERT(device__exclusion_bits == NULL);  // Check for double allocate
00891     device__exclusion_bits = (unsigned int *)_MM_MALLOC_WRAPPER(sizeof(unsigned int) * device__exclusion_bits_size, 64, "device__exclusion_bits");
00892     __ASSERT(device__exclusion_bits != NULL);
00893     memcpy(device__exclusion_bits, exclusion_bits, sizeof(unsigned int) * device__exclusion_bits_size);
00894 
00895     // Create a copy of the table for verification purposes later
00896     #if MIC_DATA_STRUCT_VERIFY != 0
00897       device__exclusion_bits_copy = (unsigned int*)(_MM_MALLOC_WRAPPER(device__exclusion_bits_size * sizeof(unsigned int), 64, " device__exclusion_bits_copy"));
00898       __ASSERT(device__exclusion_bits_copy != NULL);
00899       memcpy(device__exclusion_bits_copy, device__exclusion_bits, sizeof(unsigned int) * device__exclusion_bits_size);
00900     #else
00901       device__exclusion_bits_copy = NULL;
00902     #endif
00903 
00904   } // end pragma offload
00905 }
00906 
00907 
00908 // Function called during startup to push (and calculate) several variables that
00909 //   are constant and used throughout the simulation.
00910 // Input:
00911 //   - deviceNum : the device to push the data to (number within node)
00912 //   ... : remaining parameters are simply simulation constants
00913 // Output: N/A
00914 void mic_bind_constants(const int deviceNum,
00915                         const double cutoff2,
00916                         const double dielectric_1,
00917                         const double scaling,
00918                         const double scale14,
00919                         const double r2_delta,
00920                         const int r2_delta_exp,
00921                         const int commOnly
00922                        ) {
00923   
00924   // Verify Parameters
00925   __FULL_CHECK(__ASSERT(deviceNum >= 0));
00926 
00927   // Create a mic_constants data structure and copy (or calculate) the
00928   //   various constants that are to be pushed to the device in to this
00929   //   data structure.
00930   mic_constants constants;
00931   constants.cutoff2 = cutoff2;
00932   constants.dielectric_1 = dielectric_1;
00933   constants.scaling = scaling;
00934   constants.scale14 = scale14;
00935   constants.modf_mod = 1.0 - scale14;
00936   constants.r2_delta = r2_delta;
00937   constants.r2_delta_exp = r2_delta_exp;
00938   constants.r2_delta_expc = 64 * (r2_delta_exp - 1023);
00939   constants.commOnly = commOnly;
00940   constants.singleKernelFlag = singleKernelFlag;
00941 
00942   // Transfer the constants to the given device
00943   #pragma offload target(mic:deviceNum) \
00944     in(constants) nocopy(device__constants) \
00945     nocopy(device__pe) nocopy(device__exclusion_bits) \
00946     nocopy(device__exclusion_bits_copy) nocopy(device__exclusion_bits_size) \
00947     nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
00948     nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
00949     nocopy(device__constants_copy)
00950   {
00951     __MUST_BE_MIC;
00952 
00953     // Create a copy of the constant info on the device
00954     __ASSERT(device__constants == NULL);  // Check for double allocation
00955     device__constants = (mic_constants*)_MM_MALLOC_WRAPPER(sizeof(mic_constants), 64, "device__constants");
00956     __ASSERT(device__constants != NULL);
00957     memcpy(device__constants, &constants, sizeof(mic_constants));
00958 
00959     // Correct r2_delta_expc for use with single precision if using mixed precision is
00960     //   being used on the device
00961     #if MIC_HANDCODE_FORCE_SINGLE != 0
00962       device__constants->r2_delta_expc = 64 * (device__constants->r2_delta_exp - 127);
00963     #endif
00964 
00965     // Copy the data for verification purposes later
00966     #if MIC_DATA_STRUCT_VERIFY != 0
00967       device__constants_copy = (mic_constants*)_MM_MALLOC_WRAPPER(sizeof(mic_constants), 64, "device__constants_copy");
00968       __ASSERT(device__constants_copy != NULL);
00969       memcpy(device__constants_copy, device__constants, sizeof(mic_constants));
00970     #endif
00971 
00972   } // end pragma offload
00973 }
00974 
00975 
00976 // Function called after patch pairs have been modified on the host to push those
00977 //   modifications to the MIC device.
00978 // Input:
00979 //   - deviceNum : the device to push the data to (number within node)
00980 //   - patch_pairs : array containing individual patch_pair records (data to copy)
00981 //   - patch_pairs_size : the length (in elements) of the patch_pairs array (valid/used elements)
00982 //   - patch_pairs_bufSize : the actual length (in elements) of the patch_pairs array (allocated)
00983 // Output: N/A
00984 void mic_bind_patch_pairs_only(const int deviceNum,
00985                                const patch_pair *patch_pairs,
00986                                const int patch_pairs_size,
00987                                const int patch_pairs_bufSize
00988                               ) {
00989 
00990   // NOTE: This function does not actually transfer the patch pairs, it only (re)allocates
00991   //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.
00992 
00993   // Verify parameters
00994   __FULL_CHECK(__ASSERT(deviceNum >= 0));
00995   __FULL_CHECK(__ASSERT(patch_pairs != NULL));
00996   __FULL_CHECK(__ASSERT(patch_pairs_size > 0));
00997   __FULL_CHECK(__ASSERT(patch_pairs_bufSize > 0));
00998   __FULL_CHECK(__ASSERT(patch_pairs_bufSize >= patch_pairs_size));
00999 
01000   // Check if the buffer currently allocated on the device is too small.  If so,
01001   //   reallocate the buffer to be larger.
01002   if (patch_pairs_bufSize > host__patch_pairs_bufSize) {  // If buffer is too small, reallocate
01003 
01004     // Setup clause required to free device_times_computes buffer on the device
01005     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
01006       double * device_times_computes = host__device_times_computes;
01007       #define TIMES_COMPUTES_FREE_CLAUSE  nocopy(device_times_computes : alloc_if(0) free_if(1))
01008     #else
01009       #define TIMES_COMPUTES_FREE_CLAUSE
01010     #endif
01011 
01012     // Free old buffers
01013     if (host__patch_pairs != NULL) {
01014 
01015       const patch_pair * _patch_pairs = host__patch_pairs;
01016       #pragma offload target(mic:deviceNum) \
01017         nocopy(_patch_pairs : alloc_if(0) free_if(1)) \
01018         TIMES_COMPUTES_FREE_CLAUSE
01019       { __MUST_BE_MIC; }
01020     }
01021 
01022     // (Re)allocate memory for device_times_computes buffer on the host, and setup clause for device allocation
01023     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
01024       if (host__device_times_computes != NULL) { _MM_FREE_WRAPPER(host__device_times_computes); }
01025       host__device_times_computes = (double*)_MM_MALLOC_WRAPPER(2 * patch_pairs_bufSize * sizeof(double), 64, "host__device_times_computes");
01026       __ASSERT(host__device_times_computes != NULL);
01027       device_times_computes = host__device_times_computes;
01028       #define TIMES_COMPUTES_ALLOC_CLAUSE  nocopy(device_times_computes[0:2*patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
01029     #else
01030       #define TIMES_COMPUTES_ALLOC_CLAUSE
01031     #endif
01032 
01033     // Record the new host-side pointer and buffer size
01034     host__patch_pairs = (patch_pair*)patch_pairs;
01035     host__patch_pairs_bufSize = patch_pairs_bufSize;
01036 
01037     // Allocate new memory
01038     #pragma offload target(mic:deviceNum) \
01039       in(patch_pairs_size) nocopy(device__patch_pairs_size) \
01040       in(patch_pairs_bufSize) \
01041       nocopy(patch_pairs[0:patch_pairs_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01042       nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
01043       nocopy(device__numOMPThreads) nocopy(device__pe) \
01044       TIMES_COMPUTES_ALLOC_CLAUSE
01045     {
01046       __MUST_BE_MIC;
01047 
01048       // Copy and verify the number of patch pairs
01049       device__patch_pairs_size = patch_pairs_size;
01050       __ASSERT(device__patch_pairs_size > 0);
01051 
01052       // Make sure there are enough pairlists pointers (one per pairlist type per patch_pair)
01053       const int numPairlists = NUM_PAIRLIST_TYPES * device__patch_pairs_size;
01054       if (numPairlists > device__pairlists_alloc_size) {
01055         int **new_pairlists = (int**)(_MM_MALLOC_WRAPPER(numPairlists * sizeof(int*), 64, "device__pairlists"));
01056         int initStart = 0;
01057         if (device__pairlists != 0) {
01058           int **old_pairlists = (int**)(device__pairlists);
01059           memcpy((void*)new_pairlists, (void*)old_pairlists, sizeof(int*) * device__pairlists_alloc_size);
01060           _MM_FREE_WRAPPER(old_pairlists);
01061           initStart = device__pairlists_alloc_size;
01062         }
01063         for (int i = initStart; i < numPairlists; i++) { new_pairlists[i] = NULL; }
01064         device__pairlists = (uintptr_t)new_pairlists;
01065         device__pairlists_alloc_size = numPairlists;
01066       }
01067 
01068     } // end pragma offload
01069 
01070     #undef TIMES_COMPUTES_FREE_CLAUSE
01071     #undef TIMES_COMPUTES_ALLOC_CLAUSE
01072   }
01073 
01074   // Record the current 'used' length of the array and flag the array for data transfer
01075   host__patch_pairs_size = patch_pairs_size;
01076   patch_pairs_copySize = patch_pairs_size;
01077 }
01078 
01079 
01080 // Function called after force lists have been modified on the host to push those
01081 //   modifications to the MIC device.
01082 // Input:
01083 //   - deviceNum : the device to push the data to (number within node)
01084 //   - force_lists : array containing individual force_list records (data to copy)
01085 //   - force_lists_size : the length (in elements) of the force_lists array (valid/used elements)
01086 //   - force_lists_bufSize : the actual length (in elements) of the force_lists array (allocated)
01087 // Output: N/A
01088 void mic_bind_force_lists_only(const int deviceNum,
01089                                force_list *force_lists,
01090                                const int force_lists_size,
01091                                const int force_lists_bufSize
01092                               ) {
01093 
01094   // NOTE: This function does not actually transfer the force lists, it only (re)allocates
01095   //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.
01096 
01097   // Verify parameters
01098   __FULL_CHECK(__ASSERT(deviceNum >= 0));
01099   __FULL_CHECK(__ASSERT(force_lists != NULL));
01100   __FULL_CHECK(__ASSERT(force_lists_size > 0));
01101   __FULL_CHECK(__ASSERT(force_lists_bufSize > 0));
01102   __FULL_CHECK(__ASSERT(force_lists_bufSize >= force_lists_size));
01103 
01104   // Check if the buffer currently allocated on the device is too small.  If so, reallocate the buffer.
01105   if (force_lists_bufSize > host__force_lists_bufSize) {
01106 
01107     // Setup clause needed to free the device_times_patches buffer on the device
01108     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
01109       double * device_times_patches = host__device_times_patches;
01110       #define TIMES_PATCHES_FREE_CLAUSE  nocopy(device_times_patches : alloc_if(0) free_if(1))
01111     #else
01112       #define TIMES_PATCHES_FREE_CLAUSE
01113     #endif
01114 
01115     // Free the old buffer
01116     if (host__force_lists != NULL) {
01117       const force_list * _force_lists = host__force_lists;
01118       #pragma offload target(mic:deviceNum) \
01119         nocopy(_force_lists : alloc_if(0) free_if(1)) \
01120         TIMES_PATCHES_FREE_CLAUSE
01121       { __MUST_BE_MIC; }
01122     }
01123 
01124     // (Re)allocate the device_times_patches buffer on the host and setup the clause required to allocate on the device
01125     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
01126       if (host__device_times_patches != NULL) { _MM_FREE_WRAPPER(host__device_times_patches); }
01127       host__device_times_patches = (double*)_MM_MALLOC_WRAPPER(2 * force_lists_bufSize * sizeof(double), 64, "host__device_times_patches");
01128       __ASSERT(host__device_times_patches != NULL);
01129       device_times_patches = host__device_times_patches;
01130       #define TIMES_PATCHES_ALLOC_CLAUSE  nocopy(device_times_patches[0:2*force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
01131     #else
01132       #define TIMES_PATCHES_ALLOC_CLAUSE
01133     #endif
01134 
01135     // Record teh new host-side pointer and buffer size
01136     host__force_lists = force_lists;
01137     host__force_lists_bufSize = force_lists_bufSize;
01138 
01139     // Allocate the new memory
01140     #pragma offload target(mic:deviceNum) \
01141       in(force_lists_size) nocopy(device__force_lists_size) \
01142       nocopy(force_lists[0:force_lists_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01143       TIMES_PATCHES_ALLOC_CLAUSE
01144     {
01145       __MUST_BE_MIC;
01146 
01147       // Copy and verify the number of force lists
01148       device__force_lists_size = force_lists_size;
01149       __ASSERT(device__force_lists_size > 0);
01150     }
01151 
01152     #undef TIMES_PATCHES_FREE_CLAUSE
01153     #undef TIMES_PATCHES_ALLOC_CLAUSE
01154 
01155   } // end pragma offload
01156 
01157   // Record the current 'used' length of the array and flag the array for data transfer
01158   host__force_lists_size = force_lists_size;
01159   force_lists_copySize = force_lists_size;
01160 }
01161 
01162 
01163 // Function called after the atom list has been modified on the host to push those
01164 //   modifications to the MIC device.
01165 // Input:
01166 //   - deviceNum : the device to push the data to (number within node)
01167 //   - atoms : array containing individual atom records (data to copy)
01168 //   - atom_params : array containing individual atom_param records (data to copy)
01169 //   - forces : array that will contain output data from the device (data to copy back)
01170 //   - slow_forces : array that will contain output data from the device (data to copy back)
01171 //   - atoms_size : the length (in elements) of the specified arrays (valid/used elements)
01172 //   - atoms_bufSize : the actual length (in elements) of the specified arrays (allocated)
01173 // Output: N/A
01174 void mic_bind_atoms_only(const int deviceNum,
01175                          atom *atoms,
01176                          atom_param *atom_params,
01177                          double4 *forces,
01178                          double4 *slow_forces,
01179                          const int atoms_size,
01180                          const int atoms_bufSize
01181                         ) {
01182 
01183   // NOTE: This function does not actually transfer the atom/force data, it only (re)allocates
01184   //   the device buffer(s) associated with them.  See mic_nonbonded_compute() for transfer.
01185 
01186   // Verify parameters
01187   __FULL_CHECK(__ASSERT(deviceNum >= 0));
01188   __FULL_CHECK(__ASSERT(atoms != NULL));
01189   __FULL_CHECK(__ASSERT(atom_params != NULL));
01190   __FULL_CHECK(__ASSERT(forces != NULL));
01191   __FULL_CHECK(__ASSERT(slow_forces != NULL));
01192   __FULL_CHECK(__ASSERT(atoms_size > 0));
01193   __FULL_CHECK(__ASSERT(atoms_bufSize > 0));
01194   __FULL_CHECK(__ASSERT(atoms_bufSize >= atoms_size));
01195 
01196   // Check if the buffers are large enough (reallocate if not)
01197   if (atoms_bufSize > host__atoms_bufSize) {
01198 
01199     // Free the old buffers
01200     if (host__atoms != NULL) {
01201       const atom * _atoms = host__atoms;
01202       const atom_param * _atom_params = host__atom_params;
01203       const double4 * _forces = host__forces;
01204       const double4 * _slow_forces = host__slow_forces;
01205 
01206       #pragma offload target(mic:deviceNum) \
01207         nocopy(_atoms : alloc_if(0) free_if(1)) \
01208         nocopy(_atom_params : alloc_if(0) free_if(1)) \
01209         nocopy(_forces : alloc_if(0) free_if(1)) \
01210         nocopy(_slow_forces : alloc_if(0) free_if(1))
01211       { __MUST_BE_MIC; }
01212     }
01213 
01214     // Copy the new buffer and size info
01215     host__atoms = atoms;
01216     host__atom_params = atom_params;
01217     host__forces = forces;
01218     host__slow_forces = slow_forces;
01219     host__atoms_size = atoms_size;
01220     host__atoms_bufSize = atoms_bufSize;
01221 
01222     // Allocate the new buffers
01223     #pragma offload target(mic:deviceNum) \
01224       in(atoms_size : into(device__atoms_size)) \
01225       nocopy(atoms[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01226       nocopy(atom_params[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01227       nocopy(forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01228       nocopy(slow_forces[0:atoms_bufSize] : alloc_if(1) free_if(0) align(MIC_ALIGN))
01229     { __MUST_BE_MIC; }
01230   }
01231 
01232   // Signal the copy
01233   host__atoms_size = atoms_size;
01234   atom_params_copySize = atoms_size;
01235 }
01236 
01237 
01238 // Function called to allocate memory on the MIC device related to the individual and private
01239 //   force buffers for each of the compute objects.  Note that this function simply allocates
01240 //   the memory on the device, as there is no host-side equivalent buffer (i.e. device only).
01241 // Input:
01242 //   - deviceNum : the device to push the data to (number within node)
01243 //   - force_buffers_size : the amount of memory to be allocated on the device for the buffer
01244 void mic_bind_force_buffers_only(const int deviceNum,
01245                                  const size_t force_buffers_size
01246                                 ) {
01247 
01248   // Check if the requested size is larger than the already allocated size.  If so, flag the buffer
01249   //   as needing to be reallocated by increasing the request size passed to the device (occurs later).
01250   if (host__force_buffers_req_size < force_buffers_size) {
01251     host__force_buffers_req_size = (int)(force_buffers_size * 1.1f);
01252     host__force_buffers_req_size = (host__force_buffers_req_size + 4095) & (~4095);  // Round up to 4K
01253   }
01254 }
01255 
01256 
01257 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01258 
01259 void mic_submit_patch_data(const int deviceNum,
01260                            void * const hostPtr,
01261                            void* &hostPtr_prev,
01262                            const int transferBytes,
01263                            const int allocBytes_host,
01264                            int &allocBytes_device,
01265                            uint64_t &devicePtr,
01266                            void* &signal
01267                           ) {
01268 
01269   // NOTE: This function is called once per patch per timestep by one of the host threads
01270   // TODO : Since signals are thread specific, cannot use them here since the master does
01271   //   the checking for signal completions.  Figure out a way to make this asynchronous,
01272   //   but just do synchronous transfer for now to see what happens.
01273 
01274   // Check to see if the device's copy of the buffer needs to be reallocated
01275   // NOTE: If we are in a position where the buffer might grow in size, the contents
01276   //   need to be rewritten in full (i.e. atom migration has occured), so don't worry
01277   //   about the previous contents of the buffer
01278   int allocFlag = 0;
01279   if (hostPtr != hostPtr_prev || allocBytes_host > allocBytes_device) {
01280 
01281     // Free the old device buffer if it has been allocated previously
01282     if (devicePtr != 0) {
01283       char* __hostPtr_prev = (char*)hostPtr_prev;
01284       #pragma offload target(mic:deviceNum) \
01285         out(__hostPtr_prev[0:0] : alloc_if(0) free_if(1))
01286       { }
01287     }
01288 
01289     // Flag that allocation is needed
01290     allocFlag = 1;
01291   }
01292 
01293   // Transfer the data (allocating if needed)
01294   char* __hostPtr = (char*)hostPtr;
01295   if (allocFlag != 0) {
01296 
01297     uint64_t __devicePtr = 0;
01298     allocBytes_device = allocBytes_host;
01299     #pragma offload target(mic:deviceNum) \
01300       in(__hostPtr[0:allocBytes_host] : alloc_if(1) free_if(0) align(MIC_ALIGN)) \
01301       out(__devicePtr)
01302     { __devicePtr = (uint64_t)__hostPtr; }
01303 
01304     // Record the pointer for use later
01305     devicePtr = __devicePtr;
01306     signal = NULL; // Synchronous, so no need for the host to wait
01307 
01308   } else {
01309 
01310     #pragma offload_transfer target(mic:deviceNum) \
01311       in(__hostPtr[0:transferBytes] : alloc_if(0) free_if(0) align(MIC_ALIGN)) \
01312       signal(hostPtr)
01313     { }
01314 
01315     signal = hostPtr;  // Asynchronous, so force the host to wait on the signal later
01316   }
01317 
01318   // Take not of the host buffer that was transfered
01319   hostPtr_prev = hostPtr;
01320 }
01321 
01322 #endif  // MIC_SUBMIT_ATOMS_ON_ARRIVAL
01323 
01324 
01325 // Kernel bodies : Include CopmuteNonbondedMICKernelBase.h to declare each
01326 //   version of the force computation functions (calc_pair, calc_self, etc.).
01327 //   Each time the header file is included, different macros are set/unset
01328 //   to control the exact contents of 'that version of the kernel.'
01329 
01330 #define NBPAIR 1
01331 #define NBSELF 2
01332 
01333 #define NBTYPE NBPAIR
01334   #include "ComputeNonbondedMICKernelBase.h"
01335   #define CALCENERGY
01336     #include "ComputeNonbondedMICKernelBase.h"
01337   #undef CALCENERGY
01338   #define FULLELECT
01339     #include "ComputeNonbondedMICKernelBase.h"
01340     #define CALCENERGY
01341       #include "ComputeNonbondedMICKernelBase.h"
01342     #undef CALCENERGY
01343   #undef FULLELECT
01344 #undef NBTYPE
01345 
01346 #define NBTYPE NBSELF
01347   #include "ComputeNonbondedMICKernelBase.h"
01348   #define CALCENERGY
01349     #include "ComputeNonbondedMICKernelBase.h"
01350   #undef CALCENERGY
01351   #define FULLELECT
01352     #include "ComputeNonbondedMICKernelBase.h"
01353     #define CALCENERGY
01354       #include "ComputeNonbondedMICKernelBase.h"
01355     #undef CALCENERGY
01356   #undef FULLELECT
01357 #undef NBTYPE
01358 
01359 
01360 // This function is the main "computation" function, called each timestep to
01361 //   initiate computation on the MIC device.  A signal is setup as part of
01362 //   the offload pragma, which is periodically checked by the
01363 //   mic_check_remote_kernel_complete and mic_check_local_kernel_complete
01364 //   functions (which are called periodically by the Charm++ runtime system),
01365 //   so progress can continue once the device has finished its computation.
01366 // Input:
01367 //   - deviceNum : the device to push the data to (number within node)
01368 //   - isRemote : flag indicating a 'remote' (1) or 'local' (0) kernel invocation (currently, must be 'local')
01369 //   - lata : lattice information ('a' along with latb and latc)
01370 //   - latb : lattice information ('b' along with lata and latc)
01371 //   - latc : lattice information ('c' along with lata and latb)
01372 //   - doSlow : flag indicating if slow forces should be calculated in this kernel invocation
01373 //   - doEnergy : flag indiciating if energy information should be calculated in this kernel invocation
01374 //   - usePairlists : flag indicating if pairlists should be used within this kernel invocation (currently, must be)
01375 //   - savePairlists : flag indicating if pairlists should be saved across kernel invocations (currently, must be)
01376 //   - atomsChanged : flag indicating whether or not the atoms (and related) buffers have changed (length, not content)
01377 // Output: N/A
01378 void mic_nonbonded_forces(const int deviceNum,
01379                           const int isRemote,
01380                           const int numLocalAtoms,
01381                           const int numLocalComputes,
01382                           const int numLocalPatches,
01383                           const mic_position3_t lata,
01384                           const mic_position3_t latb,
01385                           const mic_position3_t latc,
01386                           const int doSlow,
01387                           const int doEnergy,
01388                           const int usePairlists,
01389                           const int savePairlists,
01390                           const int atomsChanged
01391                          ) {
01392 
01393   const int numAtoms = host__atoms_size;
01394   const int numPatchPairs = host__patch_pairs_size;
01395   const int numForceLists = host__force_lists_size;
01396 
01397   // Get and set the tag to use
01398   int *tag_kernel = ((isRemote) ? (&tag_remote_kernel) : (&tag_local_kernel));
01399   *tag_kernel = 1;
01400 
01401   // Setup the kernel data structures
01402   mic_kernel_data * remote_kernel_data = host__kernel_data + 1; //host__remote_kernel_data;
01403   mic_kernel_data * local_kernel_data = host__kernel_data; //host__local_kernel_data;
01404   mic_kernel_data * kernel_data = ((isRemote) ? (remote_kernel_data) : (local_kernel_data));
01405   mic_kernel_data * _kernel_data = host__kernel_data;
01406   #if MIC_SYNC_INPUT != 0
01407     // NOTE: With MIC_SYNC_INPUT, all input is pushed before either of the "local" or "remote" kernels
01408     //   are executed, so setup both kernel_data structs as the "remote" kernel is being issued
01409     if (singleKernelFlag != 0 || isRemote) {
01410       remote_kernel_data->isRemote = 1;
01411       local_kernel_data->isRemote = 0;
01412       remote_kernel_data->numLocalAtoms = local_kernel_data->numLocalAtoms = numLocalAtoms;
01413       remote_kernel_data->numLocalComputes = local_kernel_data->numLocalComputes = numLocalComputes;
01414       remote_kernel_data->numLocalPatches = local_kernel_data->numLocalPatches = numLocalPatches;
01415       remote_kernel_data->doSlow = local_kernel_data->doSlow = doSlow;
01416       remote_kernel_data->doEnergy = local_kernel_data->doEnergy = doEnergy;
01417       remote_kernel_data->usePairlists = local_kernel_data->usePairlists = usePairlists;
01418       remote_kernel_data->savePairlists = local_kernel_data->savePairlists = savePairlists;
01419       remote_kernel_data->lata.x = local_kernel_data->lata.x = lata.x;
01420       remote_kernel_data->lata.y = local_kernel_data->lata.y = lata.y;
01421       remote_kernel_data->lata.z = local_kernel_data->lata.z = lata.z;
01422       remote_kernel_data->latb.x = local_kernel_data->latb.x = latb.x;
01423       remote_kernel_data->latb.y = local_kernel_data->latb.y = latb.y;
01424       remote_kernel_data->latb.z = local_kernel_data->latb.z = latb.z;
01425       remote_kernel_data->latc.x = local_kernel_data->latc.x = latc.x;
01426       remote_kernel_data->latc.y = local_kernel_data->latc.y = latc.y;
01427       remote_kernel_data->latc.z = local_kernel_data->latc.z = latc.z;
01428       remote_kernel_data->numAtoms = local_kernel_data->numAtoms = numAtoms;
01429       remote_kernel_data->numPatchPairs = local_kernel_data->numPatchPairs = numPatchPairs;
01430       remote_kernel_data->numForceLists = local_kernel_data->numForceLists = numForceLists;
01431       remote_kernel_data->forceBuffersReqSize = local_kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
01432     }
01433   #else
01434     kernel_data->isRemote = isRemote;
01435     kernel_data->numLocalAtoms = numLocalAtoms;
01436     kernel_data->numLocalComputes = numLocalComputes;
01437     kernel_data->numLocalPatches = numLocalPatches;
01438     kernel_data->doSlow = doSlow;
01439     kernel_data->doEnergy = doEnergy;
01440     kernel_data->usePairlists = usePairlists;
01441     kernel_data->savePairlists = savePairlists;
01442     kernel_data->lata.x = lata.x;
01443     kernel_data->lata.y = lata.y;
01444     kernel_data->lata.z = lata.z;
01445     kernel_data->latb.x = latb.x;
01446     kernel_data->latb.y = latb.y;
01447     kernel_data->latb.z = latb.z;
01448     kernel_data->latc.x = latc.x;
01449     kernel_data->latc.y = latc.y;
01450     kernel_data->latc.z = latc.z;
01451     kernel_data->numAtoms = numAtoms;
01452     kernel_data->numPatchPairs = numPatchPairs;
01453     kernel_data->numForceLists = numForceLists;
01454     kernel_data->forceBuffersReqSize = host__force_buffers_req_size;
01455   #endif
01456 
01457   // For buffers that are only periodically copied/updated, get the
01458   //   flags that indicate whether or not those buffers should be
01459   //   transfered during this kernel invocation, and then reset them
01460   const int slowForcesNumAtoms = ((doSlow) ? (numAtoms) : (0));
01461   const int _patch_pairs_copySize = patch_pairs_copySize;  // from __thread variable to stack variable for use within offload pragma
01462   const int _force_lists_copySize = force_lists_copySize;  // from __thread variable to stack variable for use within offload pragma
01463   const int _atom_params_copySize = atom_params_copySize;  // from __thread variable to stack variable for use within offload pragma
01464   patch_pairs_copySize = 0;
01465   force_lists_copySize = 0;
01466   atom_params_copySize = 0;
01467 
01468   // Calculate the start/size of any data buffers that need to be copied to the device this timestep
01469   // NOTE: Some "remote" computes still need "local patch" data, so copy all patch data before/during "remote."
01470   int toCopySize_atoms = (singleKernelFlag != 0 || isRemote) ? (numAtoms) : (0);  // The remote kernel will copy in all atoms
01471   int toCopySize_atom_params = (singleKernelFlag != 0 || isRemote) ? (_atom_params_copySize) : (0);  // If there are atom_params to copy, the remote kernel will copy in all atom_params
01472   int toCopySize_patch_pairs = (singleKernelFlag != 0 || isRemote) ? (_patch_pairs_copySize) : (0);  // This could potentially be split between kernels (but is relatively rare, so don't worry about it for now)
01473   int toCopySize_force_lists = (singleKernelFlag != 0 || isRemote) ? (_force_lists_copySize) : (0);  // This could potentially be split between kernels (but is relatively rare, so don't worry about it for now)
01474   int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
01475   int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
01476   int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
01477   int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);
01478 
01479   // If tracing data is to be recorded, setup local variables to the host buffers that will
01480   //   receive the data and setup the clauses that will be used to transfer the data
01481   #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
01482     double * device_times_start = host__device_times_start;
01483     int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
01484     #if MIC_DEVICE_TRACING_DETAILED != 0
01485       double * device_times_computes = host__device_times_computes;
01486       double * device_times_patches = host__device_times_patches;
01487       int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
01488       int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
01489       #if MIC_SYNC_OUTPUT != 0
01490         #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
01491           in(device_times_computes[0:0] : alloc_if(0) free_if(0)) \
01492           in(device_times_patches[0:0] : alloc_if(0) free_if(0)) \
01493           nocopy(device__device_times_computes) nocopy(device__device_times_patches)
01494       #else
01495         #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
01496           out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
01497           out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0)) \
01498           nocopy(device__device_times_computes) nocopy(device__device_times_patches)
01499       #endif
01500     #else
01501       #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
01502     #endif
01503     #if MIC_SYNC_OUTPUT != 0
01504       #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
01505         MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
01506         in(device_times_start[0:0] : alloc_if(0) free_if(0)) \
01507         nocopy(device__device_times_start)
01508     #else
01509       #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
01510         MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
01511         out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0)) \
01512         nocopy(device__device_times_start)
01513     #endif
01514   #else
01515     #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
01516   #endif
01517 
01518   // Setup local pointers and offload clauses for transfering atom input data
01519   atom * atoms = host__atoms;
01520   atom_param * atom_params = host__atom_params;
01521   double4 * forces = host__forces;
01522   double4 * slow_forces = host__slow_forces;
01523   #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01524     #define MIC_DEVICE_ATOMS_CLAUSE
01525   #else
01526     #define MIC_DEVICE_ATOMS_CLAUSE \
01527       in(atoms[0:toCopySize_atoms] : alloc_if(0) free_if(0)) \
01528       in(atom_params[0:toCopySize_atom_params] : alloc_if(0) free_if(0))
01529   #endif
01530 
01531   // If kernel data transfer stats are being collected, calculate and print the amount of data
01532   //   being transfered, separated by type : in, out, or inout.
01533   // NOTE: Only include data required for production runs (e.g. not tracing data).
01534   #if MIC_KERNEL_DATA_TRANSFER_STATS != 0
01535     int transfer_in = sizeof(isRemote)
01536       + (3 * sizeof(lata)) // for lata, latb, and latc
01537       + sizeof(size_t) // for device__force_buffers_req_size
01538       + (sizeof(atom) * toCopySize_atoms)
01539       + (sizeof(atom_param) * toCopySize_atom_params)
01540       + (sizeof(patch_pair) * toCopySize_patch_pairs)
01541       + (sizeof(force_list) * toCopySize_force_lists);
01542     int transfer_inout = sizeof(mic_kernel_data);
01543     int transfer_out = sizeof(double4) * (toCopySize_forces + toCopySize_slow_forces)
01544       + sizeof(int); // for device__timestep
01545     #if MIC_DEBUG > 0
01546       MICP("timestep:%06d - transfering %d bytes (isRemote:%d, in:%d, inout:%d, out:%d)\n", \
01547            host__timestep, transfer_in + transfer_inout + transfer_out, \
01548            isRemote, transfer_in, transfer_inout, transfer_out \
01549           ); MICPF;
01550     #else
01551       printf("[MIC-DEBUG] :: PE %04d-%05d-%1d :: transfering %d bytes (in:%d, inout:%d, out:%d)\n",
01552              host__pe, host__timestep, isRemote,
01553              transfer_in + transfer_inout + transfer_out, transfer_in, transfer_inout, transfer_out
01554             );
01555     #endif
01556   #endif
01557 
01558   // Setup input clauses required for this kernel.
01559   // NOTE: If MIC_SYNC_INPUT is being used, use an offload pragma to first transfer the data to
01560   //   the device and use nocopy clauses for the actual kernel offload pragmas.
01561   patch_pair * patch_pairs = host__patch_pairs;
01562   force_list * force_lists = host__force_lists;
01563   #if MIC_SYNC_INPUT != 0
01564 
01565     // Push the data to the device
01566     if (singleKernelFlag != 0 || isRemote != 0) {
01567       MICP("<< pushing input data >>\n"); MICPF;
01568       #if MIC_TRACING != 0
01569         double input_start = CmiWallTimer();
01570       #endif
01571       #pragma offload_transfer target(mic:deviceNum) \
01572         in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
01573         in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
01574         in(_kernel_data[0:2] : alloc_if(0) free_if(0)) \
01575         MIC_DEVICE_ATOMS_CLAUSE
01576       { }
01577       #if MIC_TRACING != 0
01578         double input_end = CmiWallTimer();
01579         MIC_TRACING_RECORD(MIC_EVENT_SYNC_INPUT_PRAGMA, input_start, input_end);
01580       #endif
01581     }
01582 
01583     // Setup the clauses for the actual kernel offload pragma (so that data is not transfered)
01584     #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01585       #define TMP_SYNC_ATOMS_CLAUSE
01586     #else
01587       #define TMP_SYNC_ATOMS_CLAUSE \
01588         in(atoms[0:0] : alloc_if(0) free_if(0)) \
01589         in(atom_params[0:0] : alloc_if(0) free_if(0))
01590     #endif
01591 
01592     #if MIC_SYNC_OUTPUT != 0
01593       #define TMP_SYNC_KERNEL_DATA_CLAUSE \
01594         in(_kernel_data[0:0] : alloc_if(0) free_if(0))
01595     #else
01596       #define TMP_SYNC_KERNEL_DATA_CLAUSE \
01597         out(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
01598     #endif
01599 
01600     #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
01601       nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
01602       nocopy(device__atoms_size) \
01603       in(patch_pairs[0:0] : alloc_if(0) free_if(0)) nocopy(device__patch_pairs_size) \
01604       in(force_lists[0:0] : alloc_if(0) free_if(0)) nocopy(device__force_lists_size) \
01605       TMP_SYNC_KERNEL_DATA_CLAUSE \
01606       TMP_SYNC_ATOMS_CLAUSE
01607 
01608   #else
01609 
01610     #if MIC_SYNC_OUTPUT != 0
01611       #define TMP_SYNC_KERNEL_DATA_CLAUSE \
01612         in(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
01613     #else
01614       #define TMP_SYNC_KERNEL_DATA_CLAUSE \
01615         inout(_kernel_data[isRemote:1] : alloc_if(0) free_if(0))
01616     #endif
01617 
01618     // Setup the clauses for the actual kernel offload pragma (so that data is transfered)
01619     #define MIC_DEVICE_SYNC_INPUT_CLAUSE \
01620       nocopy(device__lata) nocopy(device__latb) nocopy(device__latc) \
01621       in(patch_pairs[0:toCopySize_patch_pairs] : alloc_if(0) free_if(0)) \
01622       in(force_lists[0:toCopySize_force_lists] : alloc_if(0) free_if(0)) \
01623       nocopy(device__atoms_size) nocopy(device__patch_pairs_size) nocopy(device__force_lists_size) \
01624       TMP_SYNC_KERNEL_DATA_CLAUSE \
01625       MIC_DEVICE_ATOMS_CLAUSE
01626 
01627   #endif
01628 
01629   MICP("<< issuing %s kernel >>\n", (isRemote) ? ("remote") : ("local")); MICPF;
01630 
01631   #if MIC_SYNC_OUTPUT != 0
01632     #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
01633       in(forces[0:0] : alloc_if(0) free_if(0)) \
01634       in(slow_forces[0:0] : alloc_if(0) free_if(0)) \
01635       nocopy(device__timestep)
01636   #else
01637     #define MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
01638       out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
01639       out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
01640       out(device__timestep : into(host__timestep))
01641   #endif
01642 
01643   #if MIC_DATA_STRUCT_VERIFY != 0
01644     #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
01645       nocopy(device__exclusion_bits_copy) nocopy(device__constants_copy) \
01646       nocopy(device__table_four_copy) nocopy(device__table_four_float_copy) \
01647       nocopy(device__lj_table_copy) nocopy(device__lj_table_float_copy) \
01648       nocopy(device__patch_pairs_copy) nocopy(device__force_lists_copy) \
01649       nocopy(device__atoms_copy) nocopy(device__atom_params_copy) \
01650       nocopy(device__patch_pairs_copy_size) nocopy(device__force_lists_copy_size) \
01651       nocopy(device__atoms_copy_size) nocopy(device__atom_params_copy_size)
01652   #else
01653     #define MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
01654   #endif
01655 
01656   #if REFINE_PAIRLISTS != 0
01657     #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
01658       nocopy(device__r2_array) \
01659       nocopy(device__pl_array) \
01660       nocopy(device__pl_size)
01661   #else
01662     #define MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
01663   #endif
01664 
01665   #if MIC_TRACING != 0
01666     double pragma_start = CmiWallTimer();
01667   #endif
01668 
01669   // Trigger computation on the MIC device (asynchronous offload pragma that does the kernel work)
01670   #pragma offload target(mic:deviceNum) \
01671     in(isRemote) \
01672     MIC_DEVICE_SYNC_INPUT_CLAUSE \
01673     MIC_DEVICE_SYNC_OUTPUT_CLAUSE \
01674     MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
01675     nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
01676     nocopy(device__force_buffers_alloc_size) nocopy(device__force_buffers_req_size) \
01677     nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
01678     nocopy(device__table_four) nocopy(device__table_four_float) nocopy(device__table_four_n_16) \
01679     nocopy(device__lj_table) nocopy(device__lj_table_float) nocopy(device__lj_table_dim) \
01680     nocopy(device__exclusion_bits) \
01681     nocopy(device__constants) \
01682     nocopy(device__atoms) nocopy(device__atom_params) \
01683     nocopy(device__forces) nocopy(device__slow_forces) \
01684     nocopy(device__patch_pairs) nocopy(device__force_lists) \
01685     nocopy(device__numOMPThreads) \
01686     MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE \
01687     MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE \
01688     nocopy(device__pe) nocopy(device__node) \
01689     DEVICE_FPRINTF_CLAUSE
01690   {
01691 //    DEVICE_FPRINTF_CLAUSE \
01692 //    signal(tag_kernel)
01693     __MUST_BE_MIC;
01694     __FULL_CHECK(__ASSERT(isRemote == 0 || isRemote == 1));
01695     mic_kernel_data * kernel_data = _kernel_data + isRemote;
01696     __FULL_CHECK(__ASSERT(kernel_data != NULL));
01697     __FULL_CHECK(__ASSERT(device__numOMPThreads > 0));
01698 
01699     #if (MIC_DEVICE_FPRINTF != 0) && (MIC_DEVICE_FPRINTF_REOPEN_FREQ > 0)
01700       if ((device__timestep != 0 && device__timestep % MIC_DEVICE_FPRINTF_REOPEN_FREQ == 0) && (kernel_data->isRemote == 0)) {
01701         char filename[128] = { 0 };
01702         sprintf(filename, "/tmp/namd_deviceDebugInfo.%d", device__pe);
01703         if (device__node <= 0) {
01704           printf("[MIC-DEBUG] :: Reopening debug output files on MIC devices (timestep: %d).\n", device__timestep);
01705           fflush(NULL);
01706         }
01707         fclose(device__fout);
01708         device__fout = fopen(filename, "w");
01709         DEVICE_FPRINTF("Device file on PE %d (node: %d) - reopen (timestep: %d)...\n", device__pe, device__node, device__timestep);
01710       }
01711     #endif
01712 
01713     DEVICE_FPRINTF("%d %s ", device__timestep, (kernel_data->isRemote != 0 ? "R" : "L"));
01714 
01715     // DMK - TODO | FIXME - For now, copy values over into device__xxx variables
01716     //   so the older version of the code will still work (remove them eventually)
01717     device__patch_pairs = patch_pairs;
01718     device__force_lists = force_lists;
01719     device__atoms = atoms;
01720     device__atom_params = atom_params;
01721     device__forces = forces;
01722     device__slow_forces = slow_forces;
01723     __ASSERT(device__patch_pairs != NULL);
01724     __ASSERT(device__force_lists != NULL);
01725     __ASSERT(device__atoms != NULL);
01726     __ASSERT(device__atom_params != NULL);
01727     __ASSERT(device__forces != NULL);
01728     __ASSERT(device__slow_forces != NULL);
01729     #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
01730       device__device_times_start = device_times_start;
01731       __ASSERT(device__device_times_start);
01732       #if MIC_DEVICE_TRACING_DETAILED != 0
01733         device__device_times_computes = device_times_computes;
01734         device__device_times_patches = device_times_patches;
01735         __ASSERT(device__device_times_computes);
01736         __ASSERT(device__device_times_patches);
01737       #endif
01738     #endif
01739     device__lata.x = kernel_data->lata.x;
01740     device__lata.y = kernel_data->lata.y;
01741     device__lata.z = kernel_data->lata.z;
01742     device__latb.x = kernel_data->latb.x;
01743     device__latb.y = kernel_data->latb.y;
01744     device__latb.z = kernel_data->latb.z;
01745     device__latc.x = kernel_data->latc.x;
01746     device__latc.y = kernel_data->latc.y;
01747     device__latc.z = kernel_data->latc.z;
01748     device__atoms_size = kernel_data->numAtoms;
01749     device__patch_pairs_size = kernel_data->numPatchPairs;
01750     device__force_lists_size = kernel_data->numForceLists;
01751     device__force_buffers_req_size = kernel_data->forceBuffersReqSize;
01752 
01753     // TRACING - Record the start time of the kernel
01754     #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
01755       __ASSERT(device__device_times_start != NULL);
01756       device__device_times_start[((isRemote)?(0):(1))] = getCurrentTime();
01757     #endif
01758 
01759     // If data structure verification is being performed, create copies of the buffers that can
01760     //   change throughout the simulation at the start of the timestep on the device, and verify
01761     //   the lookup tables before any work is performed.
01762     #if MIC_DATA_STRUCT_VERIFY != 0
01763 
01764       _verify_tables(device__pe, device__timestep, isRemote, -1);
01765 
01766       _verify_remalloc<patch_pair>(device__patch_pairs_copy_size, device__patch_pairs_size, device__patch_pairs_copy);
01767       _verify_remalloc<force_list>(device__force_lists_copy_size, device__force_lists_size, device__force_lists_copy);
01768       _verify_remalloc<atom>(device__atoms_copy_size, device__atoms_size, device__atoms_copy);
01769       _verify_remalloc<atom_param>(device__atom_params_copy_size, device__atoms_size, device__atom_params_copy);
01770       __ASSERT(device__patch_pairs_copy != NULL && device__force_lists_copy != NULL);
01771       __ASSERT(device__atoms_copy != NULL && device__atom_params_copy != NULL);
01772 
01773       _verify_parallel_memcpy(device__patch_pairs_copy, device__patch_pairs, sizeof(patch_pair) * device__patch_pairs_size);
01774       _verify_parallel_memcpy(device__force_lists_copy, device__force_lists, sizeof(force_list) * device__force_lists_size);
01775       _verify_parallel_memcpy(device__atoms_copy, device__atoms, sizeof(atom) * device__atoms_size);
01776       _verify_parallel_memcpy(device__atom_params_copy, device__atom_params, sizeof(atom_param) * device__atoms_size);
01777     #endif
01778 
01779     // Make sure there is enough memory allocated for the force buffers (reallocate if not)
01780     if (device__force_buffers_req_size > device__force_buffers_alloc_size) {
01781       if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); }
01782       if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); }
01783       device__force_buffers_alloc_size = device__force_buffers_req_size;
01784       size_t reqSize = sizeof(double) * 4 * device__force_buffers_alloc_size;
01785       device__force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64, "device__force_buffers"));
01786       device__slow_force_buffers = (double4*)(_MM_MALLOC_WRAPPER(reqSize, 64, "device__slow_force_buffers"));
01787       __ASSERT(device__force_buffers != NULL && device__slow_force_buffers != NULL);
01788     }
01789 
01790     // Declare and initialize the overall virial summation/accumulation variables that will
01791     //   eventually be passed back to the host
01792     double virial_xx = 0.0;
01793     double virial_xy = 0.0;
01794     double virial_xz = 0.0;
01795     double virial_yy = 0.0;
01796     double virial_yz = 0.0;
01797     double virial_zz = 0.0;
01798     double fullElectVirial_xx = 0.0;
01799     double fullElectVirial_xy = 0.0;
01800     double fullElectVirial_xz = 0.0;
01801     double fullElectVirial_yy = 0.0;
01802     double fullElectVirial_yz = 0.0;
01803     double fullElectVirial_zz = 0.0;
01804     double vdwEnergy = 0.0;
01805     double electEnergy = 0.0;
01806     double fullElectEnergy = 0.0;
01807     #if MIC_EXCL_CHECKSUM_FULL != 0
01808       int exclusionSum = 0;
01809     #endif
01810 
01811     // Calculate the lower and upper bounds for the patch pairs that will be computed within this kernel
01812     const int ppI_start = (isRemote) ? (kernel_data->numLocalComputes) : (0);
01813     const int ppI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__patch_pairs_size) : (kernel_data->numLocalComputes);
01814     __FULL_CHECK(__ASSERT(ppI_start >= 0 && ppI_start <= device__patch_pairs_size));
01815     __FULL_CHECK(__ASSERT(ppI_end >= 0 && ppI_end <= device__patch_pairs_size));
01816     __FULL_CHECK(__ASSERT(ppI_start <= ppI_end));
01817 
01818     // TRACING - Record the start time for the patch pairs (computes) parallel region
01819     #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
01820       device__device_times_start[((isRemote)?(2):(3))] = getCurrentTime();
01821     #endif
01822 
01823     DEVICE_FPRINTF(" C(%d,%d)", ppI_start, ppI_end);
01824 
01825     // Process each patch pair (compute)
01826     #pragma novector
01827     #if MULTIPLE_THREADS != 0
01828       #if MIC_EXCL_CHECKSUM_FULL != 0
01829         #define EXCL_CHECKSUM_CLAUSE  reduction(+ : exclusionSum)
01830       #else
01831         #define EXCL_CHECKSUM_CLAUSE
01832       #endif
01833       #pragma omp parallel for schedule(dynamic, 1) \
01834         reduction(+ : vdwEnergy, electEnergy, fullElectEnergy) \
01835         reduction(+ : virial_xx, virial_xy, virial_xz, virial_yy, virial_yz, virial_zz) \
01836         reduction(+ : fullElectVirial_xx, fullElectVirial_xy, fullElectVirial_xz, fullElectVirial_yy, fullElectVirial_yz, fullElectVirial_zz ) \
01837         EXCL_CHECKSUM_CLAUSE
01838       #undef EXCL_CHECKSUM_CLAUSE
01839     #endif
01840     for (int ppI = ppI_start; ppI < ppI_end; ppI++) {
01841 
01842       DEVICE_FPRINTF(" C");
01843 
01844       // TRACING - Record the start of each patch pair (compute)
01845       #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
01846         __FULL_CHECK(__ASSERT(device__device_times_computes != NULL));
01847         device__device_times_computes[ppI * 2] = getCurrentTime();
01848       #endif
01849 
01850       // Create and populate the mic_params data structure (passed into the kernel function)
01851       mic_params params;
01852 
01853       // Initialize the pointer to the patch_pairs structure for this compute
01854       params.pp = (patch_pair*)(device__patch_pairs + ppI);
01855       __FULL_CHECK(__ASSERT(params.pp != NULL));
01856 
01857       #if MIC_SORT_LISTS != 0
01858         const int abSwapFlag = ((params.pp->patch1_size < params.pp->patch2_size) ? (1) : (0));
01859         #define ABSWAP(t, f)  ((abSwapFlag)?(t):(f))
01860       #else
01861         #define ABSWAP(t, f)  (f)
01862       #endif
01863 
01864       // DMK - DEBUG - Set a few values used for debugging
01865       params.ppI = ppI;
01866       params.p1 = ABSWAP(params.pp->p2, params.pp->p1);
01867       params.p2 = ABSWAP(params.pp->p1, params.pp->p2);
01868       params.pe = device__pe;
01869       params.timestep = device__timestep;
01870 
01871       // If pairlist refinement is enabled, initialize the related scratch buffer pointers
01872       // In some cases, buffers used by the compute kernel functions (calc_self, etc.) only
01873       //   require scratch buffers (i.e. not used for input or output).  As such, we can
01874       //   allocate these buffers on a one-per-thread basis rather than a one-per-compute
01875       //   basis to save some memory (and get memory location reuse for any given thread).
01876       //   These fields relate to the pointer and sizes for those buffers.
01877       #if REFINE_PAIRLISTS != 0
01878         const int threadIndex = omp_get_thread_num();
01879         params.plArray_ptr = device__pl_array + threadIndex;
01880         params.plSize_ptr = device__pl_size + threadIndex;
01881         params.r2Array_ptr = device__r2_array + threadIndex;
01882       #endif
01883 
01884       // Initialize pointers to the various lookup tables, constants, etc. that are used
01885       //   from within the compute functions
01886       #if MIC_HANDCODE_FORCE_SINGLE != 0
01887         params.table_four_base_ptr = (void*)device__table_four_float;
01888         params.lj_table_base_ptr = (void*)device__lj_table_float;
01889       #else
01890         params.table_four_base_ptr = (void*)device__table_four;
01891         params.lj_table_base_ptr = (void*)device__lj_table;
01892       #endif
01893       params.table_four_n_16 = device__table_four_n_16;
01894       params.lj_table_dim = device__lj_table_dim;
01895       params.exclusion_bits = device__exclusion_bits;
01896       params.constants = device__constants;
01897       __FULL_CHECK(__ASSERT(params.table_four_base_ptr != NULL));
01898       __FULL_CHECK(__ASSERT(params.lj_table_base_ptr != NULL));
01899       __FULL_CHECK(__ASSERT(params.table_four_n_16 % 16 == 0));
01900       __FULL_CHECK(__ASSERT(params.exclusion_bits != NULL));
01901       __FULL_CHECK(__ASSERT(params.constants != NULL));
01902 
01903       // Setup pairlist flags and pairlist buffer (an array of pointers, one for each pairlist,
01904       //   for each compute)
01905       params.usePairlists = kernel_data->usePairlists;
01906       params.savePairlists = kernel_data->savePairlists;
01907       params.pairlists_ptr = &(((int**)device__pairlists)[NUM_PAIRLIST_TYPES * ppI]);
01908       __FULL_CHECK(__ASSERT(device__pairlists != NULL));
01909       __FULL_CHECK(__ASSERT(params.pairlists_ptr != NULL));
01910 
01911       // Setup the sizes of the atom lists and force lists
01912       int n0 = ABSWAP(params.pp->patch2_size, params.pp->patch1_size);
01913       int n1 = ABSWAP(params.pp->patch1_size, params.pp->patch2_size);
01914       int n0_16 = (n0 + 15) & (~15); // Round up to nearest multiple of 16
01915       int n1_16 = (n1 + 15) & (~15);
01916       params.numAtoms[0] = n0;
01917       params.numAtoms[1] = n1;
01918       params.numAtoms_16[0] = n0_16;
01919       params.numAtoms_16[1] = n1_16;
01920       __FULL_CHECK(__ASSERT(params.numAtoms[0] >= 0));
01921       __FULL_CHECK(__ASSERT(params.numAtoms[1] >= 0));
01922 
01923       // Setup the pointers to the input particle data for this compute
01924       // NOTE: All of the particle data is sent in two chunks (atoms and atom_params, where
01925       //   atoms changes every timestep but atom_params only changes periodically).  Each
01926       //   of these buffers contains several sub-arrays (one for each field) in an
01927       //   structure-of-arrays (SOA) format.  This code, along with the force code
01928       //   below it, is simply creating the array pointers for each "field's array."
01929       #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
01930         mic_position_t* pBase0 = (mic_position_t*)(params.pp->patch1_atomDataPtr);
01931         mic_position_t* pBase1 = (mic_position_t*)(params.pp->patch2_atomDataPtr);
01932         int *pExtBase0 = (int*)(pBase0 + n0_16);
01933         int *pExtBase1 = (int*)(pBase1 + n1_16);
01934       #else
01935         __FULL_CHECK(__ASSERT(device__atoms != NULL));
01936         __FULL_CHECK(__ASSERT(device__atom_params != NULL));
01937 
01938         __FULL_CHECK(__ASSERT(params.pp->patch1_atom_start >= 0));
01939         __FULL_CHECK(__ASSERT((params.pp->patch1_atom_start < device__atoms_size) || (params.pp->patch1_atom_start == device__atoms_size && n0 == 0)));
01940         __FULL_CHECK(__ASSERT(params.pp->patch2_atom_start >= 0));
01941         __FULL_CHECK(__ASSERT((params.pp->patch2_atom_start < device__atoms_size) || (params.pp->patch2_atom_start == device__atoms_size && n1 == 0)));
01942 
01943         mic_position_t* pBase0 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
01944         mic_position_t* pBase1 = (mic_position_t*)(device__atoms + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
01945         int* pExtBase0 = (int*)(device__atom_params + ABSWAP(params.pp->patch2_atom_start, params.pp->patch1_atom_start));
01946         int* pExtBase1 = (int*)(device__atom_params + ABSWAP(params.pp->patch1_atom_start, params.pp->patch2_atom_start));
01947       #endif
01948       __FULL_CHECK(__ASSERT(pBase0 != NULL));
01949       __FULL_CHECK(__ASSERT(pBase1 != NULL));
01950       __FULL_CHECK(__ASSERT(pExtBase0 != NULL));
01951       __FULL_CHECK(__ASSERT(pExtBase1 != NULL));
01952       #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01953         params.p[0] = (atom*)pBase0;
01954         params.p[1] = (atom*)pBase1;
01955         params.pExt[0] = (atom_param*)pExtBase0;
01956         params.pExt[1] = (atom_param*)pExtBase1;
01957       #else
01958         params.p_x[0] = pBase0 + (0 * n0_16); params.p_x[1] = pBase1 + (0 * n1_16);
01959         params.p_y[0] = pBase0 + (1 * n0_16); params.p_y[1] = pBase1 + (1 * n1_16);
01960         params.p_z[0] = pBase0 + (2 * n0_16); params.p_z[1] = pBase1 + (2 * n1_16);
01961         params.p_q[0] = pBase0 + (3 * n0_16); params.p_q[1] = pBase1 + (3 * n1_16);
01962         params.pExt_vdwType[0] = pExtBase0 + (0 * n0_16);
01963         params.pExt_index[0] = pExtBase0 + (1 * n0_16);
01964         params.pExt_exclIndex[0] = pExtBase0 + (2 * n0_16);
01965         params.pExt_exclMaxDiff[0] = pExtBase0 + (3 * n0_16);
01966         params.pExt_vdwType[1] = pExtBase1 + (0 * n1_16);
01967         params.pExt_index[1] = pExtBase1 + (1 * n1_16);
01968         params.pExt_exclIndex[1] = pExtBase1 + (2 * n1_16);
01969         params.pExt_exclMaxDiff[1] = pExtBase1 + (3 * n1_16);
01970       #endif
01971 
01972       // Setup the pointers to the output force data for this compute
01973       // NOTE: Forces are output every timestep, but slow forces (fullf) are output only
01974       //   during some timesteps.
01975       __FULL_CHECK(__ASSERT(device__force_lists != NULL));
01976       __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
01977       __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
01978       __FULL_CHECK(__ASSERT(params.pp->patch1_force_list_index < device__force_lists_size));
01979       __FULL_CHECK(__ASSERT(params.pp->patch2_force_list_index < device__force_lists_size));
01980       __FULL_CHECK(__ASSERT(params.pp->patch1_force_start < device__force_buffers_req_size));
01981       __FULL_CHECK(__ASSERT(params.pp->patch2_force_start < device__force_buffers_req_size));
01982       params.doSlow = kernel_data->doSlow;  // Flag indicating if slow forces should be calculate this timestep or not
01983       params.fl[0] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch2_force_list_index, params.pp->patch1_force_list_index));
01984       params.fl[1] = (force_list*)(device__force_lists + ABSWAP(params.pp->patch1_force_list_index, params.pp->patch2_force_list_index));
01985       double *ffBase0 = (double*)(device__force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
01986       double *ffBase1 = (double*)(device__force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
01987       double *fullfBase0 = (double*)(device__slow_force_buffers + ABSWAP(params.pp->patch2_force_start, params.pp->patch1_force_start));
01988       double *fullfBase1 = (double*)(device__slow_force_buffers + ABSWAP(params.pp->patch1_force_start, params.pp->patch2_force_start));
01989       #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
01990         params.ff[0] = (double4*)ffBase0;
01991         params.ff[1] = (double4*)ffBase1;
01992         params.fullf[0] = (double4*)fullfBase0;
01993         params.fullf[1] = (double4*)fullfBase1;
01994         __ASSERT(params.ff[0] != NULL);
01995         __ASSERT(params.ff[1] != NULL);
01996         __ASSERT(params.fullf[0] != NULL);
01997         __ASSERT(params.fullf[1] != NULL);
01998       #else
01999         params.ff_x[0] = ffBase0 + (0 * n0_16); params.ff_x[1] = ffBase1 + (0 * n1_16);
02000         params.ff_y[0] = ffBase0 + (1 * n0_16); params.ff_y[1] = ffBase1 + (1 * n1_16);
02001         params.ff_z[0] = ffBase0 + (2 * n0_16); params.ff_z[1] = ffBase1 + (2 * n1_16);
02002         params.ff_w[0] = ffBase0 + (3 * n0_16); params.ff_w[1] = ffBase1 + (3 * n1_16);
02003         params.fullf_x[0] = fullfBase0 + (0 * n0_16); params.fullf_x[1] = fullfBase1 + (0 * n1_16);
02004         params.fullf_y[0] = fullfBase0 + (1 * n0_16); params.fullf_y[1] = fullfBase1 + (1 * n1_16);
02005         params.fullf_z[0] = fullfBase0 + (2 * n0_16); params.fullf_z[1] = fullfBase1 + (2 * n1_16);
02006         params.fullf_w[0] = fullfBase0 + (3 * n0_16); params.fullf_w[1] = fullfBase1 + (3 * n1_16);
02007         __ASSERT(params.ff_x[0] != NULL); __ASSERT(params.ff_x[1] != NULL);
02008         __ASSERT(params.ff_y[0] != NULL); __ASSERT(params.ff_y[1] != NULL);
02009         __ASSERT(params.ff_z[0] != NULL); __ASSERT(params.ff_z[1] != NULL);
02010         __ASSERT(params.ff_w[0] != NULL); __ASSERT(params.ff_w[1] != NULL);
02011         __ASSERT(params.fullf_x[0] != NULL); __ASSERT(params.fullf_x[1] != NULL);
02012         __ASSERT(params.fullf_y[0] != NULL); __ASSERT(params.fullf_y[1] != NULL);
02013         __ASSERT(params.fullf_z[0] != NULL); __ASSERT(params.fullf_z[1] != NULL);
02014         __ASSERT(params.fullf_w[0] != NULL); __ASSERT(params.fullf_w[1] != NULL);
02015       #endif
02016 
02017       // Create the offsets for the first list of particles.
02018       // NOTE: In this version of the nonbonded code, the positions of the atoms are stored
02019       //   as offsets within the patch in which they are located.  These offsets represent
02020       //   the offsets between the two patches being interacted (including periodic boundaries).
02021       params.offset.x = params.pp->offset.x * device__lata.x
02022                       + params.pp->offset.y * device__latb.x
02023                       + params.pp->offset.z * device__latc.x;
02024       params.offset.y = params.pp->offset.x * device__lata.y
02025                       + params.pp->offset.y * device__latb.y
02026                       + params.pp->offset.z * device__latc.y;
02027       params.offset.z = params.pp->offset.x * device__lata.z
02028                       + params.pp->offset.y * device__latb.z
02029                       + params.pp->offset.z * device__latc.z;
02030       #if MIC_SORT_LISTS != 0
02031         params.offset.x *= ABSWAP(-1.0f, 1.0f);
02032         params.offset.y *= ABSWAP(-1.0f, 1.0f);
02033         params.offset.z *= ABSWAP(-1.0f, 1.0f);
02034       #endif
02035 
02036       // DMK - DEBUG - Record the center point for each of the input patches
02037       float patch1_center_x = params.pp->patch1_center_x * device__lata.x
02038                              + params.pp->patch1_center_y * device__latb.x
02039                              + params.pp->patch1_center_z * device__latc.x;
02040       float patch1_center_y = params.pp->patch1_center_x * device__lata.y
02041                              + params.pp->patch1_center_y * device__latb.y
02042                              + params.pp->patch1_center_z * device__latc.y;
02043       float patch1_center_z = params.pp->patch1_center_x * device__lata.z
02044                              + params.pp->patch1_center_y * device__latb.z
02045                              + params.pp->patch1_center_z * device__latc.z;
02046       float patch2_center_x = params.pp->patch2_center_x * device__lata.x
02047                              + params.pp->patch2_center_y * device__latb.x
02048                              + params.pp->patch2_center_z * device__latc.x;
02049       float patch2_center_y = params.pp->patch2_center_x * device__lata.y
02050                              + params.pp->patch2_center_y * device__latb.y
02051                              + params.pp->patch2_center_z * device__latc.y;
02052       float patch2_center_z = params.pp->patch2_center_x * device__lata.z
02053                              + params.pp->patch2_center_y * device__latb.z
02054                              + params.pp->patch2_center_z * device__latc.z;
02055       params.patch1_center_x = ABSWAP(patch2_center_x, patch1_center_x);
02056       params.patch2_center_x = ABSWAP(patch1_center_x, patch2_center_x);
02057       params.patch1_center_y = ABSWAP(patch2_center_y, patch1_center_y);
02058       params.patch2_center_y = ABSWAP(patch1_center_y, patch2_center_y);
02059       params.patch1_center_z = ABSWAP(patch2_center_z, patch1_center_z);
02060       params.patch2_center_z = ABSWAP(patch1_center_z, patch2_center_z);
02061 
02062       // Initialize the virial accumulators for this compute (zero them out)
02063       params.virial_xx = 0;
02064       params.virial_xy = 0;
02065       params.virial_xz = 0;
02066       params.virial_yy = 0;
02067       params.virial_yz = 0;
02068       params.virial_zz = 0;
02069       params.fullElectVirial_xx = 0;
02070       params.fullElectVirial_xy = 0;
02071       params.fullElectVirial_xz = 0;
02072       params.fullElectVirial_yy = 0;
02073       params.fullElectVirial_yz = 0;
02074       params.fullElectVirial_zz = 0;
02075       params.vdwEnergy = 0.0;
02076       params.electEnergy = 0.0;
02077       params.fullElectEnergy = 0.0;
02078       #if MIC_EXCL_CHECKSUM_FULL != 0
02079         params.exclusionSum = 0;
02080       #endif
02081 
02082       // Select the version of the kernel to call based on the timestep's requirements
02083       //   and what type of compute this compute is
02084       int isSelf = (params.pp->patch1_force_start == params.pp->patch2_force_start);
02085                     // NOTE: Many ways to check this (arbitrary test used here)
02086       int selfBit = ((isSelf) ? (0x01) : (0x00));
02087       int doSlowBit = ((kernel_data->doSlow) ? (0x02) : (0x00));
02088       int doEnergyBit = ((kernel_data->doEnergy) ? (0x04) : (0x00));
02089       int kernelSelect = selfBit | doSlowBit | doEnergyBit;
02090       DEVICE_FPRINTF("%d", kernelSelect);
02091       switch (kernelSelect) {
02092         case 0x00: calc_pair(params); break;
02093         case 0x01: calc_self(params); break;
02094         case 0x02: calc_pair_fullelect(params); break;
02095         case 0x03: calc_self_fullelect(params); break;
02096         case 0x04: calc_pair_energy(params); break;
02097         case 0x05: calc_self_energy(params); break;
02098         case 0x06: calc_pair_energy_fullelect(params); break;
02099         case 0x07: calc_self_energy_fullelect(params); break;
02100         default:
02101           mic_dev_die("!!! INVALID KERNEL SELECTION ON MIC DEVICE !!!\n");
02102           break;
02103       } // end switch (kernelSelect)
02104 
02105       // Contribute this compute's virial summations into the overall virial summation
02106       //   that will be passed back to the host
02107       #if MIC_SORT_LISTS != 0
02108         if (abSwapFlag) {
02109           virial_xx -= params.virial_xx;
02110           virial_xy -= params.virial_xy;
02111           virial_xz -= params.virial_xz;
02112           virial_yy -= params.virial_yy;
02113           virial_yz -= params.virial_yz;
02114           virial_zz -= params.virial_zz;
02115           fullElectVirial_xx -= params.fullElectVirial_xx;
02116           fullElectVirial_xy -= params.fullElectVirial_xy;
02117           fullElectVirial_xz -= params.fullElectVirial_xz;
02118           fullElectVirial_yy -= params.fullElectVirial_yy;
02119           fullElectVirial_yz -= params.fullElectVirial_yz;
02120           fullElectVirial_zz -= params.fullElectVirial_zz;
02121           vdwEnergy -= params.vdwEnergy;
02122           electEnergy -= params.electEnergy;
02123           fullElectEnergy -= params.fullElectEnergy;
02124         } else {
02125       #endif
02126           virial_xx += params.virial_xx;
02127           virial_xy += params.virial_xy;
02128           virial_xz += params.virial_xz;
02129           virial_yy += params.virial_yy;
02130           virial_yz += params.virial_yz;
02131           virial_zz += params.virial_zz;
02132           fullElectVirial_xx += params.fullElectVirial_xx;
02133           fullElectVirial_xy += params.fullElectVirial_xy;
02134           fullElectVirial_xz += params.fullElectVirial_xz;
02135           fullElectVirial_yy += params.fullElectVirial_yy;
02136           fullElectVirial_yz += params.fullElectVirial_yz;
02137           fullElectVirial_zz += params.fullElectVirial_zz;
02138           vdwEnergy += params.vdwEnergy;
02139           electEnergy += params.electEnergy;
02140           fullElectEnergy += params.fullElectEnergy;
02141       #if MIC_SORT_LISTS != 0
02142         }
02143       #endif
02144 
02145       #if MIC_EXCL_CHECKSUM_FULL != 0
02146         exclusionSum += params.exclusionSum;
02147       #endif
02148 
02149       #undef ABSWAP
02150 
02151       // TRACING - Record the end time for the compute
02152       #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
02153         device__device_times_computes[ppI * 2 + 1] = getCurrentTime();
02154       #endif
02155 
02156     } // end parallel for (ppI < device__patch_pairs_size)
02157 
02158     // TRACING - Record the end time for all the computes
02159     #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
02160       device__device_times_start[((isRemote)?(4):(5))] = getCurrentTime();
02161     #endif
02162 
02163     // Store the reduced virial values into the kernel_data structure to be passed
02164     //   back to the host core
02165     kernel_data->virial_xx = virial_xx;
02166     kernel_data->virial_xy = virial_xy;
02167     kernel_data->virial_xz = virial_xz;
02168     kernel_data->virial_yy = virial_yy;
02169     kernel_data->virial_yz = virial_yz;
02170     kernel_data->virial_zz = virial_zz;
02171     kernel_data->fullElectVirial_xx = fullElectVirial_xx;
02172     kernel_data->fullElectVirial_xy = fullElectVirial_xy;
02173     kernel_data->fullElectVirial_xz = fullElectVirial_xz;
02174     kernel_data->fullElectVirial_yy = fullElectVirial_yy;
02175     kernel_data->fullElectVirial_yz = fullElectVirial_yz;
02176     kernel_data->fullElectVirial_zz = fullElectVirial_zz;
02177     kernel_data->vdwEnergy = vdwEnergy;
02178     kernel_data->electEnergy = electEnergy;
02179     kernel_data->fullElectEnergy = fullElectEnergy;
02180     #if MIC_EXCL_CHECKSUM_FULL != 0
02181       kernel_data->exclusionSum = exclusionSum;
02182     #endif
02183 
02184     // Calculate the upper and lower bounds for the force lists to be processed by this kernel
02185     int flI_start = (isRemote) ? (kernel_data->numLocalPatches) : (0);
02186     int flI_end = (device__constants->singleKernelFlag != 0 || isRemote) ? (device__force_lists_size) : (kernel_data->numLocalPatches);
02187     __FULL_CHECK(__ASSERT(flI_start >= 0 && flI_start <= device__force_lists_size));
02188     __FULL_CHECK(__ASSERT(flI_end >= 0 && flI_start <= device__force_lists_size));
02189     __FULL_CHECK(__ASSERT(flI_start <= flI_end));
02190 
02191     // Because there are fewer patches than computes, create more parallelism by sub-dividing the
02192     //   the force lists (patches), causing each patch to be processed by multiple threads
02193     const int numThreads = device__numOMPThreads; //omp_get_max_threads();
02194     const int numForceLoopIters = flI_end - flI_start;
02195     int numForceLoopSplits = 1;
02196     if ((numForceLoopIters > 0) && ((2 * numThreads) > numForceLoopIters)) {
02197       numForceLoopSplits = (2 * numThreads) / numForceLoopIters;  // NOTE: 2 * numThreads to break it up more (smaller chunks of work)
02198       if (numForceLoopSplits < 2) { numForceLoopSplits = 2; }  // At least split in half
02199       if (numForceLoopSplits > 16) { numForceLoopSplits = 16; }  // Don't split any single patch too fine (threading overhead costs)
02200     }
02201     DEVICE_FPRINTF(" F(%d,%d:%d)", flI_start, flI_end, numForceLoopSplits);
02202     flI_start *= numForceLoopSplits;
02203     flI_end *= numForceLoopSplits;
02204 
02205     // TRACING - Record the start time for the processing of all force lists (patches)
02206     #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
02207       device__device_times_start[((isRemote)?(6):(7))] = getCurrentTime();
02208     #endif
02209 
02210     // Process each of the force lists (patches)
02211     #pragma novector
02212     #if MULTIPLE_THREADS != 0
02213       #pragma omp parallel for schedule(dynamic)
02214     #endif
02215     for (int _flI = flI_start; _flI < flI_end; _flI++) {
02216 
02217       // From _flI, calculate the flI (patch index) and flIPart (thread per patch) values
02218       int flI = _flI / numForceLoopSplits;
02219       __FULL_CHECK(__ASSERT(flI >= 0 && flI < device__force_lists_size));
02220       int flIPart = _flI % numForceLoopSplits;
02221       
02222       // TRACING - Record the start of each force reduction
02223       // DMK - FIXME | NOTE : THIS ONLY RECORDS ONE TASK/ITERATION OF EACH SPLIT SET, SO
02224       //   PROJECTIONS WILL ONLY SHOW SOME OF THE FORCE "TASKS" !!! (the problem is
02225       //   numForceLoopSplits can differ between the "remote" and "local" kernels).
02226       #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
02227         // DMK - NOTE : Don't select the same part each time (e.g. only flIPart == 0)
02228         if (flIPart == flI % numForceLoopSplits) {
02229           device__device_times_patches[flI * 2] = getCurrentTime();
02230         }
02231       #endif
02232 
02233       // Get the output force list pointer and force array length
02234       const force_list &fl = device__force_lists[flI];
02235       const int f_len = fl.patch_stride * 4; // NOTE : number of individual doubles
02236       __ASSUME(f_len % 16 == 0);
02237 
02238       // Calculate this "task's" portion of the patch object's work (flIPartILo -> flIPartIHi)
02239       int flIPartJLo = (int)(((float)(f_len)) * (((float)(flIPart    )) / ((float)(numForceLoopSplits))));
02240       int flIPartJHi = (int)(((float)(f_len)) * (((float)(flIPart + 1)) / ((float)(numForceLoopSplits))));
02241       // NOTE: Force flIPartJLo and flIPartJHi to cacheline boundaries to avoid false sharing
02242       flIPartJLo = (flIPartJLo + 7) & (~7);
02243       flIPartJHi = (flIPartJHi + 7) & (~7);
02244       if (flIPartJHi > f_len) { flIPartJHi = f_len; }
02245       __ASSUME(flIPartJLo % 8 == 0);
02246       __ASSUME(flIPartJHi % 8 == 0);  // NOTE: true after capping at f_len since f_len % 16 == 0
02247 
02248       // Sum the force output for each compute contributing to this force list (patch)
02249       {
02250         // Setup the pointer to the final force array that will be passed back up to the host
02251         __FULL_CHECK(__ASSERT(device__forces != NULL));
02252         __FULL_CHECK(__ASSERT(fl.force_output_start >= 0));
02253         __FULL_CHECK(__ASSERT((fl.force_output_start < device__atoms_size) || (fl.force_output_start == device__atoms_size && f_len == 0)));
02254         double * RESTRICT fDst = (double*)(device__forces + fl.force_output_start);
02255         __FULL_CHECK(__ASSERT(fDst != NULL));
02256         __ASSUME_ALIGNED(fDst);
02257 
02258         // Setup the pointer to the list of arrays, each with output from one of
02259         //   the compute objects that contributed to this patch's force data
02260         __FULL_CHECK(__ASSERT(device__force_buffers != NULL));
02261         __FULL_CHECK(__ASSERT(fl.force_list_start >= 0));
02262         __FULL_CHECK(__ASSERT((fl.force_list_start < device__force_buffers_req_size) || (fl.force_list_start == device__force_buffers_req_size && f_len == 0)));
02263         double *fSrcBase = (double*)(device__force_buffers + fl.force_list_start);
02264         __FULL_CHECK(__ASSERT(fSrcBase != NULL));
02265         __ASSUME_ALIGNED(fSrcBase);
02266 
02267         // Accumulate the forces from the various computes contributing to this patch
02268         memset(fDst + flIPartJLo, 0, sizeof(double) * (flIPartJHi - flIPartJLo));
02269         for (int i = 0; i < fl.force_list_size; i++) {
02270           #pragma simd
02271           for (int j = flIPartJLo; j < flIPartJHi; j++) {
02272             fDst[j] += fSrcBase[i * f_len + j];
02273           }
02274         }
02275       }
02276 
02277       // Sum the output for each contributing compute
02278       if (kernel_data->doSlow) {
02279 
02280         // Setup the pointer to the final force array that will be passed back up to the host
02281         __FULL_CHECK(__ASSERT(device__slow_forces != NULL));
02282         double * RESTRICT fDst = (double*)(device__slow_forces + fl.force_output_start);
02283         __FULL_CHECK(__ASSERT(fDst != NULL));
02284         __ASSUME_ALIGNED(fDst);
02285 
02286         // Setup the pointer to the list of arrays, each with output from one of
02287         //   the compute objects that contributed to this patch's force data
02288         __FULL_CHECK(__ASSERT(device__slow_force_buffers != NULL));
02289         double *fSrcBase = (double*)(device__slow_force_buffers + fl.force_list_start);
02290         __FULL_CHECK(__ASSERT(fSrcBase != NULL));
02291         __ASSUME_ALIGNED(fSrcBase);
02292 
02293         // Accumulate the forces from the various computes contributing to this patch
02294         memset(fDst + flIPartJLo, 0, sizeof(double) * (flIPartJHi - flIPartJLo));
02295         for (int i = 0; i < fl.force_list_size; i++) {
02296           #pragma simd
02297           for (int j = flIPartJLo; j < flIPartJHi; j++) {
02298             fDst[j] += fSrcBase[i * f_len + j];
02299           }
02300         }
02301       }
02302      
02303       // TRACING - Record the end time for each force list (patch)
02304       #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0) && (MIC_DEVICE_TRACING_DETAILED != 0)
02305         device__device_times_patches[flI * 2 + 1] = getCurrentTime();
02306       #endif
02307 
02308     } // end parallel for (flI < device__force_lists_size)
02309 
02310     // TRACING - Record the end time for all the force lists (patches)
02311     #if (MIC_TRACING != 0 && MIC_DEVICE_TRACING != 0)
02312       device__device_times_start[((isRemote)?(8):(9))] = getCurrentTime();
02313     #endif
02314 
02315     // DMK - DEBUG
02316     #if MIC_DATA_STRUCT_VERIFY != 0
02317       _verify_data_structures(device__pe, device__timestep, isRemote, 2, ppI_start, ppI_end, kernel_data->doSlow);
02318       _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__patch_pairs, (char*)device__patch_pairs_copy, sizeof(patch_pairs) * device__patch_pairs_size, "device__patch_pairs_copy");
02319       _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__force_lists, (char*)device__force_lists_copy, sizeof(force_lists) * device__force_lists_size, "device__force_lists_copy");
02320       _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__atoms, (char*)device__atoms_copy, sizeof(atom) * device__atoms_size, "device__atoms_copy");
02321       _verify_buffers_match(device__pe, device__timestep, isRemote, 2, (char*)device__atom_params, (char*)device__atom_params_copy, sizeof(atom_param) * device__atoms_size, "device__atom_params_copy");
02322     #endif
02323 
02324     // DMK - DEBUG
02325     #if MIC_TRACK_DEVICE_MEM_USAGE != 0
02326       if (device__timestep % 100 == 0 && isRemote == 0) {
02327 
02328         uint64_t plTotalMem = sizeof(void*) * device__pairlists_alloc_size;
02329         uint64_t plUsedMem = plTotalMem;
02330         uint64_t plPeakUsedMem = plTotalMem;
02331         if (device__pairlists != NULL) {
02332           for (int k = 0; k < device__pairlists_alloc_size; k++) {
02333             int * plPtr = ((int**)device__pairlists)[k];
02334             if (plPtr != NULL) {
02335               plTotalMem += plPtr[0] * sizeof(int) + 64;
02336               plUsedMem += plPtr[1] * sizeof(int);
02337               plPeakUsedMem += plPtr[-1] * sizeof(int);
02338             }
02339           }
02340         }
02341 
02342         uint64_t fbsTotalMem = device__force_buffers_alloc_size * 4 * sizeof(double);
02343         MemInfo mi;
02344         readMemInfo(&mi);
02345 
02346         printf("[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
02347                "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
02348                "vS: %.3lf, vP: %.3lf - "
02349                "plTM: %.3lf, plUM: %.3lf, plUPM: %.3lf, fbsTM(x2): %.3lf\n",
02350                ((int)(device__pe)),
02351                ((int)(device__timestep)),
02352                ((double)(mi.memFree)) * 1.0e-6,
02353                ((double)(mi.memTotal)) * 1.0e-6,
02354                ((double)(mi.cached)) * 1.0e-6,
02355                ((double)(mi.active)) * 1.0e-6,
02356                ((double)(mi.inactive)) * 1.0e-6,
02357                ((double)(mi.vmSize)) * 1.0e-6,
02358                ((double)(mi.vmPeak)) * 1.0e-6,
02359                ((double)(plTotalMem)) * 1.0e-6,
02360                ((double)(plUsedMem)) * 1.0e-6,
02361                ((double)(plPeakUsedMem)) * 1.0e-6,
02362                ((double)(fbsTotalMem)) * 1.0e-6
02363               ); fflush(NULL);
02364       }
02365     #endif
02366 
02367     // DMK - DEBUG - Increment the timestep counter on the device
02368     if (kernel_data->isRemote == 0) { device__timestep++; }
02369 
02370     DEVICE_FPRINTF(" |\n");
02371 
02372   } // end pragma offload
02373 
02374   #if MIC_TRACING != 0
02375     double pragma_end = CmiWallTimer();
02376     MIC_TRACING_RECORD(MIC_EVENT_OFFLOAD_PRAGMA, pragma_start, pragma_end);
02377   #endif
02378 
02379   #if MIC_SYNC_INPUT != 0
02380     #undef TMP_SYNC_ATOMS_CLAUSE
02381   #endif
02382   #undef TMP_SYNC_KERNEL_DATA_CLAUSE
02383   #undef MIC_DEVICE_SYNC_INPUT_CLAUSE
02384 
02385   #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)  
02386     #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
02387   #endif
02388   #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
02389 
02390   #undef MIC_DEVICE_DATA_STRUCT_VERIFY_CLAUSE
02391   #undef MIC_DEVICE_REFINE_PAIRLISTS_CLAUSE
02392 }
02393 
02394 void mic_transfer_output(const int deviceNum,
02395                          const int isRemote,
02396                          const int numLocalAtoms,
02397                          const int doSlow
02398                         ) {
02399 
02400   const int numAtoms = host__atoms_size;
02401 
02402   mic_kernel_data * kernel_data = host__kernel_data;
02403 
02404   int toCopyStart_forces = (singleKernelFlag != 0) ? (0) : ((isRemote) ? (numLocalAtoms) : (0));
02405   int toCopySize_forces = (singleKernelFlag != 0) ? (numAtoms) : ((isRemote) ? (numAtoms - numLocalAtoms) : (numLocalAtoms));
02406   int toCopyStart_slow_forces = (doSlow) ? (toCopyStart_forces) : (0);
02407   int toCopySize_slow_forces = (doSlow) ? (toCopySize_forces) : (0);
02408 
02409   double4 * forces = host__forces;
02410   double4 * slow_forces = host__slow_forces;
02411 
02412   #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
02413     const int numPatchPairs = host__patch_pairs_size;
02414     const int numForceLists = host__force_lists_size;
02415     double * device_times_start = host__device_times_start;
02416     int toCopySize_device_times_start = ((isRemote) ? (0) : (10));
02417     #if MIC_DEVICE_TRACING_DETAILED != 0
02418       double * device_times_computes = host__device_times_computes;
02419       double * device_times_patches = host__device_times_patches;
02420       int toCopySize_device_times_computes = ((isRemote) ? (0) : (2 * numPatchPairs));
02421       int toCopySize_device_times_patches = ((isRemote) ? (0) : (2 * numForceLists));
02422       #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
02423         out(device_times_computes[0:toCopySize_device_times_computes] : alloc_if(0) free_if(0)) \
02424         out(device_times_patches[0:toCopySize_device_times_patches] : alloc_if(0) free_if(0))
02425     #else
02426       #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
02427     #endif
02428     #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE \
02429       MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED \
02430       out(device_times_start[0:toCopySize_device_times_start] : alloc_if(0) free_if(0))
02431   #else
02432     #define MIC_DEVICE_TIMING_PRAGMA_CLAUSE
02433   #endif
02434 
02435   #pragma offload_transfer target(mic:deviceNum) \
02436     out(forces[toCopyStart_forces:toCopySize_forces] : alloc_if(0) free_if(0)) \
02437     out(slow_forces[toCopyStart_slow_forces:toCopySize_slow_forces] : alloc_if(0) free_if(0)) \
02438     out(kernel_data[isRemote:1] : alloc_if(0) free_if(0)) \
02439     MIC_DEVICE_TIMING_PRAGMA_CLAUSE
02440   { }
02441   //  out(device__timestep : into(host__timestep))
02442 
02443   #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
02444     #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE_DETAILED
02445   #endif
02446   #undef MIC_DEVICE_TIMING_PRAGMA_CLAUSE
02447 }
02448 
02449 int mic_check_remote_kernel_complete(const int deviceNum) {
02450   // DMK - NOTE : Disable these warnings for now since the device code is only called once,
02451   //   but the state variable goes from 0 -> 1 -> 2 -> 0, which checks at both 1 and 2
02452   //if (!tag_remote_kernel) { printf("WARNING :: mic_check_remote_kernel_complete :: called when kernel not active.\n"); }
02453   // if (_Offload_signaled(deviceNum, &tag_remote_kernel)) {
02454     tag_remote_kernel = 0;
02455     return 1;
02456   // }
02457   // return 0;
02458 }
02459 
02460 int mic_check_local_kernel_complete(const int deviceNum) {
02461   // DMK - NOTE : Disable these warnings for now since the device code is only called once,
02462   //   but the state variable goes from 0 -> 1 -> 2 -> 0, which checks at both 1 and 2
02463   //if (!tag_local_kernel) { printf("WARNING :: mic_check_local_kernel_complete :: called when kernel not active.\n"); }
02464   // if (_Offload_signaled(deviceNum, &tag_local_kernel)) {
02465     tag_local_kernel = 0;
02466     return 1;
02467   // }
02468   // return 0;
02469 }
02470 
02471 
02472 void mic_free_device(const int deviceNum) {
02473 
02474   mic_kernel_data * kernel_data = host__kernel_data;
02475 
02476   // Cleanup kernel data (for buffers allocated via offload pragmas, use "free_if(1)")
02477   #pragma offload target(mic:deviceNum) \
02478     nocopy(kernel_data : alloc_if(0) free_if(1)) \
02479     nocopy(device__table_four) nocopy(device__table_four_float) \
02480     nocopy(device__lj_table) nocopy(device__lj_table_float) \
02481     nocopy(device__exclusion_bits) nocopy(device__exclusion_bits_copy) \
02482     nocopy(device__constants) \
02483     nocopy(device__force_buffers) nocopy(device__slow_force_buffers) \
02484     nocopy(device__pairlists) nocopy(device__pairlists_alloc_size) \
02485     nocopy(device__pl_array) nocopy(device__pl_size) nocopy(device__r2_array) \
02486     nocopy(device__patch_pairs : alloc_if(0) free_if(1)) \
02487     nocopy(device__force_lists : alloc_if(0) free_if(1)) \
02488     nocopy(device__numOMPThreads) \
02489     DEVICE_FPRINTF_CLAUSE
02490   {
02491     // Cleanup pairlist memory
02492     if (device__pairlists != NULL) {
02493       int **pairlists = (int**)(device__pairlists);
02494       for (int i = 0; i < device__pairlists_alloc_size; i++) {
02495         if (pairlists[i] != NULL) { _MM_FREE_WRAPPER(pairlists[i]); }
02496       }
02497       _MM_FREE_WRAPPER(pairlists);
02498       device__pairlists = 0;
02499       device__pairlists_alloc_size = 0;
02500     }
02501 
02502     // Cleanup refined pairlist memory
02503     #if REFINE_PAIRLISTS != 0
02504       for (int i = 0; i < device__numOMPThreads; i++) {
02505         if (device__pl_array[i] != NULL) { _MM_FREE_WRAPPER(device__pl_array[i]); }
02506         if (device__r2_array[i] != NULL) { _MM_FREE_WRAPPER(device__r2_array[i]); }
02507       }
02508       _MM_FREE_WRAPPER(device__pl_array); device__pl_array = NULL;
02509       _MM_FREE_WRAPPER(device__r2_array); device__r2_array = NULL;
02510       _MM_FREE_WRAPPER(device__pl_size); device__pl_size = NULL;
02511     #endif
02512 
02513     // Clean up tables
02514     #if MIC_HANDCODE_FORCE_SINGLE != 0
02515       if (device__table_four != NULL) { _MM_FREE_WRAPPER(device__table_four); device__table_four = NULL; }
02516       if (device__table_four_float != NULL) { _MM_FREE_WRAPPER(device__table_four_float); device__table_four_float = NULL; }
02517       if (device__lj_table != NULL) { _MM_FREE_WRAPPER(device__lj_table); device__lj_table = NULL; }
02518       if (device__lj_table_float != NULL) { _MM_FREE_WRAPPER(device__lj_table_float); device__lj_table_float = NULL; }
02519     #endif
02520     if (device__exclusion_bits != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits); device__exclusion_bits = NULL; }
02521     if (device__exclusion_bits_copy != NULL) { _MM_FREE_WRAPPER(device__exclusion_bits_copy); device__exclusion_bits_copy = NULL; }
02522     if (device__constants != NULL) { _MM_FREE_WRAPPER(device__constants); device__constants = NULL; }
02523 
02524     // Clean up intermediate force buffers
02525     if (device__force_buffers != NULL) { _MM_FREE_WRAPPER(device__force_buffers); device__force_buffers = NULL; }
02526     if (device__slow_force_buffers != NULL) { _MM_FREE_WRAPPER(device__slow_force_buffers); device__slow_force_buffers = NULL; }
02527 
02528     #if MIC_DEVICE_FPRINTF != 0
02529       fclose(device__fout);
02530     #endif
02531   }
02532 
02533   if (host__kernel_data != NULL) { _MM_FREE_WRAPPER(host__kernel_data); host__kernel_data = NULL; }
02534 }
02535 
02536 
02537 // DMK - DEBUG
02538 #if MIC_TRACK_DEVICE_MEM_USAGE != 0
02539 
02540 __declspec(target(mic))
02541 void printMemInfo(int device__pe, int device__timestep, MemInfo * mi) {
02542   printf("[MIC-MEMINFO] :: PE %d :: MIC mem usage (MB) -- ts: %d - "
02543          "f: %.3lf, t: %.3lf, c: %.3lf, a: %.3lf, i: %.3lf - "
02544          "vS: %.3lf, vP: %.3lf\n",
02545          ((int)(device__pe)),
02546          ((int)(device__timestep)),
02547          ((double)(mi->memFree)) * 1.0e-6,
02548          ((double)(mi->memTotal)) * 1.0e-6,
02549          ((double)(mi->cached)) * 1.0e-6,
02550          ((double)(mi->active)) * 1.0e-6,
02551          ((double)(mi->inactive)) * 1.0e-6,
02552          ((double)(mi->vmSize)) * 1.0e-6,
02553          ((double)(mi->vmPeak)) * 1.0e-6
02554         ); fflush(NULL);
02555 }
02556 
02557 __declspec(target(mic))
02558 void readMemInfo_processLine(MemInfo * memInfo, char * n, char * v, char * u) {
02559   assert(n != NULL && v != NULL);
02560 
02561   size_t * loc = NULL;
02562   if (strcmp(n, "MemTotal:") == 0) { loc = &(memInfo->memTotal); }
02563   if (strcmp(n, "MemFree:") == 0) { loc = &(memInfo->memFree); }
02564   if (strcmp(n, "Cached:") == 0) { loc = &(memInfo->cached); }
02565   if (strcmp(n, "Active:") == 0) { loc = &(memInfo->active); }
02566   if (strcmp(n, "Inactive:") == 0) { loc = &(memInfo->inactive); }
02567   if (strcmp(n, "VmSize:") == 0) { loc = &(memInfo->vmSize); }
02568   if (strcmp(n, "VmPeak:") == 0) { loc = &(memInfo->vmPeak); }
02569   if (loc == NULL) { return; }
02570 
02571   *loc = (size_t)(atoi(v));
02572   if (u != NULL) {
02573     if (strcmp(u, "kB") == 0) { (*loc) *= 1024; }
02574   }
02575 }
02576 
02577 __declspec(target(mic))
02578 bool readMemInfo(MemInfo * memInfo) {
02579   if (memInfo == NULL) { return false; }
02580   memset(memInfo, 0, sizeof(MemInfo));
02581 
02582   FILE * procMem = fopen("/proc/meminfo", "r");
02583   if (procMem == NULL) {
02584     printf("[WARNING] :: Unable to read /proc/meminfo\n");
02585     return false;
02586   }
02587   char line[256], n[128], v[128], u[128];
02588   while (!feof(procMem)) {
02589     int numFilled = fscanf(procMem, "%[^\n]\n", line);
02590     if (numFilled == 0) { printf("empty line\n"); break; }
02591     numFilled = sscanf(line, "%s%s%s", n, v, u);
02592     if (numFilled == 3) {
02593       readMemInfo_processLine(memInfo, n, v, u);
02594     } else if (numFilled == 2) { // No unit or prev line had name only
02595       int vLen = strlen(v);
02596       if (v[vLen-1] == ':') { // Prev was name only (correct)
02597         memcpy(n, v, vLen + 1);
02598       }
02599       readMemInfo_processLine(memInfo, n, v, NULL);
02600     }
02601   }
02602   fclose(procMem);
02603 
02604   pid_t pid = getpid();
02605   char filename[256];
02606   sprintf(filename, "/proc/%d/status", (int)pid);
02607   procMem = fopen(filename, "r");
02608   if (procMem == NULL) {
02609     printf("[WARNING] :: Unable to read %s\n", filename);
02610     return false;
02611   }
02612   while (!feof(procMem)) {
02613     int numFilled = fscanf(procMem, "%[^\n]\n", line);
02614     if (numFilled == 0) { printf("empty line\n"); break; }
02615     numFilled = sscanf(line, "%s%s%s", n, v, u);
02616     if (numFilled == 3) {
02617       readMemInfo_processLine(memInfo, n, v, u);
02618     } else if (numFilled == 2) { // No unit
02619       readMemInfo_processLine(memInfo, n, v, NULL);
02620     }
02621   }
02622   fclose(procMem);
02623 
02624   return true;
02625 }
02626 
02627 #endif  // MIC_TRACK_DEVICE_MEM_USAGE != 0
02628 
02629 
02630 __declspec(target(mic))
02631 void* _mm_malloc_withPrint(size_t s, int a, char * l) {
02632   void * ptr = _mm_malloc(s, a);
02633   printf("[MIC-MALLOC] :: _mm_malloc - l: \"%s\", s: %lu (%lu), ptr: %p\n", l, s, sizeof(s), ptr);
02634   return ptr;
02635 }
02636 
02637 __declspec(target(mic))
02638 void _mm_free_withPrint(void * ptr) {
02639   printf("[MIC-MALLOC] :: _mm_free - ptr: %p\n", ptr);
02640   _mm_free(ptr);
02641 }
02642 
02643 
02644 #else  // NAMD_MIC
02645 
02646 #include "ComputeNonbondedMICKernel.h"
02647 #include "ComputeNonbondedMICKernelBase.h"
02648 
02649 #endif  // NAMD_MIC

Generated on Thu Sep 21 01:17:11 2017 for NAMD by  doxygen 1.4.7