29 #define __thread __declspec(thread)
// Per-thread host-side pointer to the kernel launch/result data shared with
// the offload code: index [0] is used for local work and [1] for remote work
// (see the host__local_kernel_data / host__remote_kernel_data aliases below).
35 extern __thread mic_kernel_data * host__kernel_data;
// Nonzero when local and remote work are merged into a single offload kernel
// (derived from the +micSK / mic_singleKernel command-line flag).
36 extern __thread int singleKernelFlag;
39 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
40 __thread
double mic_first_kernel_submit_time;
41 extern __thread patch_pair* host__patch_pairs;
42 extern __thread
int host__patch_pairs_size;
43 extern __thread
int host__force_lists_size;
44 #if MIC_DEVICE_TRACING_DETAILED != 0
45 extern __thread
double* host__device_times_computes;
46 extern __thread
double* host__device_times_patches;
48 extern __thread
double* host__device_times_start;
// Per-thread command-line tunables for the MIC offload path.  Each is set
// from a "+mic*" flag during argument parsing (see the CmiGetArgInt calls
// below); -1 means "not specified on the command line".
52 __thread int mic_singleKernel = -1;      // +micSK: >0 forces single combined kernel
53 __thread int mic_deviceThreshold = -1;   // +micDT: device work threshold
54 __thread int mic_hostSplit = -1;         // +micHS: host/device split factor
55 __thread int mic_numParts_self_p1 = -1;  // +micSP1: parts per self compute
56 __thread int mic_numParts_pair_p1 = -1;  // +micPP1: base parts per pair compute
57 __thread int mic_numParts_pair_p2 = -1;  // +micPP2: scales pair parts down with patch distance
// Number of usable MIC devices detected on this node.  Zero until device
// discovery has probed each device (mic_device_count = ndevices below).
int mic_device_count = 0;

// Accessor for mic_device_count, for use by code outside this file.
int mic_get_device_count() { return mic_device_count; }
65 void mic_errcheck(
const char *msg) {
70 void mic_die(
const char *msg) {
73 sprintf(host,
"physical node %d", CmiPhysicalNodeID(CkMyPe()));
75 gethostname(host, 128); host[127] = 0;
77 char devstr[128] =
"";
79 if (devnum == CkMyPe()) {
80 sprintf(devstr,
" device %d", devnum);
83 sprintf(errmsg,
"MIC error on Pe %d (%s%s): %s", CkMyPe(), host, devstr, msg);
// Nonzero when the user passed an explicit "+devices" list on the command
// line (return value of the CmiGetArgStringDesc call below).
91 static __thread int usedevicelist;
94 usedevicelist = CmiGetArgStringDesc(argv,
"+devices", &devicelist,
95 "comma-delimited list of MIC device numbers such as 0,2,1,2");
97 CmiGetArgInt(argv,
"+micSK", &mic_singleKernel);
98 CmiGetArgInt(argv,
"+micDT", &mic_deviceThreshold);
99 CmiGetArgInt(argv,
"+micHS", &mic_hostSplit);
100 CmiGetArgInt(argv,
"+micSP1", &mic_numParts_self_p1);
101 CmiGetArgInt(argv,
"+micPP1", &mic_numParts_pair_p1);
102 CmiGetArgInt(argv,
"+micPP2", &mic_numParts_pair_p2);
// Per-thread record of how this PE maps onto the node's MIC devices,
// filled in during device assignment in mic_initialize below.
107 static __thread int shared_mic;            // nonzero if another PE rank uses the same device
108 static __thread int first_pe_sharing_mic;  // first PE in the device-sharing ring (defaults to CkMyPe())
109 static __thread int next_pe_sharing_mic;   // next PE to yield the device to
110 static __thread int devicePe;              // PE responsible for driving the shared device
111 static __thread int numPesSharingDevice;   // number of valid entries in pesSharingDevice[]
112 static __thread int *pesSharingDevice;     // PEs on this physical node sharing our device
114 static __thread int mic_is_mine;           // true when this PE is first in the sharing ring
115 static __thread int myDevice;              // MIC device number bound to this PE
120 for (
int i=0; i<numPesSharingDevice; ++i ) {
121 if ( pesSharingDevice[i] == pe )
return true;
127 if ( a == b )
return 0;
128 for (
int bit = 1; bit; bit *= 2 ) {
129 if ( (a&bit) != (b&bit) )
return ((a&bit) < (b&bit));
138 if ( 0 == CkMyPe() ) {
139 MIC_TRACING_REGISTER_EVENTS
145 sprintf(host,
"physical node %d", CmiPhysicalNodeID(CkMyPe()));
147 gethostname(host, 128); host[127] = 0;
150 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
151 int myRankInPhysicalNode;
152 int numPesOnPhysicalNode;
153 int *pesOnPhysicalNode;
154 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
156 &numPesOnPhysicalNode
161 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
162 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
163 i = numPesOnPhysicalNode;
166 if ( pesOnPhysicalNode[i] == CkMyPe() )
break;
168 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
169 CkPrintf(
"Bad result from CmiGetPesOnPhysicalNode!\n");
170 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
171 CkPrintf(
"pe %d physnode rank %d of %d is %d\n", CkMyPe(),
172 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
174 myRankInPhysicalNode = 0;
175 numPesOnPhysicalNode = 1;
176 pesOnPhysicalNode =
new int[1];
177 pesOnPhysicalNode[0] = CkMyPe();
179 myRankInPhysicalNode = i;
184 int deviceCount = _Offload_number_of_devices();
185 if ( deviceCount < 1 ) {
186 mic_die(
"No MIC devices found.");
192 if ( usedevicelist ) {
193 devices =
new int[strlen(devicelist)];
195 while ( devicelist[i] ) {
196 ndevices += sscanf(devicelist+i,
"%d",devices+ndevices);
197 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
198 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
202 CkPrintf(
"Did not find +devices i,j,k,... argument, using all\n");
204 devices =
new int[deviceCount];
206 for (
int i=0; i<deviceCount; ++i ) {
207 int dev = i % deviceCount;
210 #pragma offload target(mic: dev) in(dev)
211 dev_ok = mic_check_local();
213 int dev_ok = mic_check(dev);
215 if ( dev_ok ) devices[ndevices++] = dev;
216 else CkPrintf(
"Offload to device %d on Pe %d failed.\n",dev,CkMyPe());
220 mic_device_count = ndevices;
223 mic_die(
"No usable MIC devices found.");
228 first_pe_sharing_mic = CkMyPe();
229 next_pe_sharing_mic = CkMyPe();
232 if ( numPesOnPhysicalNode > 1 ) {
233 int myDeviceRank = myRankInPhysicalNode * ndevices / numPesOnPhysicalNode;
234 dev = devices[myDeviceRank];
236 pesSharingDevice =
new int[numPesOnPhysicalNode];
238 numPesSharingDevice = 0;
239 for (
int i = 0; i < numPesOnPhysicalNode; ++i ) {
240 if ( i * ndevices / numPesOnPhysicalNode == myDeviceRank ) {
241 int thisPe = pesOnPhysicalNode[i];
242 pesSharingDevice[numPesSharingDevice++] = thisPe;
243 if ( devicePe < 1 ) devicePe = thisPe;
247 for (
int j = 0; j < ndevices; ++j ) {
248 if ( devices[j] == dev && j != myDeviceRank ) shared_mic = 1;
250 if ( shared_mic && devicePe == CkMyPe() ) {
251 if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
252 CkPrintf(
"Pe %d sharing MIC device %d\n", CkMyPe(), dev);
255 dev = devices[CkMyPe() % ndevices];
257 pesSharingDevice =
new int[1];
258 pesSharingDevice[0] = CkMyPe();
259 numPesSharingDevice = 1;
262 if ( devicePe != CkMyPe() ) {
263 if ( CmiPhysicalNodeID(devicePe) < 2 )
264 CkPrintf(
"Pe %d physical rank %d will use MIC device of pe %d\n",
265 CkMyPe(), myRankInPhysicalNode, devicePe);
272 first_pe_sharing_mic = CkMyPe();
273 next_pe_sharing_mic = CkMyPe();
275 mic_is_mine = ( first_pe_sharing_mic == CkMyPe() );
277 if ( dev >= deviceCount ) {
279 sprintf(buf,
"Pe %d unable to bind to MIC device %d on %s because only %d devices are present",
280 CkMyPe(), dev, host, deviceCount);
284 if ( CmiPhysicalNodeID(devicePe) < 2 )
285 CkPrintf(
"Pe %d physical rank %d binding to MIC device %d on %s: %d procs %d threads \n",
286 CkMyPe(), myRankInPhysicalNode, dev, host,
287 omp_get_num_procs_target(TARGET_MIC,dev),
288 omp_get_max_threads_target(TARGET_MIC,dev));
293 mic_init_device(CkMyPe(), CkMyNode(), myDevice);
295 int dev_ok = mic_check(dev);
296 if ( dev_ok ) devices[ndevices++] = dev;
299 sprintf(errmsg,
"Failed binding to device %d on Pe %d", dev, CkMyPe());
305 if (devicePe != CkMyPe()) {
return; }
306 mic_free_device(myDevice);
312 void send_build_mic_force_table() {
316 void build_mic_force_table() {
317 if (devicePe != CkMyPe()) {
return; }
324 if (devicePe != CkMyPe()) {
return; }
328 mic_bind_lj_table(deviceNum, (
char*)lj_table_base_ptr, lj_table_dim, lj_table_size);
332 if (devicePe != CkMyPe()) {
return; }
333 mic_bind_table_four(deviceNum, mic_table_base_ptr, mic_table_n, mic_table_n_16);
337 if (devicePe != CkMyPe()) {
return; }
338 mic_bind_constants(deviceNum,
351 return ( li[1] < lj[1] );
360 if (devicePe != CkMyPe()) {
return; }
391 #ifdef MEM_OPT_VERSION
392 int natoms = mol->exclSigPoolSize;
401 exclusionsByAtom =
new int2[natoms];
412 for (
int i = 0; i < natoms; i++) {
436 for (
int j = 0; j < n; j++) {
438 if (index < min_index) { min_index = index; }
439 if (index > max_index) { max_index = index; }
440 curExclList.
add(index);
443 for (
int j = 0; j < n; j++) {
445 if (index < min_index) { min_index = index; }
446 if (index > max_index) { max_index = index; }
447 curModList.
add(index);
451 n = full_excl[0] + 1;
452 for (
int j = 1; j < n; j++) {
453 int index = full_excl[j] - i;
454 if (index < min_index) { min_index = index; }
455 if (index > max_index) { max_index = index; }
456 curExclList.
add(index);
460 for (
int j = 1; j < n; j++) {
461 int index = mod_excl[j] - i;
462 if (index < min_index) { min_index = index; }
463 if (index > max_index) { max_index = index; }
464 curModList.
add(index);
467 int maxDiff = -1 * min_index;
468 if (maxDiff < max_index) { maxDiff = max_index; }
470 curExclList.
sort(); curExclList.
add(-1 * maxDiff - 1);
471 curModList.
sort(); curModList.
add(-1 * maxDiff - 1);
480 int curExclListIndex = 0;
481 int curModListIndex = 0;
484 for (
int j = -1 * maxDiff; j <= maxDiff; j++) {
487 int bitPattern = 0x00;
495 if (curExclList[curExclListIndex] == j) {
501 if (curModList[curModListIndex] == j) {
507 curList[j + maxDiff] = bitPattern;
518 for (j = 0; j < unique_lists.
size(); j++) {
521 if (n != unique_lists[j][0]) {
continue; }
528 if (unique_lists[j][k+3] != curList[k]) {
break; }
530 if (k == n) {
break; }
535 if (j == unique_lists.
size()) {
544 for (
int k = 0; k < n; k++) { list[k + 3] = curList[k]; }
547 unique_lists.
add(list);
551 listsByAtom[i] = unique_lists[j];
567 long int totalBits = 0;
568 int nlists = unique_lists.
size();
570 for (
int j = 0; j < nlists; j++) {
571 int32 *list = unique_lists[j];
572 int maxDiff = list[1];
573 list[2] = totalBits + (2 * maxDiff);
574 totalBits += 2 * (2 * maxDiff + 1);
578 for (
int i = 0; i < natoms; i++) {
579 exclusionsByAtom[i].x = (listsByAtom[i])[1];
580 exclusionsByAtom[i].y = (listsByAtom[i])[2];
584 delete [] listsByAtom; listsByAtom = NULL;
587 const long int uintBitCnt =
sizeof(
unsigned int) * 8;
588 if (totalBits & (uintBitCnt - 1)) {
589 totalBits += (uintBitCnt - (totalBits & (uintBitCnt - 1)));
591 long int totalBytes = totalBits / 8;
594 if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
595 CkPrintf(
"Info: Found %d unique exclusion lists needed %ld bytes\n",
596 unique_lists.
size(), totalBytes);
601 long int totalUInts = totalBits / (
sizeof(
unsigned int) * 8);
602 unsigned int *exclusion_bits =
new unsigned int[totalUInts];
603 memset(exclusion_bits, 0, totalBytes);
608 for (
int i = 0; i < unique_lists.
size(); i++) {
611 int32 *list = unique_lists[i];
615 if (offset + (2 * list[1]) != list[2]) {
616 NAMD_bug(
"ComputeNonbondedMIC::bind_exclusions offset mismatch");
620 const int n = list[0];
621 for (
int j = 0; j < n; j++) {
622 const int offset_j = offset + (2 * j);
623 const int offset_major = offset_j / (
sizeof(
unsigned int) * 8);
624 const int offset_minor = offset_j % (
sizeof(
unsigned int) * 8);
625 const int entry_mask = (list[j + 3] & 0x3) << offset_minor;
626 exclusion_bits[offset_major] |= entry_mask;
630 offset += 2 * (2 * list[1] + 1);
636 mic_bind_exclusions(deviceNum, exclusion_bits, totalUInts);
641 delete [] exclusion_bits;
647 void register_mic_compute_self(
ComputeID c,
PatchID pid,
int part,
int numParts) {
649 if ( ! micCompute )
NAMD_bug(
"register_self called early");
652 MICP(
"register_mic_compute_self(c:%d, pid:%d, part:%d, numParts:%d) - Called...\n", c, pid, part, numParts); MICPF;
656 micCompute->requirePatch(pid);
660 numParts = mic_numParts_self_p1;
661 if (numParts < 1) { numParts = 1; }
662 for (
int part = 0; part < numParts; part++) {
668 cr.
pid[0] = pid; cr.
pid[1] = pid;
675 if (singleKernelFlag != 0 || micCompute->patchRecords[pid].isLocal) {
676 micCompute->localComputeRecords.add(cr);
678 micCompute->remoteComputeRecords.add(cr);
684 void register_mic_compute_pair(
ComputeID c,
PatchID pid[],
int t[],
int part,
int numParts) {
686 if ( ! micCompute )
NAMD_bug(
"register_pair called early");
689 MICP(
"register_mic_compute_pair(c:%d, pid:{%d,%d}, t:--, part:%d, numParts:%d) - Called...\n", c, pid[0], pid[1], part, numParts); MICPF;
693 micCompute->requirePatch(pid[0]);
694 micCompute->requirePatch(pid[1]);
700 Vector offset = micCompute->patchMap->center(pid[0])
701 - micCompute->patchMap->center(pid[1]);
702 offset.
x += (t1%3-1) - (t2%3-1);
703 offset.
y += ((t1/3)%3-1) - ((t2/3)%3-1);
704 offset.
z += (t1/9-1) - (t2/9-1);
715 int trans0 = computeMap->
trans(c, 0);
716 int trans1 = computeMap->
trans(c, 1);
723 int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
724 int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
725 int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
726 int manDist = da + db + dc;
729 numParts = mic_numParts_pair_p1 - (mic_numParts_pair_p2 * manDist);
730 if (numParts < 1) { numParts = 1; }
731 for (
int part = 0; part < numParts; part++) {
737 cr.
pid[0] = pid[0]; cr.
pid[1] = pid[1];
746 if ((singleKernelFlag != 0) ||
747 (micCompute->patchRecords[pid[0]].isLocal && micCompute->patchRecords[pid[1]].isLocal)
749 micCompute->localComputeRecords.add(cr);
751 micCompute->remoteComputeRecords.add(cr);
758 void unregister_mic_compute(
ComputeID c) {
759 NAMD_bug(
"unregister_compute unimplemented");
764 void mic_heartbeat(
void *arg,
double t) {
766 MICP(
"[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); MICPF;
768 printf(
"[DEBUG:%d] :: heartbeat (t:%lf)...\n", CkMyPe(), t); fflush(NULL);
770 CcdCallFnAfter(mic_heartbeat, NULL, 1000.0);
778 ) :
Compute(c), slaveIndex(idx) {
781 CkPrintf(
"C.N.MIC[%d]::constructor cid=%d\n", CkMyPe(), c);
785 #if MIC_HEARTBEAT != 0
786 mic_heartbeat(NULL, CmiWallTimer());
790 MICP(
"ComputeNonbondedMIC::ComputeNonbondedMIC(cid:%d) - Called...\n", c); MICPF;
792 master = m ? m :
this;
801 NAMD_die(
"pressure profile not supported in MIC");
803 if (mic_singleKernel >= 0) {
804 singleKernelFlag = ((mic_singleKernel != 0) ? (1) : (0));
813 pairlistTolerance = 0.;
823 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
825 exclusionsByAtom_ptr = NULL;
826 atomSubmitSignals =
new std::set<void*>(); __ASSERT(atomSubmitSignals != NULL);
832 iout <<
iINFO <<
"MIC SINGLE OFFLOAD: " << ((singleKernelFlag != 0) ? (
"SET") : (
"UNSET")) <<
"\n" <<
endi;
833 iout << iINFO <<
"MIC DEVICE THRESHOLD: " << mic_deviceThreshold <<
"\n" <<
endi;
834 iout << iINFO <<
"MIC HOST SPLIT: " << mic_hostSplit <<
"\n" <<
endi;
835 iout << iINFO <<
"MIC NUM PARTS SELF P1: " << mic_numParts_self_p1 <<
"\n" <<
endi;
836 iout << iINFO <<
"MIC NUM PARTS PAIR P1: " << mic_numParts_pair_p1 <<
"\n" <<
endi;
837 iout << iINFO <<
"MIC NUM PARTS PAIR P2: " << mic_numParts_pair_p2 <<
"\n" <<
endi;
840 if ( master !=
this ) {
841 masterPe = master->masterPe;
842 master->slaves[slaveIndex] =
this;
843 if ( master->slavePes[slaveIndex] != CkMyPe() ) {
844 NAMD_bug(
"ComputeNonbondedMIC slavePes[slaveIndex] != CkMyPe");
849 computeMgr->sendMICPEData(CkMyPe(), 1);
855 bind_exclusions(myDevice);
859 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
860 micDevice = myDevice; __ASSERT(micDevice >= 0);
861 exclusionsByAtom_ptr =
exclusionsByAtom; __ASSERT(exclusionsByAtom_ptr != NULL);
882 if ( pr.refCount == 0 ) {
886 pr.isLocal = ( CkNodeOf(patchMap->
node(pid)) == CkMyNode() );
889 if ( singleKernelFlag != 0 || pr.isLocal ) {
906 pr.dHdrPrefix = NULL;
917 const int myPE = CkMyPe();
919 for (
int i=0; i<npatches; ++i ) {
921 patch_record &pr = recs[pid];
922 if ( pr.hostPe == myPE ) {
926 if ( singleKernelFlag != 0 || pr.isLocal ) {
933 pr.p = patchMap->
patch(pid);
934 pr.
positionBox = pr.p->registerPositionPickup(
this);
936 pr.forceBox = pr.p->registerForceDeposit(
this);
938 pr.intRadBox = pr.p->registerIntRadPickup(
this);
939 pr.psiSumBox = pr.p->registerPsiSumDeposit(
this);
940 pr.bornRadBox = pr.p->registerBornRadPickup(
this);
941 pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(
this);
942 pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(
this);
949 if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
956 MICP(
"ComputeNonbondedMIC::assignPatches() - Called...\n"); MICPF;
958 int *pesOnNodeSharingDevice =
new int[CkMyNodeSize()];
959 int numPesOnNodeSharingDevice = 0;
960 int masterIndex = -1;
961 for (
int i=0; i<numPesSharingDevice; ++i ) {
962 int pe = pesSharingDevice[i];
963 if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
964 if ( CkNodeOf(pe) == CkMyNode() ) {
965 pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
975 int *count =
new int[npatches];
976 memset(count, 0,
sizeof(
int)*npatches);
977 int *pcount =
new int[numPesOnNodeSharingDevice];
978 memset(pcount, 0,
sizeof(
int)*numPesOnNodeSharingDevice);
979 int *rankpcount =
new int[CkMyNodeSize()];
980 memset(rankpcount, 0,
sizeof(
int)*CkMyNodeSize());
981 char *table =
new char[npatches*numPesOnNodeSharingDevice];
982 memset(table, 0, npatches*numPesOnNodeSharingDevice);
984 int unassignedpatches = npatches;
987 for (
int i=0; i<npatches; ++i ) {
990 pr.hostPe = CkMyPe();
992 unassignedpatches = 0;
993 pcount[masterIndex] = npatches;
997 for (
int i=0; i<npatches; ++i ) {
1000 int homePe = patchMap->
node(pid);
1002 for (
int j=0; j<numPesOnNodeSharingDevice; ++j ) {
1003 int pe = pesOnNodeSharingDevice[j];
1004 if ( pe == homePe ) {
1005 pr.hostPe = pe; --unassignedpatches;
1009 table[i*numPesOnNodeSharingDevice+j] = 1;
1012 if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
1013 pr.hostPe = homePe; --unassignedpatches;
1014 rankpcount[CkRankOf(homePe)] += 1;
1020 for (
int i=0; i<npatches; ++i ) {
1023 if ( pr.hostPe != -1 )
continue;
1026 for (
int j=0; j<numPesOnNodeSharingDevice; ++j ) {
1027 if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
1031 pr.hostPe = pesOnNodeSharingDevice[lastj];
1032 --unassignedpatches;
1036 while ( unassignedpatches ) {
1038 for ( i=0; i<npatches; ++i ) {
1039 if ( ! table[i*numPesOnNodeSharingDevice+assignj] )
continue;
1042 if ( pr.hostPe != -1 )
continue;
1043 pr.hostPe = pesOnNodeSharingDevice[assignj];
1044 --unassignedpatches;
1045 pcount[assignj] += 1;
1046 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
1049 if ( i<npatches )
continue;
1050 for ( i=0; i<npatches; ++i ) {
1053 if ( pr.hostPe != -1 )
continue;
1054 if ( count[i] )
continue;
1055 pr.hostPe = pesOnNodeSharingDevice[assignj];
1056 --unassignedpatches;
1057 pcount[assignj] += 1;
1058 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
1061 if ( i<npatches )
continue;
1062 if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
1065 for (
int i=0; i<npatches; ++i ) {
1071 slavePes =
new int[CkMyNodeSize()];
1074 for (
int j=0; j<numPesOnNodeSharingDevice; ++j ) {
1075 int pe = pesOnNodeSharingDevice[j];
1076 int rank = pe - CkNodeFirst(CkMyNode());
1079 if ( pe == CkMyPe() )
continue;
1080 if ( ! pcount[j] && ! rankpcount[rank] )
continue;
1081 rankpcount[rank] = 0;
1083 computeMgr->sendCreateNonbondedMICSlave(pe,
numSlaves);
1086 for (
int j=0; j<CkMyNodeSize(); ++j ) {
1087 int pe = CkNodeFirst(CkMyNode()) + j;
1090 if ( ! rankpcount[j] )
continue;
1091 if ( pe == CkMyPe() )
continue;
1093 computeMgr->sendCreateNonbondedMICSlave(pe,
numSlaves);
1098 delete [] pesOnNodeSharingDevice;
1101 delete [] rankpcount;
// Capacity (in atom records) of the host-side atoms/forces staging buffers;
// zero until the first allocation (see the _MM_MALLOC_WRAPPER calls below).
1105 static __thread int num_atom_records_allocated;
// Host-side timing statistics for the offload kernels (accumulation sites
// are not visible in this chunk — confirm against the full file).
1116 static __thread int mic_timer_count;
1117 static __thread double mic_timer_total;
1123 #if MIC_TRACING != 0
// Per-thread timestamps (CmiWallTimer) bracketing the remote and local
// offload phases, recorded for tracing (MIC_TRACING builds only).
1124 __thread double mic_tracing_offload_start_remote = 0.0;
1125 __thread double mic_tracing_offload_start_local = 0.0;
1126 __thread double mic_tracing_offload_end_remote = 0.0;
1127 __thread double mic_tracing_offload_end_local = 0.0;
// Start time and iteration count for tracing the completion-polling loops.
1128 static __thread double mic_tracing_polling_start = 0.0;
1129 static __thread int mic_tracing_polling_count = 0;
// Emit a polling trace event once per this many completion polls.
1130 #define MIC_TRACING_POLLING_SET_FREQ ( 100 )
// Re-arm FN(ARG) via the Converse conditional-callback layer after a short
// delay (0.1); used by the mic_check_*_progress polling functions below.
1133 #define MIC_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)
// Debug print for the GBIS code path.
1136 #define GBISP(...) CkPrintf(__VA_ARGS__);
// Upper bound on completion polls (see the poll-count warning messages below).
1142 #define count_limit 5000000 // DMK - NOTE : I have this set fairly high so that I can test executions
1148 void mic_check_remote_progress(
void *arg,
double) {
1151 if (mic_check_remote_kernel_complete(myDevice)) {
1154 #if MIC_SYNC_OUTPUT != 0
1155 #if MIC_TRACING != 0
1156 double xfer_start = CmiWallTimer();
1158 mic_transfer_output(myDevice,
1163 #if MIC_TRACING != 0
1164 double xfer_end = CmiWallTimer();
1165 MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_REMOTE_PRAGMA, xfer_start, xfer_end);
1175 check_remote_count = 0;
1176 mic_errcheck(
"at mic remote stream completed");
1180 MICP(
"[DEBUG:%d] :: << detected remote kernel completion >>\n", CkMyPe()); MICPF;
1186 "mic_check_remote_progress polled %d times over %f s on step %d",
1187 check_remote_count, CkWallTimer() - remote_submit_time,
1204 MIC_POLL(mic_check_remote_progress, arg);
1208 void mic_check_local_progress(
void *arg,
double) {
1211 if (mic_check_local_kernel_complete(myDevice)) {
1214 #if MIC_SYNC_OUTPUT != 0
1215 #if MIC_TRACING != 0
1216 double xfer_start = CmiWallTimer();
1218 mic_transfer_output(myDevice,
1223 #if MIC_TRACING != 0
1224 double xfer_end = CmiWallTimer();
1225 MIC_TRACING_RECORD(MIC_EVENT_SYNC_OUTPUT_LOCAL_PRAGMA, xfer_start, xfer_end);
1235 check_local_count = 0;
1236 mic_errcheck(
"at mic local stream completed");
1240 MICP(
"[DEBUG:%d] :: << detected local kernel completion >>\n", CkMyPe()); MICPF;
1246 "mic_check_local_progress polled %d times over %f s on step %d",
1247 check_local_count, CkWallTimer() - local_submit_time,
1254 }
else if ( check_remote_count ) {
1255 NAMD_bug(
"nonzero check_remote_count in mic_check_local_progres");
1259 MIC_POLL(mic_check_local_progress, arg);
// Constructor: stash the lattice in member l for use by the comparison
// operator (operator() is not visible in this chunk).
1269 cr_sortop(const Lattice &lattice) : l(lattice) { }
1282 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1308 #if MIC_TRACING != 0
1309 double atomSubmit_start = CmiWallTimer();
1316 CompAtom *a = pr.positionBox->open(); pr.
x = a;
1318 int2 *exclusionsByAtom =
master->exclusionsByAtom_ptr;
1320 int numAtoms_16 = ((numAtoms + 15) & (~15));
1324 int &allocSize = p->mic_atomData_allocSize_host;
1332 int micDevice =
master->micDevice;
1333 __ASSERT(micDevice < MIC_MAX_DEVICES_PER_NODE);
1334 int transferFlag = 0;
1338 if (p->mic_atomData_seq != seq || p->mic_atomData_deviceSeq[micDevice] != seq) {
1339 pthread_mutex_lock(&(p->mic_atomData_mutex));
1344 int tmp_mic_atomData_seq = p->mic_atomData_seq;
1345 int tmp_mic_atomData_deviceSeq = p->mic_atomData_deviceSeq[micDevice];
1346 p->mic_atomData_seq = seq;
1347 p->mic_atomData_deviceSeq[micDevice] = seq;
1350 if (tmp_mic_atomData_seq != seq) {
1354 if (numAtoms_16 > allocSize || atomData == NULL) {
1355 if (atomData != NULL) { _MM_FREE_WRAPPER(atomData); }
1356 int toAllocSize = (int)(numAtoms_16 * 1.1f);
1357 toAllocSize = ((toAllocSize + 15) & (~15));
1358 atomData = (
char*)(_MM_MALLOC_WRAPPER((
sizeof(atom) +
sizeof(atom_param)) * toAllocSize, MIC_ALIGN,
"atomData"));
1359 __ASSERT(atomData != NULL);
1360 allocSize = toAllocSize;
1367 atom* dev_a = (atom*)atomData;
1368 atom_param* dev_aExt = (atom_param*)(dev_a + numAtoms_16);
1369 memcpy(dev_a, ca,
sizeof(atom) * numAtoms);
1370 if (doneMigration || allocFlag) {
1371 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1372 for (
int k = 0; k < numAtoms; k++) {
1374 dev_aExt[k].vdw_type = a[j].
vdwType;
1375 dev_aExt[k].index = aExt[j].
id;
1376 #ifdef MEM_OPT_VERSION
1377 dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1378 dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1380 dev_aExt[k].excl_index = exclusionsByAtom[aExt[j].
id].y;
1381 dev_aExt[k].excl_maxdiff = exclusionsByAtom[aExt[j].
id].x;
1385 int *dev_aExt_vdwType = ((
int*)dev_aExt) + (0 * numAtoms_16);
1386 int *dev_aExt_index = ((
int*)dev_aExt) + (1 * numAtoms_16);
1387 int *dev_aExt_exclIndex = ((
int*)dev_aExt) + (2 * numAtoms_16);
1388 int *dev_aExt_exclMaxDiff = ((
int*)dev_aExt) + (3 * numAtoms_16);
1389 for (
int k = 0; k < numAtoms; k++) {
1391 dev_aExt_vdwType[k] = a[j].
vdwType;
1392 dev_aExt_index[k] = aExt[j].
id;
1393 #ifdef MEM_OPT_VERSION
1394 dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
1395 dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
1397 dev_aExt_exclIndex[k] = exclusionsByAtom[aExt[j].
id].y;
1398 dev_aExt_exclMaxDiff[k] = exclusionsByAtom[aExt[j].
id].x;
1409 if (tmp_mic_atomData_deviceSeq != seq) { transferFlag = 1; }
1411 pthread_mutex_unlock(&(p->mic_atomData_mutex));
1415 if (transferFlag != 0) {
1416 int allocBytes = allocSize * (
sizeof(atom) +
sizeof(atom_param));
1417 int transferBytes = numAtoms_16 *
sizeof(atom);
1418 if (doneMigration) { transferBytes += numAtoms_16 *
sizeof(atom_param); }
1419 void *signal = NULL;
1421 #if MIC_TRACING != 0
1422 double atomTransfer_start = CmiWallTimer();
1425 mic_submit_patch_data(micDevice,
1427 p->mic_atomData_prev[micDevice],
1430 p->mic_atomData_allocSize_device[micDevice],
1431 p->mic_atomData_devicePtr[micDevice],
1435 #if MIC_TRACING != 0
1436 double atomTransfer_finish = CmiWallTimer();
1437 MIC_TRACING_RECORD(MIC_EVENT_ATOMS_TRANSFER, atomTransfer_start, atomTransfer_finish);
1440 if (signal != NULL) { atomSubmitSignals->insert(signal); }
1443 #if MIC_TRACING != 0
1444 double atomSubmit_finish = CmiWallTimer();
1445 MIC_TRACING_RECORD(MIC_EVENT_ATOMS_SUBMIT, atomSubmit_start, atomSubmit_finish);
1453 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1459 pr.positionBox->skip();
1460 pr.forceBox->skip();
1479 doSlow = flags.doFullElectrostatics;
1483 if ( ! flags.doNonbonded ) {
1484 GBISP(
"GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1486 computeMgr->sendNonbondedMICSlaveReady(
masterPe,
1504 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1506 #if MIC_TRACING != 0
1507 double atomSubmit_wait_start = CmiWallTimer();
1510 int micDevice =
master->micDevice;
1512 std::set<void*>::iterator it;
1513 for (it = atomSubmitSignals->begin(); it != atomSubmitSignals->end(); it++) {
1515 #if 0 // Use blocking offload_wait pragma
1516 #pragma offload_wait target(mic:micDevice) wait(sig)
1518 #else // Use busy wait
1519 while (!_Offload_signaled(
master->micDevice, sig)) { }
1522 atomSubmitSignals->clear();
1524 #if MIC_TRACING != 0
1525 double atomSubmit_wait_finished = CmiWallTimer();
1526 MIC_TRACING_RECORD(MIC_EVENT_ATOMS_WAIT, atomSubmit_wait_start, atomSubmit_wait_finished);
1534 GBISP(
"GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1536 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
1537 pr.x = pr.positionBox->open();
1539 pr.xExt = pr.p->getCompAtomExtInfo();
1559 if (
master ==
this )
return 0;
1562 computeMgr->sendNonbondedMICSlaveReady(
masterPe,
1568 workStarted = ((singleKernelFlag != 0) ? (2) : (1));
1576 GBISP(
"C.N.MIC[%d]::doWork: seq %d, phase %d, workStarted %d\n", \
1591 if (
master ==
this && kernel_launch_state > 2 ) {
1596 mic_check_local_progress(
this,0.);
1606 workStarted = ((singleKernelFlag != 0) ? (2) : (1));
1642 #if MIC_SORT_COMPUTES != 0
1661 force_lists.resize(npatches);
1662 for (
int i=0; i<npatches; ++i ) {
1664 force_lists[i].force_list_size = 0;
1670 for (
int i=0; i<ncomputes; ++i ) {
1677 force_lists[lp1].force_list_size++;
1679 force_lists[lp2].force_list_size++;
1689 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL != 0
1690 pp.patch1_atomDataPtr =
patchRecords[cr.
pid[0]].p->mic_atomData_devicePtr[myDevice];
1691 pp.patch2_atomDataPtr =
patchRecords[cr.
pid[1]].p->mic_atomData_devicePtr[myDevice];
1696 for (
int i=0; i<ncomputes; ++i ) {
1702 pp.patch1_force_list_index = lp1;
1703 pp.patch1_force_list_size = force_lists[lp1].force_list_size;
1706 pp.patch2_force_list_index = pp.patch1_force_list_index;
1707 pp.patch2_force_list_size = pp.patch1_force_list_size;
1710 pp.patch2_force_list_index = lp2;
1711 pp.patch2_force_list_size = force_lists[lp2].force_list_size;
1715 if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
1716 CkPrintf(
"Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1728 int max_atoms_per_patch = 0;
1730 for (i = 0; i < npatches; i++) {
1740 force_lists[i].force_list_start = flstart;
1741 force_lists[i].force_output_start = istart;
1742 force_lists[i].atom_start = istart;
1746 pr.localStart = istart;
1750 int natoms = pr.p->getNumAtoms();
1751 int nfreeatoms = natoms;
1754 for (
int j=0; j<natoms; ++j ) {
1755 if ( aExt[j].atomFixed ) --nfreeatoms;
1758 if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
1759 pr.numAtoms = natoms;
1760 pr.numFreeAtoms = nfreeatoms;
1761 force_lists[i].patch_size = natoms;
1762 natoms = (natoms + 15) & (~15);
1763 nfreeatoms = (nfreeatoms + 15) & (~15);
1764 force_lists[i].patch_stride = natoms;
1767 int _flstart = flstart;
1768 int _vlstart = vlstart;
1771 flstart += natoms * force_lists[i].force_list_size;
1772 vlstart += 16 * force_lists[i].force_list_size;
1776 __ASSERT(_flstart <= flstart && _vlstart <= vlstart);
1778 force_lists[i].force_list_size = 0;
1792 if ( num_atom_records_allocated ) {
1794 _MM_FREE_WRAPPER(
atoms);
1795 _MM_FREE_WRAPPER(
forces);
1807 atom_params = (atom_param*)_MM_MALLOC_WRAPPER(num_atom_records_allocated *
sizeof(atom_param), 64,
"atom_params");
1808 atoms = (atom*)_MM_MALLOC_WRAPPER(num_atom_records_allocated *
sizeof(atom), 64,
"atoms");
1810 forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated *
sizeof(double4), 64,
"forces");
1811 slow_forces = (double4*)_MM_MALLOC_WRAPPER(num_atom_records_allocated *
sizeof(double4), 64,
"slow_forces");
1822 (intRad0H == NULL || intRadSH == NULL ||
psiSumH == NULL ||
1823 bornRadH == NULL ||
dEdaSumH == NULL || dHdrPrefixH == NULL))) {
1824 NAMD_die(
"Unable to allocate forces in ComputeNonbondedMIC::doWork");
1831 for (
int i=0; i<ncomputes; ++i ) {
1841 pp.patch1_force_start = force_lists[lp1].force_list_start
1842 + (force_lists[lp1].patch_stride * force_lists[lp1].force_list_size);
1843 force_lists[lp1].force_list_size++;
1848 pp.patch2_atom_start = pp.patch1_atom_start;
1849 pp.patch2_force_start = pp.patch1_force_start;
1850 pp.patch2_size = pp.patch1_size;
1851 pp.patch2_force_size = pp.patch1_force_size;
1854 pp.patch2_force_start = force_lists[lp2].force_list_start
1855 + (force_lists[lp2].patch_stride * force_lists[lp2].force_list_size);
1856 force_lists[lp2].force_list_size++;
1871 Vector p1_center = micCompute->patchMap->center(p1);
1872 pp.patch1_center_x = p1_center.
x;
1873 pp.patch1_center_y = p1_center.
y;
1874 pp.patch1_center_z = p1_center.
z;
1875 Vector p2_center = micCompute->patchMap->center(p2);
1876 pp.patch2_center_x = p2_center.
x;
1877 pp.patch2_center_y = p2_center.
y;
1878 pp.patch2_center_z = p2_center.
z;
1885 pp.block_flags_start = bfstart;
1886 bfstart += ((pp.patch1_force_size + 127) >> 7) << 5;
1891 CkPrintf(
"Pe %d mic_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
1894 max_atoms_per_patch);
1897 int totalmem =
patch_pairs.size() *
sizeof(patch_pair) +
1898 force_lists.size() *
sizeof(force_list) +
1899 num_force_records *
sizeof(double4) +
1911 mic_bind_force_lists_only(myDevice, force_lists.begin(), force_lists.size(), force_lists.bufSize());
1913 mic_bind_force_buffers_only(myDevice, (
size_t)num_force_records);
1921 float maxAtomMovement = 0.;
1922 float maxPatchTolerance = 0.;
1924 for (
int i=0; i<activePatches.size(); ++i ) {
1928 float maxMove = pr.p->flags.maxAtomMovement;
1929 if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1931 float maxTol = pr.p->flags.pairlistTolerance;
1932 if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1934 int start = pr.localStart;
1935 int n = pr.numAtoms;
1936 int n_16 = (n + 15) & (~15);
1940 #if MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
1944 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1946 for (
int k = 0; k < n; k++) {
1948 ap[k].vdw_type = a[j].
vdwType;
1949 ap[k].index = aExt[j].
id;
1950 #ifdef MEM_OPT_VERSION
1951 ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1952 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1954 ap[k].excl_index = exclusionsByAtom[aExt[j].
id].y;
1955 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].
id].x;
1960 int *ap_vdwType = ((
int*)ap) + (0 * n_16);
1961 int *ap_index = ((
int*)ap) + (1 * n_16);
1962 int *ap_exclIndex = ((
int*)ap) + (2 * n_16);
1963 int *ap_exclMaxDiff = ((
int*)ap) + (3 * n_16);
1964 for (
int k=0; k<n; ++k ) {
1967 ap_index[k] = aExt[j].
id;
1968 #ifdef MEM_OPT_VERSION
1969 ap_exclIndex[k] = exclusionsByAtom[aExt[j].exclId].y;
1970 ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].exclId].x;
1972 ap_exclIndex[k] = exclusionsByAtom[aExt[j].
id].y;
1973 ap_exclMaxDiff[k] = exclusionsByAtom[aExt[j].
id].x;
1981 const CudaAtom *ac = pr.p->getCudaAtomList();
1982 atom *ap =
atoms + start;
1983 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
1984 memcpy(ap, ac,
sizeof(atom)*n);
1986 memcpy(ap, ac,
sizeof(atom)*n_16);
1991 #endif // MIC_SUBMIT_ATOMS_ON_ARRIVAL == 0
2003 if ( flags.savePairlists ) {
2006 }
else if ( flags.usePairlists ) {
2067 kernel_time = CkWallTimer();
2068 kernel_launch_state = ((singleKernelFlag != 0) ? (2) : (1));
2072 void mic_check_remote_calc(
void *arg,
double) {
2073 if (mic_check_remote_kernel_complete(myDevice)) {
2074 computeMgr->sendYieldDevice(next_pe_sharing_mic);
2076 MIC_POLL(mic_check_remote_calc, arg);
2080 void mic_check_local_calc(
void *arg,
double) {
2081 if (mic_check_local_kernel_complete(myDevice)) {
2082 computeMgr->sendYieldDevice(next_pe_sharing_mic);
2084 MIC_POLL(mic_check_local_calc, arg);
2090 GBISP(
"C.N.MIC[%d]::recvYieldDevice: seq %d, workStarted %d, \
2091 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(),
sequence(), \
2106 switch ( kernel_launch_state ) {
2112 GBISP(
"C.N.MIC[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
2115 remote_submit_time = CkWallTimer();
2122 NAMD_die(
"Unsupported feature (DMK33949330)");
2130 mic_nonbonded_forces(myDevice, 1,
2131 num_local_atom_records,
2132 num_local_compute_records,
2133 num_local_patch_records,
2139 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2140 mic_first_kernel_submit_time = CmiWallTimer();
2148 MIC_POLL(mic_check_remote_progress,
this);
2157 GBISP(
"C.N.MIC[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
2166 NAMD_die(
"Unsupported feature (DMK83620583)");
2171 local_submit_time = CkWallTimer();
2178 mic_nonbonded_forces(myDevice, 0,
2179 num_local_atom_records,
2180 num_local_compute_records,
2181 num_local_patch_records,
2188 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2189 if (singleKernelFlag) { mic_first_kernel_submit_time = CmiWallTimer(); }
2198 MIC_POLL(mic_check_local_progress,
this);
2205 GBISP(
"C.N.MIC[%d]::recvYieldDevice: case default\n", CkMyPe())
2227 GBISP(
"C.N.MIC[%d]::fnWork: workStarted %d, phase %d\n", \
2234 mic_kernel_data * host__local_kernel_data = host__kernel_data;
2235 mic_kernel_data * host__remote_kernel_data = host__kernel_data + 1;
2236 mic_kernel_data* &kernel_data = (
workStarted == 1) ? (host__remote_kernel_data) : (host__local_kernel_data);
2240 for (
int i=0; i<patches.size(); ++i ) {
2242 GBISP(
"GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
2243 pr.r = pr.forceBox->open();
2248 for (
int i=0; i<patches.size(); ++i ) {
2252 int start = pr.localStart;
2256 if ( !simParams->
GBISOn || gbisPhase == 3 ) {
2261 int nAtoms = pr.numAtoms;
2262 int nAtoms_16 = (nAtoms + 15) & (~15);
2272 #if MIC_HANDCODE_FORCE_SOA_VS_AOS != 0
2275 for (
int k = 0; k < nAtoms; k++) {
2283 for (
int k = 0; k < nAtoms; k++) {
2285 f_slow[j].
x += af_slow[k].x;
2286 f_slow[j].
y += af_slow[k].y;
2287 f_slow[j].
z += af_slow[k].z;
2295 double *af_x = ((
double*)af) + (0 * nAtoms_16);
2296 double *af_y = ((
double*)af) + (1 * nAtoms_16);
2297 double *af_z = ((
double*)af) + (2 * nAtoms_16);
2298 double *af_w = ((
double*)af) + (3 * nAtoms_16);
2299 double *af_slow_x = ((
double*)af_slow) + (0 * nAtoms_16);
2300 double *af_slow_y = ((
double*)af_slow) + (1 * nAtoms_16);
2301 double *af_slow_z = ((
double*)af_slow) + (2 * nAtoms_16);
2302 double *af_slow_w = ((
double*)af_slow) + (3 * nAtoms_16);
2304 for (
int k = 0; k < nAtoms; k++) {
2310 f_slow[j].
x += af_slow_x[k];
2311 f_slow[j].
y += af_slow_y[k];
2312 f_slow[j].
z += af_slow_z[k];
2321 if ( i % 31 == 0 )
for (
int j=0; j<3; ++j ) {
2322 CkPrintf(
"Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
2323 j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
2371 pr.positionBox->close(&(pr.x));
2372 pr.forceBox->close(&(pr.r));
2384 double virial_xx = host__local_kernel_data->virial_xx;
2385 double virial_xy = host__local_kernel_data->virial_xy;
2386 double virial_xz = host__local_kernel_data->virial_xz;
2387 double virial_yy = host__local_kernel_data->virial_yy;
2388 double virial_yz = host__local_kernel_data->virial_yz;
2389 double virial_zz = host__local_kernel_data->virial_zz;
2390 double fullElectVirial_xx = host__local_kernel_data->fullElectVirial_xx;
2391 double fullElectVirial_xy = host__local_kernel_data->fullElectVirial_xy;
2392 double fullElectVirial_xz = host__local_kernel_data->fullElectVirial_xz;
2393 double fullElectVirial_yy = host__local_kernel_data->fullElectVirial_yy;
2394 double fullElectVirial_yz = host__local_kernel_data->fullElectVirial_yz;
2395 double fullElectVirial_zz = host__local_kernel_data->fullElectVirial_zz;
2396 double vdwEnergy = host__local_kernel_data->vdwEnergy;
2397 double electEnergy = host__local_kernel_data->electEnergy;
2398 double fullElectEnergy = host__local_kernel_data->fullElectEnergy;
2399 if (singleKernelFlag == 0) {
2400 virial_xx += host__remote_kernel_data->virial_xx;
2401 virial_xy += host__remote_kernel_data->virial_xy;
2402 virial_xz += host__remote_kernel_data->virial_xz;
2403 virial_yy += host__remote_kernel_data->virial_yy;
2404 virial_yz += host__remote_kernel_data->virial_yz;
2405 virial_zz += host__remote_kernel_data->virial_zz;
2406 fullElectVirial_xx += host__remote_kernel_data->fullElectVirial_xx;
2407 fullElectVirial_xy += host__remote_kernel_data->fullElectVirial_xy;
2408 fullElectVirial_xz += host__remote_kernel_data->fullElectVirial_xz;
2409 fullElectVirial_yy += host__remote_kernel_data->fullElectVirial_yy;
2410 fullElectVirial_yz += host__remote_kernel_data->fullElectVirial_yz;
2411 fullElectVirial_zz += host__remote_kernel_data->fullElectVirial_zz;
2412 vdwEnergy += host__remote_kernel_data->vdwEnergy;
2413 electEnergy += host__remote_kernel_data->electEnergy;
2414 fullElectEnergy += host__remote_kernel_data->fullElectEnergy;
2435 Tensor virial_slow_tensor;
2436 virial_slow_tensor.
xx = fullElectVirial_xx;
2437 virial_slow_tensor.
xy = fullElectVirial_xy;
2438 virial_slow_tensor.
xz = fullElectVirial_xz;
2439 virial_slow_tensor.
yx = fullElectVirial_xy;
2440 virial_slow_tensor.
yy = fullElectVirial_yy;
2441 virial_slow_tensor.
yz = fullElectVirial_yz;
2442 virial_slow_tensor.
zx = fullElectVirial_xz;
2443 virial_slow_tensor.
zy = fullElectVirial_yz;
2444 virial_slow_tensor.
zz = fullElectVirial_zz;
2450 #if MIC_EXCL_CHECKSUM_FULL != 0
2451 int exclusionSum = host__local_kernel_data->exclusionSum;
2452 if (singleKernelFlag == 0) { exclusionSum += host__remote_kernel_data->exclusionSum; }
2458 #if (MIC_TRACING != 0) && (MIC_DEVICE_TRACING != 0)
2460 double timeBase = host__device_times_start[((singleKernelFlag) ? (1) : (0))];
2462 #if MIC_DEVICE_TRACING_DETAILED != 0
2470 for (
int i = 0; i < host__patch_pairs_size; i++) {
2473 int pid0 = host__patch_pairs[i].p1;
2474 int pid1 = host__patch_pairs[i].p2;
2477 int trans0 = computeMap->
trans(host__patch_pairs[i].
cid, 0);
2478 int trans1 = computeMap->
trans(host__patch_pairs[i].cid, 1);
2485 int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1));
2486 int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1));
2487 int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1));
2488 dist = da + db + dc;
2492 double ueStart = host__device_times_computes[i * 2];
2493 double ueEnd = host__device_times_computes[i * 2 + 1];
2494 ueStart += mic_first_kernel_submit_time - timeBase;
2495 ueEnd += mic_first_kernel_submit_time - timeBase;
2497 if (dist > 7) { dist = 7; }
2498 traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTE + dist, ueStart, ueEnd);
2502 for (
int i = 0; i < host__force_lists_size; i++) {
2503 double ueStart = host__device_times_patches[i * 2];
2504 double ueEnd = host__device_times_patches[i * 2 + 1];
2505 ueStart += mic_first_kernel_submit_time - timeBase;
2506 ueEnd += mic_first_kernel_submit_time - timeBase;
2507 traceUserBracketEvent(MIC_EVENT_DEVICE_PATCH, ueStart, ueEnd);
2510 #endif // MIC_DEVICE_TRACING_DETAILED
2513 double lPTime0 = host__device_times_start[3];
2514 double lPTime1 = host__device_times_start[5];
2515 double lPTime2 = host__device_times_start[7];
2516 double lPTime3 = host__device_times_start[9];
2517 lPTime0 += mic_first_kernel_submit_time - timeBase;
2518 lPTime1 += mic_first_kernel_submit_time - timeBase;
2519 lPTime2 += mic_first_kernel_submit_time - timeBase;
2520 lPTime3 += mic_first_kernel_submit_time - timeBase;
2521 traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, lPTime0, lPTime1);
2523 traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, lPTime2, lPTime3);
2524 if (singleKernelFlag == 0) {
2525 double rPTime0 = host__device_times_start[2];
2526 double rPTime1 = host__device_times_start[4];
2527 double rPTime2 = host__device_times_start[6];
2528 double rPTime3 = host__device_times_start[8];
2529 rPTime0 += mic_first_kernel_submit_time - timeBase;
2530 rPTime1 += mic_first_kernel_submit_time - timeBase;
2531 rPTime2 += mic_first_kernel_submit_time - timeBase;
2532 rPTime3 += mic_first_kernel_submit_time - timeBase;
2533 traceUserBracketEvent(MIC_EVENT_DEVICE_COMPUTES, rPTime0, rPTime1);
2535 traceUserBracketEvent(MIC_EVENT_DEVICE_PATCHES, rPTime2, rPTime3);
2538 #endif // MIC_DEVICE_TRACING
2544 GBISP(
"finished\n");
2545 if (simParams->
GBISOn) gbisPhase = 1 + (gbisPhase % 3);
2558 if ( !simParams->
GBISOn || gbisPhase == 3 ) {
2561 MICP(
"submitting reduction...\n"); MICPF;
2585 float upload_ms, remote_calc_ms;
2586 float local_calc_ms, total_ms;
2587 mic_errcheck(
"before event timers");
2588 micEventElapsedTime(&upload_ms, start_upload,
start_calc);
2589 mic_errcheck(
"in event timer 1");
2591 mic_errcheck(
"in event timer 2");
2593 mic_errcheck(
"in event timer 3");
2595 mic_errcheck(
"in event timer 4");
2596 mic_errcheck(
"in event timers");
2598 CkPrintf(
"MIC EVENT TIMING: %d %f %f %f %f\n",
2599 CkMyPe(), upload_ms, remote_calc_ms,
2600 local_calc_ms, total_ms);
2604 mic_timer_total /= mic_timer_count;
2605 CkPrintf(
"MIC TIMING: %d %f ms/step on node %d\n",
2606 step, mic_timer_total * 1.e3, CkMyPe());
2608 mic_timer_count = 0;
2609 mic_timer_total = 0;
2616 GBISP(
"C.N.MIC[%d]::fnWork: incrementing phase\n", CkMyPe())
2617 if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);
2629 __thread FILE* mic_output = NULL;
2630 __thread
int mic_output_set = 0;
2633 void mic_initproc() {
2640 void debugInit(FILE* fout) {
2641 if (mic_output != NULL) {
return; }
2647 sprintf(fname,
"mic_debug.%d", CkMyPe());
2648 printf(
"[MIC-INFO] :: Creating MIC debug file \"%s\" for PE %d...\n", fname, CkMyPe());
2649 mic_output = fopen(fname,
"w");
2655 if (mic_output_set == 0) { fclose(mic_output); mic_output = NULL; }
2659 void mic_initHostDeviceLDB() {
2665 #define COMPUTE_DISTANCE(cid) \
2666 int manDist = -1; { \
2667 if (computeMap->type(cid) == computeNonbondedSelfType) { \
2669 } else if (computeMap->type(cid) == computeNonbondedPairType) { \
2670 int aSize = patchMap->gridsize_a(); \
2671 int bSize = patchMap->gridsize_b(); \
2672 int cSize = patchMap->gridsize_c(); \
2673 int pid0 = computeMap->pid(cid, 0); \
2674 int pid1 = computeMap->pid(cid, 1); \
2675 int trans0 = computeMap->trans(cid, 0); \
2676 int trans1 = computeMap->trans(cid, 1); \
2677 int index_a0 = patchMap->index_a(pid0) + aSize * Lattice::offset_a(trans0); \
2678 int index_b0 = patchMap->index_b(pid0) + bSize * Lattice::offset_b(trans0); \
2679 int index_c0 = patchMap->index_c(pid0) + cSize * Lattice::offset_c(trans0); \
2680 int index_a1 = patchMap->index_a(pid1) + aSize * Lattice::offset_a(trans1); \
2681 int index_b1 = patchMap->index_b(pid1) + bSize * Lattice::offset_b(trans1); \
2682 int index_c1 = patchMap->index_c(pid1) + cSize * Lattice::offset_c(trans1); \
2683 int da = index_a0 - index_a1; da *= ((da < 0) ? (-1) : (1)); \
2684 int db = index_b0 - index_b1; db *= ((db < 0) ? (-1) : (1)); \
2685 int dc = index_c0 - index_c1; dc *= ((dc < 0) ? (-1) : (1)); \
2686 manDist = da + db + dc; \
2691 void mic_hostDeviceLDB() {
2697 if (devicePe != CkMyPe()) {
2707 void mic_setDeviceLDBParams(
int dt,
int hs,
int sp1,
int pp1,
int pp2) {
2708 if (CkMyPe() != 0) {
2709 if (dt >= 0) { mic_deviceThreshold = dt; }
2710 if (hs >= 0) { mic_hostSplit = hs; }
2711 if (sp1 > 0) { mic_numParts_self_p1 = sp1; }
2712 if (pp1 > 0) { mic_numParts_pair_p1 = pp1; }
2713 if (pp2 > 0) { mic_numParts_pair_p2 = pp2; }
2718 void mic_contributeHostDeviceLDB(
int peSetLen,
int * peSet) {
2721 static int numContribs = 0;
2722 __ASSERT(numContribs >= 0 && numContribs < CkNumPes());
2724 static double hdLDBTime = 0.0;
2725 static double hdLDBTime_phase0 = 0.0;
2726 static double hdLDBTime_phase1 = 0.0;
2727 static double hdLDBTime_phase2a = 0.0;
2728 static double hdLDBTime_phase2b = 0.0;
2729 double startTime = CmiWallTimer();
2737 if (numContribs == 0) {
2741 if (mic_hostSplit < 0) { mic_hostSplit = simParams->
mic_hostSplit; }
2752 #if MIC_VERBOSE_HVD_LDB != 0
2753 CkPrintf(
"[MIC-HvD-LDB] :: PE %d :: offload pe list = {", CkMyPe());
2754 for (
int i = 0; i < peSetLen; i++) { CkPrintf(
" %d", peSet[i]); }
2760 if (mic_deviceThreshold < 0 || mic_hostSplit <= 0) {
2763 const int ppn = CkNodeSize(0);
2764 const int ppm = ppn / mic_get_device_count();
2765 const float atomsPerPE = ((float)(molecule->
numAtoms)) / ((
float)(CkNumPes()));
2766 const int numAway = ((simParams->
twoAwayX > 0) ? (1) : (0))
2767 + ((simParams->
twoAwayY > 0) ? (1) : (0))
2768 + ((simParams->
twoAwayZ > 0) ? (1) : (0));
2771 const float atomsPerPE_adj = atomsPerPE + (atomsPerPE * 0.85f * numAway);
2774 #if MIC_VERBOSE_HVD_LDB != 0
2775 printf(
"[MIC-HvD-LDB] :: PE %d :: ppn: %d, ppm: %d, atomsPerPe: %f (%f)\n", CkMyPe(), ppn, ppm, atomsPerPE, atomsPerPE_adj);
2787 mic_deviceThreshold = 2;
2791 float hs_base = 0.714f * 79.0f;
2792 float hostVsDevice_adj = 0.714f * ppm / 2.0f;
2793 float scaling_adj = 0.714f * 26000.0f / atomsPerPE_adj;
2795 mic_hostSplit = (int)(hs_base - hostVsDevice_adj - scaling_adj);
2796 if (mic_hostSplit < 5) { mic_hostSplit = 5; }
2797 if (mic_hostSplit > 95) { mic_hostSplit = 95; }
2805 iout <<
iINFO <<
"MIC-HvD-LDB - Automated HvD LDB Setting DT:" << mic_deviceThreshold <<
" and HS:" << mic_hostSplit <<
"\n" <<
endi;
2808 if (mic_hostSplit > 100) { mic_hostSplit = 100; }
2810 __ASSERT(mic_deviceThreshold >= 0 && mic_hostSplit >= 0);
2814 std::queue<int> * selfs =
new std::queue<int>();
2815 std::queue<int> * pairs =
new std::queue<int>();
2816 std::queue<int> * selfsTmp =
new std::queue<int>();
2817 std::queue<int> * pairsTmp =
new std::queue<int>();
2818 __ASSERT(selfs != NULL && pairs != NULL && selfsTmp != NULL && pairsTmp != NULL);
2822 #define WIDEN_PID_RANGE(pid) \
2823 if (minPID < 0) { minPID = pid; maxPID = pid; } \
2824 if (pid < minPID) { minPID = pid; } \
2825 if (pid > maxPID) { maxPID = pid; }
2827 double startTime_phase0 = CmiWallTimer();
2832 int totalNumSelfs = 0;
2833 int totalNumPairs = 0;
2834 for (
int i = 0; i < nComputes; i++) {
2837 int pe = computeMap->
node(i);
2838 bool peInSet =
false;
2839 for (
int j = 0; !peInSet && j < peSetLen; j++) {
2840 if (peSet[j] == pe) { peInSet =
true; }
2842 if (!peInSet) {
continue; }
2854 COMPUTE_DISTANCE(i);
2855 if (manDist < 0 || manDist > mic_deviceThreshold) {
2863 int pid0 = computeMap->
pid(i, 0); WIDEN_PID_RANGE(pid0);
2867 int pid0 = computeMap->
pid(i, 0); WIDEN_PID_RANGE(pid0);
2868 int pid1 = computeMap->
pid(i, 1); WIDEN_PID_RANGE(pid1);
2871 __ASSERT(minPID <= maxPID);
2873 hdLDBTime_phase0 += CmiWallTimer() - startTime_phase0;
2876 int total = totalNumSelfs + totalNumPairs;
2877 int target = (int)((total * mic_hostSplit) / 100.0f);
2878 if (target > total) { target = total; }
2881 #if MIC_VERBOSE_HVD_LDB != 0
2882 printf(
"[MIC-HvD-LDB] :: PE %d :: Phase 0 Result - selfs:%d, pairs:%d, other:%d, self+pair:%d, target:%d\n",
2883 CkMyPe(), (
int)(selfs->size()), (
int)(pairs->size()), (
int)(pairsTmp->size()), total, target
2891 bool * pidFlag =
new bool[maxPID - minPID + 1];
2892 std::queue<int> * offloads =
new std::queue<int>();
2893 __ASSERT(offloads != NULL && pidFlag != NULL);
2894 int offloads_selfs = 0;
2895 int offloads_pairs = 0;
2896 for (
int i = minPID; i <= maxPID; i++) { pidFlag[i - minPID] =
false; }
2900 int peLo = peSet[0];
2901 int peHi = peSet[0];
2902 for (
int i = 0; i < peSetLen; i++) {
2903 if (peSet[i] < peLo) { peLo = peSet[i]; }
2904 if (peSet[i] > peHi) { peHi = peSet[i]; }
2906 __ASSERT(peLo <= peHi);
2907 bool * peSetFlag =
new bool[peHi - peLo + 1];
2908 __ASSERT(peSetFlag != NULL);
2909 for (
int i = peLo; i <= peHi; i++) { peSetFlag[i - peLo] =
false; }
2913 double startTime_phase1 = CmiWallTimer();
2918 int numSelfs = selfs->size();
2919 for (
int i = 0; i < numSelfs; i++) {
2920 int cid = selfs->front(); selfs->pop();
2921 int pid = computeMap->
pid(cid, 0);
2922 int pe0 = patchMap->
node(pid);
2923 if (pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) {
2924 offloads->push(cid);
2926 peSetFlag[pe0 - peLo] =
true;
2927 pidFlag[pid - minPID] =
true;
2933 int numPairs = pairs->size();
2934 for (
int i = 0; i < numPairs; i++) {
2935 int cid = pairs->front(); pairs->pop();
2936 int pid0 = computeMap->
pid(cid, 0);
2937 int pid1 = computeMap->
pid(cid, 1);
2938 int pe0 = patchMap->
node(pid0);
2939 int pe1 = patchMap->
node(pid1);
2940 if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
2941 (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
2943 offloads->push(cid);
2945 if (pe0 >= peLo && pe0 <= peHi) {
2946 peSetFlag[pe0 - peLo] =
true;
2947 pidFlag[pid0 - minPID] =
true;
2949 if (pe1 >= peLo && pe1 <= peHi) {
2950 peSetFlag[pe1 - peLo] =
true;
2951 pidFlag[pid1 - minPID] =
true;
2961 numPairs = pairsTmp->size();
2962 bool usedOther =
false;
2963 while (!(pairsTmp->empty())) {
2964 int cid = pairsTmp->front(); pairsTmp->pop();
2965 int pid0 = computeMap->
pid(cid, 0);
2966 int pid1 = computeMap->
pid(cid, 1);
2967 int pe0 = patchMap->
node(pid0);
2968 int pe1 = patchMap->
node(pid1);
2969 if ((pe0 >= peLo && pe0 <= peHi && !(peSetFlag[pe0 - peLo])) ||
2970 (pe1 >= peLo && pe1 <= peHi && !(peSetFlag[pe1 - peLo]))
2972 offloads->push(cid);
2974 if (pe0 >= peLo && pe0 <= peHi) {
2975 peSetFlag[pe0 - peLo] =
true;
2976 pidFlag[pid0 - minPID] =
true;
2978 if (pe1 >= peLo && pe1 <= peHi) {
2979 peSetFlag[pe1 - peLo] =
true;
2980 pidFlag[pid1 - minPID] =
true;
2984 if (usedOther) { CkPrintf(
"[MIC-WARNING] :: Offloaded non-bonded compute that didn't meet LDB criteria to satisfy phase 1 requirement (just FYI, not a problem)\n"); }
2986 hdLDBTime_phase1 += CmiWallTimer() - startTime_phase1;
2991 for (
int i = 0; i < peSetLen; i++) {
2992 if (peSetFlag[peSet[i] - peLo]) { numPEsSet++; }
2994 __ASSERT(numPEsSet > 0);
2997 #if MIC_VERBOSE_HVD_LDB != 0
2998 printf(
"[MIC-HvD-LDB] :: PE %d :: Phase 1 Result - selfs:%d, pairs:%d, offloads:%d, target:%d\n",
2999 CkMyPe(), (
int)(selfs->size()), (
int)(pairs->size()), (
int)(offloads->size()), target
3006 while (offloads->size() < target) {
3010 double time_0 = CmiWallTimer();
3020 int miniTarget = (target - offloads->size()) / 50;
3022 if (miniTarget < 1) { miniTarget = 1; }
3023 for (
int k = 0; k < miniTarget; k++) {
3026 if (selfs->size() > 0) { cid = selfs->front(); selfs->pop(); offloads_selfs++; }
3027 else if (pairs->size() > 0) { cid = pairs->front(); pairs->pop(); offloads_pairs++; }
3032 offloads->push(cid);
3036 int pid0 = computeMap->
pid(cid, 0);
3037 pidFlag[pid0 - minPID] =
true;
3039 int pid1 = computeMap->
pid(cid, 1);
3040 pidFlag[pid1 - minPID] =
true;
3043 if (cid < 0) {
break; }
3047 double time_1 = CmiWallTimer();
3051 while (offloads->size() < target && selfs->size() > 0) {
3052 int cid = selfs->front(); selfs->pop();
3053 int pid0 = computeMap->
pid(cid, 0);
3054 if (pidFlag[pid0 - minPID]) {
3055 offloads->push(cid);
3058 selfsTmp->push(cid);
3061 { std::queue<int> * t = selfs; selfs = selfsTmp; selfsTmp = t; }
3063 while (offloads->size() < target && pairs->size() > 0) {
3064 int cid = pairs->front(); pairs->pop();
3065 int pid0 = computeMap->
pid(cid, 0);
3066 int pid1 = computeMap->
pid(cid, 1);
3067 if (pidFlag[pid0 - minPID] && pidFlag[pid1 - minPID]) {
3068 offloads->push(cid);
3071 pairsTmp->push(cid);
3074 { std::queue<int> * t = pairs; pairs = pairsTmp; pairsTmp = t; }
3076 double time_2 = CmiWallTimer();
3077 hdLDBTime_phase2a += time_1 - time_0;
3078 hdLDBTime_phase2b += time_2 - time_1;
3082 #if MIC_VERBOSE_HVD_LDB != 0
3083 printf(
"[MIC-HvD-LDB] :: PE %d :: Phase 2 Result [%d->%d] -- selfs:%d, pairs:%d, offloads:%d (s:%d, p:%d), target:%d\n",
3084 CkMyPe(), peLo, peHi, (
int)(selfs->size() + selfsTmp->size()), (
int)(pairs->size() + pairsTmp->size()),
3085 (
int)(offloads->size()), offloads_selfs, offloads_pairs, target
3091 if (mic_numParts_self_p1 <= 0 || mic_numParts_pair_p1 <= 0 || mic_numParts_pair_p2 <= 0) {
3093 int numDeviceComputes = offloads->size();
3094 if (numDeviceComputes > 1000) {
3095 mic_numParts_self_p1 = 1;
3096 mic_numParts_pair_p1 = 1;
3097 mic_numParts_pair_p2 = 1;
3098 }
else if (numDeviceComputes <= 1000 && numDeviceComputes > 350) {
3099 mic_numParts_self_p1 = 3;
3100 mic_numParts_pair_p1 = 3;
3101 mic_numParts_pair_p2 = 1;
3104 if (offloads_selfs <= 0) { offloads_selfs = 1; }
3105 if (offloads_pairs <= 0) { offloads_pairs = 1; }
3106 #define CONFINE_TO_RANGE(v, mi, ma) (v = (v > ma) ? (ma) : ((v < mi) ? (mi) : (v)))
3107 mic_numParts_self_p1 = 300 / offloads_selfs; CONFINE_TO_RANGE(mic_numParts_self_p1, 8, 16);
3108 mic_numParts_pair_p1 = 500 / offloads_pairs; CONFINE_TO_RANGE(mic_numParts_pair_p1, 4, 16);
3109 #undef CONFINE_TO_RANGE
3110 mic_numParts_pair_p2 = mic_numParts_pair_p1 / 2;
3111 mic_numParts_pair_p1 += mic_numParts_pair_p2;
3115 iout <<
iINFO <<
"MIC-HvD-LDB - Automated HvD LDB Setting NumParts { " << mic_numParts_self_p1 <<
", " << mic_numParts_pair_p1 <<
", " << mic_numParts_pair_p2 <<
" }\n" <<
endi;
3119 while (offloads->size() > 0) {
3120 int cid = offloads->front(); offloads->pop();
3121 computeMap->setDirectToDevice(cid, 1);
3130 delete [] peSetFlag;
3137 hdLDBTime += CmiWallTimer() - startTime;
3138 if (numContribs >= CkNumPes()) {
3139 mic_dumpHostDeviceComputeMap();
3141 mic_numParts_self_p1, mic_numParts_pair_p1, mic_numParts_pair_p2);
3142 #if MIC_VERBOSE_HVD_LDB != 0
3143 CkPrintf(
"[MIC-HvD-LDB] :: PE %d :: Total HvD LDB Time: %lf sec = { %lf, %lf, (%lf, %lf) }\n",
3144 CkMyPe(), hdLDBTime, hdLDBTime_phase0, hdLDBTime_phase1, hdLDBTime_phase2a, hdLDBTime_phase2b
3151 void mic_dumpHostDeviceComputeMap() {
3152 #if MIC_DUMP_COMPUTE_MAPPING != 0
3155 char filename[256] = { 0 };
3156 sprintf(filename,
"deviceComputeMap.%d", CkMyPe());
3157 FILE * fmap = fopen(filename,
"w");
3159 printf(
"[MIC-INFO] :: PE %d :: Writing device compute map to \"%s\".\n", CkMyPe(), filename);
3160 fprintf(fmap,
"Device compute map\n");
3161 for (
int i = 0; i < nComputes; i++) {
3162 if (i % 50 == 0) { fprintf(fmap,
"\n%d:", i); }
3163 fprintf(fmap,
"%s%d", ((i % 10 == 0) ? (
" ") : (
"")), computeMap->directToDevice(i));
3167 printf(
"[MIC-WARNING] :: Unable to dump device compute map to file \"%s\"... skipping.\n", filename);
void setNumPatches(int n)
static void bind_force_table(int deviceNum)
static int offset_b(int i)
register BigReal virial_xy
#define COMPUTE_PROXY_PRIORITY
register BigReal virial_xz
std::ostream & iINFO(std::ostream &s)
ResizeArray< compute_record > computeRecords
Type * getNewArray(int n)
static void messageFinishMIC(Compute *)
#define PROXY_RESULTS_PRIORITY
ResizeArray< int > localActivePatches
register BigReal virial_yz
void recvYieldDevice(int pe)
static bool sortop_bitreverse(int a, int b)
static int offset_c(int i)
static __thread int check_remote_count
static ProxyMgr * Object()
ComputeNonbondedMIC ** slaves
void requirePatch(int pid)
static BigReal dielectric_1
static __thread cudaEvent_t end_remote_download
int gridsize_c(void) const
static PatchMap * Object()
static __thread ComputeMgr * computeMgr
const int32 * get_mod_exclusions_for_atom(int anum) const
static void send_contributeHostDeviceLDB(int peSetLen, int *peSet)
#define ADD_TENSOR_OBJECT(R, RL, D)
SimParameters * simParameters
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 lata
static __thread int lj_table_size
register BigReal electEnergy
int index_a(int pid) const
static __thread atom * atoms
static const Molecule * mol
static void bind_lj_table(int deviceNum)
static __thread float * bornRadH
std::ostream & endi(std::ostream &s)
static __thread int2 * exclusionsByAtom
static __thread float * dHdrPrefixH
#define PROXY_DATA_PRIORITY
int num_local_compute_records
SubmitReduction * willSubmit(int setID, int size=-1)
int get_table_dim() const
if(ComputeNonbondedUtil::goMethod==2)
static ReductionMgr * Object(void)
Patch * patch(PatchID pid)
static PatchMap * ObjectOnPe(int pe)
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 latb
static __thread cudaEvent_t end_local_download
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 const float4 *__restrict__ const float cudaTextureObject_t cudaTextureObject_t float plcutoff2
register BigReal virial_yy
static Units next(Units u)
static __thread patch_pair * patch_pairs
CudaAtom * getCudaAtomList()
static __thread float * intRadSH
bool mic_device_shared_with_pe(int pe)
ResizeArray< int > activePatches
int gridsize_a(void) const
static __thread double kernel_time
int num_remote_compute_records
void NAMD_bug(const char *err_msg)
static int offset_a(int i)
ComputeType type(ComputeID cid)
OwnerBox< Patch, CompAtom > positionBox
ComputeNonbondedMIC(ComputeID c, ComputeMgr *mgr, ComputeNonbondedMIC *m=0, int idx=-1)
int index_b(int pid) const
static void bind_exclusions(int deviceNum)
static void bind_constants(int deviceNum)
register BigReal virial_zz
ResizeArray< compute_record > remoteComputeRecords
void createProxy(PatchID pid)
void NAMD_die(const char *err_msg)
SubmitReduction * reduction
const TableEntry * get_table() const
int num_local_atom_records
static AtomMap * Object()
void mic_getargs(char **)
ResizeArray< int > remoteHostedPatches
int index_c(int pid) const
static __thread double remote_submit_time
int add(const Elem &elem)
static __thread double local_submit_time
ResizeArray< int > hostedPatches
static void send_setDeviceLDBParams(int dt, int hs, int sp1, int pp1, int pp2)
int numPatches(void) const
static __thread float * energy_gbis
static ComputeMap * Object()
ResizeArray< compute_record > localComputeRecords
static __thread int kernel_launch_state
static const LJTable * ljTable
register BigReal virial_xx
ComputeNonbondedMIC * master
static __thread cudaEvent_t start_calc
bool operator()(int32 *li, int32 *lj)
static __thread atom_param * atom_params
ResizeArray< patch_record > patchRecords
static void send_initHostDeviceLDB()
int pid(ComputeID cid, int i)
int num_local_patch_records
virtual void patchReady(PatchID, int doneMigration, int seq)
int num_remote_atom_records
__global__ void const int const TileList *__restrict__ TileExcl *__restrict__ const int *__restrict__ const int const float2 *__restrict__ cudaTextureObject_t const int *__restrict__ const float3 const float3 const float3 latc
int num_remote_patch_records
int trans(ComputeID cid, int i)
ResizeArray< int > remoteActivePatches
void sendBuildMICForceTable()
int gridsize_b(void) const
const int32 * get_full_exclusions_for_atom(int anum) const
static __thread float * intRad0H
static __thread int check_local_count
ResizeArray< int > localHostedPatches
CompAtomExt * getCompAtomExtInfo()