#include <cuda_runtime.h>
#include <hip/hip_runtime.h>

#if defined(NAMD_CUDA) || defined(NAMD_HIP)

#define __thread __declspec(thread)

extern __thread cudaStream_t stream;
extern __thread cudaStream_t stream2;
void cuda_errcheck(const char *msg) {
  cudaError_t err;
  if ( (err = cudaGetLastError()) != cudaSuccess ) {
    char host[128];
    gethostname(host, 128);  host[127] = 0;
    char devstr[128] = "";
    int devnum;
    if ( cudaGetDevice(&devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d", devnum);
    }
    cudaDeviceProp deviceProp;
    if ( cudaGetDeviceProperties(&deviceProp, devnum) == cudaSuccess ) {
      sprintf(devstr, " device %d pci %x:%x:%x", devnum,
          deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    }
    char errmsg[1024];
    sprintf(errmsg, "CUDA error %s on Pe %d (%s%s): %s",
        msg, CkMyPe(), host, devstr, cudaGetErrorString(err));
    NAMD_die(errmsg);
  }
}
static inline bool sortop_bitreverse(int a, int b) {
  if ( a == b ) return 0;
  for ( int bit = 1; bit; bit *= 2 ) {
    if ( (a&bit) != (b&bit) ) return ((a&bit) < (b&bit));
  }
  return 0;
}
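// Compares integers by their bit-reversed value: the lowest-order differing
// bit decides the ordering, so e.g. 4 (100b) sorts before 2 (010b).  This
// scatters consecutive indices across the sorted sequence, a common trick
// for interleaving work that would otherwise cluster.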
int tsize = (((dim+16+31)/32)*32)-16;
if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
// ...
for ( int i=0; i<dim; ++i, row += tsize ) {
  for ( int j=0; j<dim; ++j ) {
// ...
if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
}
const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
// ...
double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
// ...
union { double f; int32 i[2]; } byte_order_test;
byte_order_test.f = 1.0;
int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
// ...
double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;
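// The union is a runtime endianness probe: for the value 1.0 the 32-bit
// word holding the IEEE-754 sign/exponent bits is nonzero, so r2iilist is
// offset to point at the exponent-carrying half of each double in r2list.
// Shifting that word right by 14 keeps the 11 exponent bits plus the top 6
// mantissa bits as a table index; r2_delta_expc = 64*(r2_delta_exp - 1023)
// removes the exponent bias at the same 2^6 scale.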
BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
// ...
BigReal ener = table_a + diffa *
    ( ( table_d * diffa + table_c ) * diffa + table_b );
// ...
BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
// ...
BigReal ener = table_a + diffa *
    ( ( table_d * diffa + table_c ) * diffa + table_b );
// ...
BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
t[i].y = 2. * -1. * grad;
BigReal ener = table_a + diffa *
    ( ( table_d * diffa + table_c ) * diffa + table_b );
et[i].y = -1. * ener;
// ...
BigReal grad = ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
// ...
BigReal ener = table_a + diffa *
    ( ( table_d * diffa + table_c ) * diffa + table_b );
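// Each block reads cubic-spline coefficients (table_a..table_d) and, with
// x = diffa, evaluates the energy a + b*x + c*x^2 + d*x^3 and its exact
// derivative b + 2*c*x + 3*d*x^2 via Horner's rule.  The -1. factors store
// negated values, following this table component's sign convention.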
if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
  CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
}
bool operator() (int32 *li, int32 *lj) {
  return ( li[1] < lj[1] );  // order exclusion lists by their maxdiff field
}
#ifdef MEM_OPT_VERSION
  int natoms = mol->exclSigPoolSize;
#endif
  // ...
  for ( int i=0; i<natoms; ++i ) {
    // ...
#ifdef MEM_OPT_VERSION
    // ...
    for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
#else
    int n = mol_list[0] + 1;
    for ( int j=1; j<n; ++j ) {
      curList.add(mol_list[j] - i);
    }
#endif
    // ...
    for ( j=0; j<unique_lists.size(); ++j ) {
      if ( n != unique_lists[j][0] ) continue;  // lengths differ, no match
      for ( k=0; k<n; ++k ) {
        if ( unique_lists[j][k+3] != curList[k] ) break;
      }
      // ...
    }
    if ( j == unique_lists.size() ) {  // no match found, create a new list
      // ...
      maxdiff = -1 * curList[0];
      if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
      // ...
      for ( int k=0; k<n; ++k ) {
        list[k+3] = curList[k];
      }
      unique_lists.add(list);
    }
    listsByAtom[i] = unique_lists[j];
  }

  long int totalbits = 0;
  int nlists = unique_lists.size();
  for ( int j=0; j<nlists; ++j ) {
    int32 *list = unique_lists[j];
    int maxdiff = list[1];
    list[2] = totalbits + maxdiff;  // bit window for this list is centered here
    totalbits += 2*maxdiff + 1;
  }
  for ( int i=0; i<natoms; ++i ) {
    // ...
  }
  delete [] listsByAtom;

  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );  // pad to whole words
  // ...
  long int bytesneeded = totalbits / 8;
  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
    CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
        unique_lists.size(), bytesneeded);
  }
  // ...
  if ( bytesneeded > bytesavail ) {
    sprintf(errmsg, "Found %d unique exclusion lists needing %ld bytes "
        "but only %ld bytes can be addressed with 32-bit int.",
        unique_lists.size(), bytesneeded, bytesavail);
  }
#define SET_EXCL(EXCL,BASE,DIFF) \
  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
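// SET_EXCL sets bit number (BASE+DIFF) in a packed bit array: >>5 selects
// the 32-bit word and &31 the bit within it.  DIFF may be negative, which
// is why each list's BASE is advanced by its maxdiff before bits are set.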
unsigned int *exclusion_bits = new unsigned int[totalbits/32];
memset(exclusion_bits, 0, totalbits/8);

long int base = 0;
for ( int i=0; i<unique_lists.size(); ++i ) {
  base += unique_lists[i][1];
  if ( unique_lists[i][2] != (int32)base ) {
    NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
  }
  int n = unique_lists[i][0];
  for ( int j=0; j<n; ++j ) {
    SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
  }
  base += unique_lists[i][1] + 1;
}
// ...
delete [] exclusion_bits;
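// Layout recap: each unique list occupies a window of 2*maxdiff+1 bits
// centered at its stored base, so an exclusion test for an atom pair only
// needs the per-atom (maxdiff, base) pair plus this shared bit array, which
// is uploaded to the GPU via cuda_bind_exclusions().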
cr.pid[0] = pid;  cr.pid[1] = pid;
// ...
cr.pid[0] = pid[0];  cr.pid[1] = pid[1];
// ...
offset.x += (t1%3-1) - (t2%3-1);
offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
offset.z += (t1/9-1) - (t2/9-1);
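// The transform codes t1 and t2 encode a periodic-image shift in base 3:
// digit 0 (t%3) is the a-axis shift, digit 1 ((t/3)%3) the b-axis, and
// digit 2 (t/9) the c-axis, each mapped from {0,1,2} to {-1,0,+1}.
// For example t = 14 = 2 + 3*1 + 9*1 decodes to a shift of (+1, 0, 0).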
NAMD_bug("unregister_compute unimplemented");
// ...
CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
// ...
if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
NAMD_die("pressure profile not supported in CUDA");
// ...
NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
// ...
NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
// ...
NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
#if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  if ( leastPriority != greatestPriority ) {
    if ( CkMyNode() == 0 ) {
      CkPrintf("CUDA device %d stream priority range %d %d\n",
          dev, leastPriority, greatestPriority);
    }
    // ...
    greatestPriority = leastPriority;
    // ...
    if (params->usePMECUDA) greatestPriority = leastPriority;
    cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
    cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
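// In the CUDA runtime a numerically lower priority value means higher
// scheduling priority, and cudaDeviceGetStreamPriorityRange() returns the
// high-priority bound in its second argument.  Here `stream` (remote work,
// whose results other nodes wait on) gets the higher priority and `stream2`
// (local work) the lower one, unless PME claims the high-priority slot.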
// ...
cudaStreamCreate(&stream);
// ...
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
  if ( CkMyNode() == 0 )
    CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
}
// ...
cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
588 }
else if ( CkNumNodes() < 2 ) {
621 for (
int i=0; i<npatches; ++i ) {
624 if ( pr.
hostPe == CkMyPe() ) {
648 if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
656 if ( ppi != ppj )
return ppi < ppj;
int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
int numPesOnNodeSharingDevice = 0;
int masterIndex = -1;
// ...
if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
if ( CkNodeOf(pe) == CkMyNode() ) {
  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
}
// ...
for ( int i=0; i<npatches; ++i ) {
  // ...
  if ( CkNodeOf(homePe) == CkMyNode() ) {
    homePatchByRank[CkRankOf(homePe)].add(pid);
  }
}
for ( int i=0; i<CkMyNodeSize(); ++i ) {
  // ...
  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
    int pid = homePatchByRank[i][j];
    patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
  }
}
int *count = new int[npatches];
memset(count, 0, sizeof(int)*npatches);
int *pcount = new int[numPesOnNodeSharingDevice];
memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
int *rankpcount = new int[CkMyNodeSize()];
memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
char *table = new char[npatches*numPesOnNodeSharingDevice];
memset(table, 0, npatches*numPesOnNodeSharingDevice);

int unassignedpatches = npatches;
// ...
for ( int i=0; i<npatches; ++i ) {
  // ...
}
unassignedpatches = 0;
pcount[masterIndex] = npatches;
// assign patches to their home Pe when it shares this device
for ( int i=0; i<npatches; ++i ) {
  // ...
  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
    int pe = pesOnNodeSharingDevice[j];
    if ( pe == homePe ) {
      pr.hostPe = pe;  --unassignedpatches;
      // ...
    }
    // ...
    table[i*numPesOnNodeSharingDevice+j] = 1;
  }
  if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
    pr.hostPe = homePe;  --unassignedpatches;
    rankpcount[CkRankOf(homePe)] += 1;
  }
}
// assign patches for which only one candidate Pe already holds a proxy
for ( int i=0; i<npatches; ++i ) {
  // ...
  if ( pr.hostPe != -1 ) continue;
  // ...
  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
    if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
  }
  // ...
  pr.hostPe = pesOnNodeSharingDevice[lastj];
}
// round-robin the remaining patches over the Pes sharing the device
while ( unassignedpatches ) {
  int i;
  for ( i=0; i<npatches; ++i ) {
    if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
    // ...
    if ( pr.hostPe != -1 ) continue;
    pr.hostPe = pesOnNodeSharingDevice[assignj];
    // ...
    pcount[assignj] += 1;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    break;
  }
  if ( i<npatches ) continue;
  for ( i=0; i<npatches; ++i ) {
    // ...
    if ( pr.hostPe != -1 ) continue;
    if ( count[i] ) continue;
    pr.hostPe = pesOnNodeSharingDevice[assignj];
    // ...
    pcount[assignj] += 1;
    if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
    break;
  }
  if ( i<npatches ) continue;
  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
}
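// Assignment proceeds in three passes: (1) a patch goes to its home Pe if
// that Pe shares the device, (2) a patch with exactly one Pe already
// holding a proxy goes there, and (3) leftovers are dealt out round-robin
// via assignj, preferring Pes that already have a proxy (table[] hit)
// before falling back to patches with no proxy on any candidate Pe.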
for ( int i=0; i<npatches; ++i ) {
  // ...
}
// ...
for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
  int pe = pesOnNodeSharingDevice[j];
  int rank = pe - CkNodeFirst(CkMyNode());
  // ...
  if ( pe == CkMyPe() ) continue;
  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
  rankpcount[rank] = 0;  // skip in the rank loop below
  // ...
}
for ( int j=0; j<CkMyNodeSize(); ++j ) {
  int pe = CkNodeFirst(CkMyNode()) + j;
  // ...
  if ( ! rankpcount[j] ) continue;
  if ( pe == CkMyPe() ) continue;
  // ...
}

delete [] pesOnNodeSharingDevice;
// ...
delete [] rankpcount;
#define CUDA_POLL(FN,ARG) CcdCallFnAfter(FN,ARG,0.1)

#define GBISP(...) CkPrintf(__VA_ARGS__);

#define count_limit 1000000
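// CUDA_POLL re-arms a completion check 0.1 ms in the future through the
// Charm++ scheduler (CcdCallFnAfter takes a delay in milliseconds), so the
// Pe keeps serving messages instead of blocking in a stream synchronize.
// count_limit caps how many such polls may elapse before the run is
// declared stuck and aborted with the messages below.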
sprintf(errmsg, "cuda_check_progress polled %d times over %f s on step %d",
    /* ... */);
// ...
if ( err == cudaSuccess ) {
  // ...
} else if ( err != cudaErrorNotReady ) {
  sprintf(errmsg, "in cuda_check_remote_progress after polling %d times over %f s on step %d",
      /* ... */);
// ...
sprintf(errmsg, "cuda_check_remote_progress polled %d times over %f s on step %d",
    /* ... */);
// ...
NAMD_bug("nonzero check_local_count in cuda_check_remote_progress");
// ...
if ( err == cudaSuccess ) {
  // ...
} else if ( err != cudaErrorNotReady ) {
  sprintf(errmsg, "in cuda_check_local_progress after polling %d times over %f s on step %d",
      /* ... */);
// ...
sprintf(errmsg, "cuda_check_local_progress polled %d times over %f s on step %d",
    /* ... */);
// ...
NAMD_bug("nonzero check_remote_count in cuda_check_local_progress");
if ( rpri != rprj ) return rpri > rprj;
// ...
if ( ppi != ppj ) return ppi < ppj;
GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
// ...
GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
// ...
GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(), pr.patchID);
// ...
GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(), pr.patchID);
// ...
GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(), pr.patchID);
// ...
GBISP("opened GBIS boxes");
// ...
if ( master == this ) return 0;
// ...
GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n",
    /* ... */);
// ...
if ( master == this && kernel_launch_state > 2 ) {
// ... (pinned, mapped host buffers reallocated with a 1.2x growth factor)
    /* ... */ 1.2f, cudaHostAllocMapped);
// ...
force_ready_queue[k] = -1;
// ...
    /* ... */ 1.2f, cudaHostAllocMapped);
// ...
    /* ... */ 1.2f, cudaHostAllocMapped);
// ...
    /* ... */ 1.2f, cudaHostAllocMapped);
for ( int i=0; i<nrc; ++i ) {
  // ...
}
for ( int i=0; i<nlc; ++i ) {
  // ...
}
patch_pair_num.resize(npatches);
for ( int i=0; i<npatches; ++i ) {
  // ...
  patch_pair_num[i] = 0;
}
patch_pairs.resize(ncomputes);
for ( int i=0; i<ncomputes; ++i ) {
  // ...
  patch_pair_num[lp1]++;
  if (lp1 != lp2) patch_pair_num[lp2]++;
  patch_pair &pp = patch_pairs[i];
  // ...
}
for ( int i=0; i<ncomputes; ++i ) {
  // ...
  patch_pair &pp = patch_pairs[i];
  pp.patch1_ind = lp1;
  pp.patch2_ind = lp2;
  pp.patch1_num_pairs = patch_pair_num[lp1];
  pp.patch2_num_pairs = patch_pair_num[lp2];
}
if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
      /* ... */);
}
// ...
int len = patch_pairs.size();
// ...
if ( len != nlc + nrc )
  NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
// ...
for ( int i=0; i<len; ++i ) {
  // ...
  if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
  new_patch_pairs[dest] = patch_pairs[boi];
}
if ( irc != 0 || ilc != nrc )
  NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
// ...
patch_pairs.swap(new_patch_pairs);
for ( i=0; i<npatches; ++i ) {
  // ...
  int nfreeatoms = natoms;
  // ...
  for ( int j=0; j<natoms; ++j ) {
    if ( aExt[j].atomFixed ) --nfreeatoms;
  }
  // ...
  istart += 16 - (natoms & 15);  // pad each patch to a multiple of 16 atoms
int exclmask_start = 0;
// ...
for ( int i=0; i<ncomputes; ++i ) {
  // ...
  patch_pair &pp = patch_pairs[i];
  // ...
  pp.plist_start = bfstart;
  int size1 = (pp.patch1_size-1)/WARPSIZE+1;
  int size2 = (pp.patch2_size-1)/WARPSIZE+1;
  pp.plist_size = (size1*size2-1)/32+1;
  bfstart += pp.plist_size;
  pp.exclmask_start = exclmask_start;
  exclmask_start += size1*size2;
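// size1 and size2 count WARPSIZE-atom tiles per patch, so a compute on two
// 100-atom patches with WARPSIZE 32 has size1 = size2 = 4 and 16 tiles.
// The pairlist keeps one bit per tile packed 32 to a word
// (plist_size = ceil(16/32) = 1 here), while the exclusion masks need one
// entry per tile, so exclmask_start advances by the full size1*size2.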
float maxAtomMovement = 0.;
float maxPatchTolerance = 0.;
// ...
if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
// ...
if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
for ( int k=0; k<n; ++k ) {
  // ...
  ap[k].vdw_type = a[j].vdwType;
  // ...
  ap[k].index = aExt[j].id;
#ifdef MEM_OPT_VERSION
  // ...
#else // ! MEM_OPT_VERSION
  // ...
#endif // MEM_OPT_VERSION
}
// ...
memcpy(ap, ac, sizeof(atom)*n);
atom *ap = atoms + start;
for ( int k=0; k<n; ++k ) {
  // ...
  ap[k].position.x = a[j].position.x - center.x;
  ap[k].position.y = a[j].position.y - center.y;
  ap[k].position.z = a[j].position.z - center.z;
  ap[k].charge = charge_scaling * a[j].charge;
}
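// Positions are shifted to be relative to the patch center before the cast
// to float, which keeps the single-precision coordinates small; charges
// arrive pre-multiplied by charge_scaling so the kernel applies no Coulomb
// prefactor per interaction.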
lata.x = lattice.a().x;
lata.y = lattice.a().y;
lata.z = lattice.a().z;
latb.x = lattice.b().x;
latb.y = lattice.b().y;
latb.z = lattice.b().z;
latc.x = lattice.c().x;
latc.y = lattice.c().y;
latc.z = lattice.c().z;
for ( int ic=0; ic<ncomputes; ++ic ) {
  patch_pair &pp = patch_pairs[ic];
  atom *a1 = atoms + pp.patch1_atom_start;
  int n1 = pp.patch1_size;
  atom *a2 = atoms + pp.patch2_atom_start;
  int n2 = pp.patch2_size;
  float offx = pp.offset.x * lata.x
             + pp.offset.y * latb.x
             + pp.offset.z * latc.x;
  float offy = pp.offset.x * lata.y
             + pp.offset.y * latb.y
             + pp.offset.z * latc.y;
  float offz = pp.offset.x * lata.z
             + pp.offset.y * latb.z
             + pp.offset.z * latc.z;
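// pp.offset holds the periodic-image shift in lattice coordinates (the
// {-1,0,+1} triple decoded from the transform codes earlier); multiplying
// by the lattice vectors a, b, c converts it to a Cartesian displacement
// that is applied to every atom of patch 1.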
  int atoms_tried = 0;
  int blocks_tried = 0;
  int atoms_used = 0;
  int blocks_used = 0;
  for ( int ii=0; ii<n1; ii+=32 ) {
    for ( int jj=0; jj<n2; jj+=16 ) {
      // ...
      for ( int j=jj; j<jj+16 && j<n2; ++j ) {
        // ...
        for ( int i=ii; i<ii+32 && i<n1; ++i ) {
          float dx = offx + a1[i].position.x - a2[j].position.x;
          float dy = offy + a1[i].position.y - a2[j].position.y;
          float dz = offz + a1[i].position.z - a2[j].position.z;
          float r2 = dx*dx + dy*dy + dz*dz;
          if ( r2 < cutoff2 ) atom_used = 1;
        }
        // ...
        if ( atom_used ) { block_used = 1; ++atoms_used; }
      }
      // ...
      if ( block_used ) { ++blocks_used; }
    }
  }
  CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
      blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
      atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
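// Diagnostic only: this replays the pairlist tiling on the host (32x16
// atom blocks) and reports what fraction of blocks and atoms fall within
// the cutoff, i.e. how much of the scheduled GPU work is non-empty.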
GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
// ...
for ( int k=0; k<pr.numAtoms; ++k ) {
  // ...
  intRad0[k] = pr.intRad[2*j+0];
  intRadS[k] = pr.intRad[2*j+1];
}
// ...
for ( int k=0; k<pr.numAtoms; ++k ) {
  // ...
}
// ...
for ( int k=0; k<pr.numAtoms; ++k ) {
  // ...
}
// ...
kernel_launch_state = 1;
GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, "
    "gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(),
    /* ... */)
// ...
if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
  walltime = CkWallTimer();
}
// ...
switch ( kernel_launch_state ) {
case 1:
  GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
  // ...
  if ( simParams->GBISOn ) {
    // ...
  }
  // ...
  if ( simParams->GBISOn ) {
    // ...
    GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
    // ...
    GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
    // ...
  }
  // ...
case 2:
  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
  // ...
  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL cuda_check_local_progress\n", CkMyPe())
  // ...
  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL cuda_check_local_calc\n", CkMyPe())
  // ...
  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
  // ...
  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
  // ...
default:
  GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
  // ...
}
GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
int start = pr.localStart;
// ...
int nfree = pr.numAtoms;
// ...
for ( int k=0; k<nfree; ++k ) {
  // ...
  f_slow[j].x += af_slow[k].x;
  f_slow[j].y += af_slow[k].y;
  f_slow[j].z += af_slow[k].z;
}
GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n",
    /* ... */);
// ...
for ( int i=0; i<patches.size(); ++i ) {
  // ...
  GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
  // ...
  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
    CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
        /* ... */);
  }
  // ...
  for ( int k=0; k<pr.numAtoms; ++k ) {
    // ...
    pr.psiSum[j] += psiSumMaster[k];
  }
  GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
  // ...
  for ( int k=0; k<pr.numAtoms; ++k ) {
    // ...
    pr.dEdaSum[j] += dEdaSumMaster[k];
  }
  GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
  // ...
  GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
  // ...
  GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
}
// ...
virial_slow *= (-1./6.);
if ( /* ... */
    ( localHostedPatches.size() || master == this ) ) {
  GBISP("not finished, call again\n");
  // ...
}
GBISP("finished\n");
// ...
GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
// ...
GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
for ( int i=0; i<patch_len; ++i ) {
  // ...
}
for ( int i=0; i<order_len; ++i ) {
  // ...
}
for ( int i=0; i<patch_len; ++i ) {
  iout << "patch_last " << i << " " << plast[i] << "\n";
}
// ...
nexcluded += ((int *)virials)[16*i+12];
Tensor virial_slow_tensor;
// ...
float upload_ms, remote_calc_ms;
float local_calc_ms, total_ms;
// ...
cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
// ...
CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
    CkMyPe(), upload_ms, remote_calc_ms,
    local_calc_ms, total_ms);
// ...
CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
    /* ... */);