version 1.2 | version 1.3 |
---|
| |
// | // |
// Class constructor | // Class constructor |
// | // |
CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID, bool doStreaming) : Compute(c), | CudaComputeNonbonded::CudaComputeNonbonded(ComputeID c, int deviceID, |
deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, doStreaming), | CudaNonbondedTables& cudaNonbondedTables, bool doStreaming) : |
| Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming), |
tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) { | tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) { |
| |
cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| |
patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this); | patches[i].dHdrPrefixBox = patches[i].patch->registerDHdrPrefixPickup(this); |
} | } |
// Store Pe where this patch was registered | // Store Pe where this patch was registered |
| #if 1 |
| if (patches[i].pe != CkMyPe()) { |
| NAMD_bug("CudaComputeNonbonded::assignPatch, patch assigned to incorrect Pe"); |
| } |
| #else |
patches[i].pe = CkMyPe(); | patches[i].pe = CkMyPe(); |
| #endif |
// | // |
patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) ); | patches[i].isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) ); |
patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() ); | patches[i].isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() ); |
| |
std::sort(patches.begin(), patches.end()); | std::sort(patches.begin(), patches.end()); |
std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end()); | std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end()); |
patches.erase(last, patches.end()); | patches.erase(last, patches.end()); |
// | // Set number of patches |
std::map<PatchID, int> pidMap; | |
// Set number of patches and register boxes | |
setNumPatches(patches.size()); | setNumPatches(patches.size()); |
masterPe = CkMyPe(); | masterPe = CkMyPe(); |
computeMgr = computeMgrIn; | computeMgr = computeMgrIn; |
// Start patch counter | // Start patch counter |
patchesCounter = getNumPatches(); | patchesCounter = getNumPatches(); |
| // Patch ID map |
| std::map<PatchID, int> pidMap; |
| #if 1 |
| //------------------------------------------------------- |
| // Copied in from ComputeNonbondedCUDA::assignPatches() |
| //------------------------------------------------------- |
| |
| std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize()); |
| int numPesOnNodeSharingDevice = 0; |
| int masterIndex = -1; |
| for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) { |
| int pe = deviceCUDA->getPesSharingDevice(i); |
| if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice; |
| if ( CkNodeOf(pe) == CkMyNode() ) { |
| pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe; |
| } |
| } |
| |
| std::vector<int> count(patches.size(), 0); |
| std::vector<int> pcount(numPesOnNodeSharingDevice, 0); |
| std::vector<int> rankpcount(CkMyNodeSize(), 0); |
| std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0); |
| |
| PatchMap* patchMap = PatchMap::Object(); |
| |
| int unassignedpatches = patches.size(); |
| |
| for (int i=0;i < patches.size(); ++i) { |
| patches[i].pe = -1; |
| } |
| |
| // assign if home pe and build table of natural proxies |
| for (int i=0;i < patches.size(); ++i) { |
| int pid = patches[i].patchID; |
| // homePe = PE where the patch currently resides |
| int homePe = patchMap->node(pid); |
| for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) { |
| int pe = pesOnNodeSharingDevice[j]; |
| // If homePe is sharing this device, assign this patch to homePe |
| if ( pe == homePe ) { |
| patches[i].pe = pe; |
| --unassignedpatches; |
| pcount[j] += 1; |
| } |
| if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) { |
| table[i*numPesOnNodeSharingDevice + j] = 1; |
| } |
| } |
| // Assign this patch to homePe, if it resides on the same node |
| if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) { |
| patches[i].pe = homePe; |
| --unassignedpatches; |
| rankpcount[CkRankOf(homePe)] += 1; |
| } |
| } |
| // assign if only one pe has a required proxy |
| for (int i=0; i < patches.size(); ++i) { |
| int pid = patches[i].patchID; |
| if ( patches[i].pe != -1 ) continue; |
| int c = 0; |
| int lastj; |
| for (int j=0; j < numPesOnNodeSharingDevice; ++j) { |
| if ( table[i*numPesOnNodeSharingDevice + j] ) { |
| ++c; |
| lastj = j; |
| } |
| } |
| count[i] = c; |
| if ( c == 1 ) { |
| patches[i].pe = pesOnNodeSharingDevice[lastj]; |
| --unassignedpatches; |
| pcount[lastj] += 1; |
| } |
| } |
| int assignj = 0; |
| while ( unassignedpatches ) { |
| int i; |
| for (i=0;i < patches.size(); ++i) { |
| if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue; |
| int pid = patches[i].patchID; |
| // patch_record &pr = patchRecords[pid]; |
| if ( patches[i].pe != -1 ) continue; |
| patches[i].pe = pesOnNodeSharingDevice[assignj]; |
| --unassignedpatches; |
| pcount[assignj] += 1; |
| if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0; |
| break; |
| } |
| if (i < patches.size() ) continue; // start search again |
| for ( i=0;i < patches.size(); ++i ) { |
| int pid = patches[i].patchID; |
| // patch_record &pr = patchRecords[pid]; |
| if ( patches[i].pe != -1 ) continue; |
| if ( count[i] ) continue; |
| patches[i].pe = pesOnNodeSharingDevice[assignj]; |
| --unassignedpatches; |
| pcount[assignj] += 1; |
| if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0; |
| break; |
| } |
| if ( i < patches.size() ) continue; // start search again |
| if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0; |
| } |
| |
| // For each rank, list of patches |
| rankPatches.resize(CkMyNodeSize()); |
| for (int i=0; i < patches.size(); ++i) { |
| rankPatches[CkRankOf(patches[i].pe)].push_back(i); |
| pidMap[patches[i].patchID] = i; |
| } |
| |
| // for ( int i=0; i < patches.size(); ++i ) { |
| // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe); |
| // } |
| |
| /* |
| slavePes = new int[CkMyNodeSize()]; |
| slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()]; |
| numSlaves = 0; |
| for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) { |
| int pe = pesOnNodeSharingDevice[j]; |
| int rank = pe - CkNodeFirst(CkMyNode()); |
| // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n", |
| // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]); |
| if ( pe == CkMyPe() ) continue; |
| if ( ! pcount[j] && ! rankpcount[rank] ) continue; |
| rankpcount[rank] = 0; // skip in rank loop below |
| slavePes[numSlaves] = pe; |
| computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves); |
| ++numSlaves; |
| } |
| for ( int j=0; j<CkMyNodeSize(); ++j ) { |
| int pe = CkNodeFirst(CkMyNode()) + j; |
| // CkPrintf("host %d rank %d pe %d rankpcount %d\n", |
| // CkMyPe(),j,pe,rankpcount[j]); |
| if ( ! rankpcount[j] ) continue; |
| if ( pe == CkMyPe() ) continue; |
| slavePes[numSlaves] = pe; |
| computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves); |
| ++numSlaves; |
| } |
| */ |
| |
| #else |
// For each rank, list of patches | // For each rank, list of patches |
rankPatches.resize(CkMyNodeSize()); | rankPatches.resize(CkMyNodeSize()); |
// For each rank, list of home patch IDs | // For each rank, list of home patch IDs |
| |
rankPatches[CkRankOf(pe)].push_back(i); | rankPatches[CkRankOf(pe)].push_back(i); |
pidMap[pid] = i; | pidMap[pid] = i; |
} | } |
| |
| delete [] rankHomePatchIDs; |
| #endif |
// Setup computes using pidMap | // Setup computes using pidMap |
for (int i=0;i < computes.size();i++) { | for (int i=0;i < computes.size();i++) { |
computes[i].patchInd[0] = pidMap[computes[i].pid[0]]; | computes[i].patchInd[0] = pidMap[computes[i].pid[0]]; |
computes[i].patchInd[1] = pidMap[computes[i].pid[1]]; | computes[i].patchInd[1] = pidMap[computes[i].pid[1]]; |
} | } |
| |
for (int i=0;i < CkMyNodeSize();i++) { | for (int i=0;i < CkMyNodeSize();i++) { |
if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i); | if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i); |
} | } |
computeMgr->sendAssignPatchesOnPe(pes, this); | computeMgr->sendAssignPatchesOnPe(pes, this); |
delete [] rankHomePatchIDs; | |
} | } |
| |
void CudaComputeNonbonded::initialize() { | void CudaComputeNonbonded::initialize() { |
| |
SimParameters *simParams = Node::Object()->simParameters; | SimParameters *simParams = Node::Object()->simParameters; |
Lattice lattice = patches[0].patch->flags.lattice; | Lattice lattice = patches[0].patch->flags.lattice; |
| |
| float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z); |
| float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z); |
| float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z); |
| |
GBISKernel.GBISphase1(tileListKernel, atomStorageSize, | GBISKernel.GBISphase1(tileListKernel, atomStorageSize, |
lattice.a().x, lattice.b().y, lattice.c().z, | lata, latb, latc, |
simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream); | simParams->alpha_cutoff-simParams->fsMax, psiSumH, stream); |
} | } |
| |
| |
SimParameters *simParams = Node::Object()->simParameters; | SimParameters *simParams = Node::Object()->simParameters; |
Lattice lattice = patches[0].patch->flags.lattice; | Lattice lattice = patches[0].patch->flags.lattice; |
| |
| float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z); |
| float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z); |
| float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z); |
| |
GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream); | GBISKernel.updateBornRad(atomStorageSize, bornRadH, stream); |
| |
GBISKernel.GBISphase2(tileListKernel, atomStorageSize, | GBISKernel.GBISphase2(tileListKernel, atomStorageSize, |
doEnergy, doSlow, | doEnergy, doSlow, |
lattice.a().x, lattice.b().y, lattice.c().z, | lata, latb, latc, |
simParams->cutoff, simParams->nonbondedScaling, simParams->kappa, | simParams->cutoff, simParams->nonbondedScaling, simParams->kappa, |
(simParams->switchingActive ? simParams->switchingDist : -1.0), | (simParams->switchingActive ? simParams->switchingDist : -1.0), |
simParams->dielectric, simParams->solvent_dielectric, | simParams->dielectric, simParams->solvent_dielectric, |
| |
SimParameters *simParams = Node::Object()->simParameters; | SimParameters *simParams = Node::Object()->simParameters; |
Lattice lattice = patches[0].patch->flags.lattice; | Lattice lattice = patches[0].patch->flags.lattice; |
| |
| float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z); |
| float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z); |
| float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z); |
| |
if (doSlow) { | if (doSlow) { |
GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream); | GBISKernel.update_dHdrPrefix(atomStorageSize, dHdrPrefixH, stream); |
| |
GBISKernel.GBISphase3(tileListKernel, atomStorageSize, | GBISKernel.GBISphase3(tileListKernel, atomStorageSize, |
lattice.a().x, lattice.b().y, lattice.c().z, | lata, latb, latc, |
simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream); | simParams->alpha_cutoff-simParams->fsMax, d_forcesSlow, stream); |
} | } |
} | } |
| |
cudaCheck(cudaSetDevice(deviceID)); | cudaCheck(cudaSetDevice(deviceID)); |
| |
Lattice lattice = patches[0].patch->flags.lattice; | Lattice lattice = patches[0].patch->flags.lattice; |
| float3 lata = make_float3(lattice.a().x, lattice.a().y, lattice.a().z); |
| float3 latb = make_float3(lattice.b().x, lattice.b().y, lattice.b().z); |
| float3 latc = make_float3(lattice.c().x, lattice.c().y, lattice.c().z); |
| |
if (atomsChanged) { | if (atomsChanged) { |
int numTileLists = calcNumTileLists(); | int numTileLists = calcNumTileLists(); |
// Build initial tile lists and sort | // Build initial tile lists and sort |
tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize, | tileListKernel.buildTileLists(numTileLists, patches.size(), atomStorageSize, |
maxTileListLen,lattice.a().x, lattice.b().y, lattice.c().z, | maxTileListLen, lata, latb, latc, |
cudaPatches, (const float4*)atoms, plcutoff2, stream); | cudaPatches, (const float4*)atoms, plcutoff2, stream); |
// Prepare tile list for atom-based refinement | // Prepare tile list for atom-based refinement |
tileListKernel.prepareTileList(stream); | tileListKernel.prepareTileList(stream); |
| |
| |
// Calculate forces (and refine tile list if atomsChanged=true) | // Calculate forces (and refine tile list if atomsChanged=true) |
nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, atomsChanged, | nonbondedKernel.nonbondedForce(tileListKernel, atomStorageSize, atomsChanged, |
doEnergy, doVirial, doSlow, | doEnergy, doVirial, doSlow, lata, latb, latc, |
lattice.a().x, lattice.b().y, lattice.c().z, | |
(const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow, | (const float4*)atoms, cutoff2, d_forces, d_forcesSlow, h_forces, h_forcesSlow, |
stream); | stream); |
| |
| |
delete [] exclusion_bits; | delete [] exclusion_bits; |
} | } |
| |
void CudaComputeNonbonded::buildTables() { | |
buildForceAndEnergyTables(4096); | |
buildVdwCoefTable(); | |
} | |
| |
// | |
// Builds the VdW Lennard-Jones coefficient table. Swiped from ComputeNonbondedCUDA.C | |
// NOTE: should only be called once | |
// | |
void CudaComputeNonbonded::buildVdwCoefTable() { | |
cudaCheck(cudaSetDevice(deviceID)); | |
| |
const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable; | |
const int dim = ljTable->get_table_dim(); | |
| |
// round dim up to odd multiple of 16 | |
int tsize = (((dim+16+31)/32)*32)-16; | |
if ( tsize < dim ) NAMD_bug("CudaComputeNonbonded::buildVdwCoefTable bad tsize"); | |
| |
float2 *h_vdwCoefTable = new float2[tsize*tsize]; | |
float2 *row = h_vdwCoefTable; | |
for ( int i=0; i<dim; ++i, row += tsize ) { | |
for ( int j=0; j<dim; ++j ) { | |
const LJTable::TableEntry *e = ljTable->table_val(i,j); | |
row[j].x = e->A * scaling; | |
row[j].y = e->B * scaling; | |
} | |
} | |
| |
nonbondedKernel.bindVdwCoefTable(h_vdwCoefTable, tsize); | |
| |
delete [] h_vdwCoefTable; | |
| |
if ( ! CmiPhysicalNodeID(CkMyPe()) ) { | |
CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim); | |
} | |
} | |
| |
// | |
// Builds force and energy tables. Swiped from ComputeNonbondedCUDA.C | |
// | |
void CudaComputeNonbonded::buildForceAndEnergyTables(int tableSize) { | |
cudaCheck(cudaSetDevice(deviceID)); | |
| |
float4* t = new float4[tableSize]; | |
float4* et = new float4[tableSize]; // energy table | |
| |
const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta; | |
const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp; | |
// const int r2_delta_expc = 64 * (r2_delta_exp - 127); | |
const int r2_delta_expc = 64 * (r2_delta_exp - 1023); | |
| |
double* r2list = new double[tableSize]; // double to match cpu code | |
for ( int i=1; i<tableSize; ++i ) { | |
double r = ((double) tableSize) / ( (double) i + 0.5 ); | |
r2list[i] = r*r + r2_delta; | |
} | |
| |
union { double f; int32 i[2]; } byte_order_test; | |
byte_order_test.f = 1.0; // should occupy high-order bits only | |
int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 ); | |
| |
for ( int i=1; i<tableSize; ++i ) { | |
double r = ((double) tableSize) / ( (double) i + 0.5 ); | |
int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0 | |
| |
if ( r > cutoff ) { | |
t[i].x = 0.; | |
t[i].y = 0.; | |
t[i].z = 0.; | |
t[i].w = 0.; | |
et[i].x = 0.; | |
et[i].y = 0.; | |
et[i].z = 0.; | |
et[i].w = 0.; | |
continue; | |
} | |
| |
BigReal diffa = r2list[i] - r2_table[table_i]; | |
| |
// coulomb 1/r or fast force | |
// t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1 | |
{ | |
BigReal table_a = fast_table[4*table_i]; | |
BigReal table_b = fast_table[4*table_i+1]; | |
BigReal table_c = fast_table[4*table_i+2]; | |
BigReal table_d = fast_table[4*table_i+3]; | |
BigReal grad = | |
( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b; | |
t[i].x = 2. * grad; | |
BigReal ener = table_a + diffa * | |
( ( table_d * diffa + table_c ) * diffa + table_b); | |
et[i].x = ener; | |
} | |
| |
| |
// pme correction for slow force | |
// t[i].w = 0.; | |
{ | |
BigReal table_a = scor_table[4*table_i]; | |
BigReal table_b = scor_table[4*table_i+1]; | |
BigReal table_c = scor_table[4*table_i+2]; | |
BigReal table_d = scor_table[4*table_i+3]; | |
BigReal grad = | |
( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b; | |
t[i].w = 2. * grad; | |
BigReal ener = table_a + diffa * | |
( ( table_d * diffa + table_c ) * diffa + table_b); | |
et[i].w = ener; | |
} | |
| |
| |
// vdw 1/r^6 | |
// t[i].y = 6. / (r8); // -1/r * d/dr r^-6 | |
{ | |
BigReal table_a = vdwb_table[4*table_i]; | |
BigReal table_b = vdwb_table[4*table_i+1]; | |
BigReal table_c = vdwb_table[4*table_i+2]; | |
BigReal table_d = vdwb_table[4*table_i+3]; | |
BigReal grad = | |
( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b; | |
t[i].y = 2. * -1. * grad; | |
BigReal ener = table_a + diffa * | |
( ( table_d * diffa + table_c ) * diffa + table_b); | |
et[i].y = -1. * ener; | |
} | |
| |
| |
// vdw 1/r^12 | |
// t[i].z = 12e / (r8 * r4 * r2); // -1/r * d/dr r^-12 | |
{ | |
BigReal table_a = vdwa_table[4*table_i]; | |
BigReal table_b = vdwa_table[4*table_i+1]; | |
BigReal table_c = vdwa_table[4*table_i+2]; | |
BigReal table_d = vdwa_table[4*table_i+3]; | |
BigReal grad = | |
( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b; | |
t[i].z = 2. * grad; | |
BigReal ener = table_a + diffa * | |
( ( table_d * diffa + table_c ) * diffa + table_b); | |
et[i].z = ener; | |
} | |
| |
} | |
| |
t[0].x = 0.f; | |
t[0].y = 0.f; | |
t[0].z = 0.f; | |
t[0].w = 0.f; | |
et[0].x = et[1].x; | |
et[0].y = et[1].y; | |
et[0].z = et[1].z; | |
et[0].w = et[1].w; | |
| |
nonbondedKernel.bindForceAndEnergyTable(tableSize, t, et); | |
| |
if ( ! CmiPhysicalNodeID(CkMyPe()) ) { | |
CkPrintf("Info: Updated CUDA force table with %d elements.\n", tableSize); | |
} | |
| |
delete [] t; | |
delete [] et; | |
delete [] r2list; | |
} | |
| |
#endif // NAMD_CUDA | #endif // NAMD_CUDA |