#include <ComputeNonbondedCUDA.h>
Inheritance diagram for ComputeNonbondedCUDA:

Public Member Functions | |
| ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr) | |
| ~ComputeNonbondedCUDA () | |
| void | atomUpdate () |
| void | doWork () |
| void | recvYieldDevice (int pe) |
| int | finishWork () |
| void | build_exclusions () |
| void | requirePatch (int pid) |
Static Public Member Functions | |
| void | build_force_table () |
Public Attributes | |
| int | workStarted |
| ResizeArray< int > | activePatches |
| ResizeArray< int > | localActivePatches |
| ResizeArray< int > | remoteActivePatches |
| ResizeArray< patch_record > | patchRecords |
| ResizeArray< compute_record > | computeRecords |
| ResizeArray< compute_record > | localComputeRecords |
| ResizeArray< compute_record > | remoteComputeRecords |
| int | num_atom_records |
| int | num_local_atom_records |
| int | num_remote_atom_records |
| int | num_force_records |
| PatchMap * | patchMap |
| AtomMap * | atomMap |
| SubmitReduction * | reduction |
| ComputeNonbondedCUDAKernel * | kernel |
|
||||||||||||
|
Definition at line 472 of file ComputeNonbondedCUDA.C. References atomMap, atomsChanged, build_exclusions(), computeMgr, computesChanged, cudaCompute, end_local_calc, end_local_download, end_remote_calc, end_remote_download, NAMD_die(), Node::Object(), AtomMap::Object(), PatchMap::Object(), patchMap, SimParameters::pressureProfileOn, reduction, Node::simParameters, start_calc, start_upload, and workStarted. 00472 : Compute(c) { 00473 // CkPrintf("create ComputeNonbondedCUDA\n"); 00474 cudaCompute = this; 00475 computeMgr = mgr; 00476 patchMap = PatchMap::Object(); 00477 atomMap = AtomMap::Object(); 00478 reduction = 0; 00479 build_exclusions(); 00480 00481 SimParameters *params = Node::Object()->simParameters; 00482 if (params->pressureProfileOn) { 00483 NAMD_die("pressure profile not supported in CUDA"); 00484 } 00485 00486 atomsChanged = 1; 00487 computesChanged = 1; 00488 workStarted = 0; 00489 basePriority = PROXY_DATA_PRIORITY; 00490 00491 cudaEventCreate(&start_upload); 00492 cudaEventCreate(&start_calc); 00493 cudaEventCreate(&end_remote_calc); 00494 cudaEventCreate(&end_remote_download); 00495 cudaEventCreate(&end_local_calc); 00496 cudaEventCreate(&end_local_download); 00497 }
|
|
|
Definition at line 500 of file ComputeNonbondedCUDA.C. 00500 { ; }
|
|
|
Reimplemented from Compute. Definition at line 577 of file ComputeNonbondedCUDA.C. References atomsChanged. 00577 { atomsChanged = 1; }
|
|
|
Definition at line 327 of file ComputeNonbondedCUDA.C. References ResizeArray< Elem >::add(), cuda_bind_exclusions(), exclusionsByAtom, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), int32, j, Node::molecule, NAMD_bug(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< Elem >::size(), and SortableResizeArray< Elem >::sort(). Referenced by ComputeNonbondedCUDA(). 00327 {
00328 Molecule *mol = Node::Object()->molecule;
00329 int natoms = mol->numAtoms;
00330 exclusionsByAtom = new ushort2[natoms];
00331
00332 // create unique sorted lists
00333
00334 ObjectArena<int32> listArena;
00335 ResizeArray<int32*> unique_lists;
00336 SortableResizeArray<int32> curList;
00337 int totalbits = 0;
00338 for ( int i=0; i<natoms; ++i ) {
00339 const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00340 curList.resize(0);
00341 int n = mol_list[0] + 1;
00342 curList.add(0); // always excluded from self
00343 for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00344 curList.sort();
00345
00346 int j;
00347 for ( j=0; j<unique_lists.size(); ++j ) {
00348 if ( n != unique_lists[j][0] ) continue; // no match
00349 int k;
00350 for ( k=0; k<n; ++k ) {
00351 if ( unique_lists[j][k+3] != curList[k] ) break;
00352 }
00353 if ( k == n ) break; // found match
00354 }
00355 if ( j == unique_lists.size() ) { // no match
00356 int32 *list = listArena.getNewArray(n+3);
00357 list[0] = n;
00358 int maxdiff = 0;
00359 maxdiff = -1 * curList[0];
00360 if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00361 list[1] = maxdiff;
00362 list[2] = totalbits + maxdiff;
00363 totalbits += 2*maxdiff + 1;
00364 for ( int k=0; k<n; ++k ) {
00365 list[k+3] = curList[k];
00366 }
00367 unique_lists.add(list);
00368 }
00369 exclusionsByAtom[i].x = unique_lists[j][1]; // maxdiff
00370 exclusionsByAtom[i].y = unique_lists[j][2]; // start
00371 }
00372
00373 if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00374
00375 if ( ! CkMyPe() ) {
00376 CkPrintf("Info: Found %d unique exclusion lists needing %d bytes\n",
00377 unique_lists.size(), totalbits / 8);
00378 }
00379
00380 #define SET_EXCL(EXCL,BASE,DIFF) \
00381 (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00382
00383 unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00384 memset(exclusion_bits, 0, totalbits/8);
00385
00386 int base = 0;
00387 for ( int i=0; i<unique_lists.size(); ++i ) {
00388 base += unique_lists[i][1];
00389 if ( base != unique_lists[i][2] ) {
00390 NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00391 }
00392 int n = unique_lists[i][0];
00393 for ( int j=0; j<n; ++j ) {
00394 SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00395 }
00396 base += unique_lists[i][1] + 1;
00397 }
00398
00399 cuda_bind_exclusions(exclusion_bits, totalbits/32);
00400
00401 delete [] exclusion_bits;
00402 }
|
|
|
Definition at line 211 of file ComputeNonbondedCUDA.C. References BigReal, cuda_bind_force_table(), diffa, FORCE_TABLE_SIZE, int32, and table_i. Referenced by build_cuda_force_table(). 00211 { // static
00212
00213 float4 t[FORCE_TABLE_SIZE];
00214
00215 const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00216 const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00217 // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00218 const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00219
00220 double r2list[FORCE_TABLE_SIZE]; // double to match cpu code
00221 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00222 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00223 r2list[i] = r*r + r2_delta;
00224 }
00225
00226 union { double f; int32 i[2]; } byte_order_test;
00227 byte_order_test.f = 1.0; // should occupy high-order bits only
00228 int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00229
00230 for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00231 double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00232 int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
00233
00234 if ( r > cutoff ) {
00235 t[i].x = 0.;
00236 t[i].y = 0.;
00237 t[i].z = 0.;
00238 t[i].w = 0.;
00239 continue;
00240 }
00241
00242 BigReal diffa = r2list[i] - r2_table[table_i];
00243
00244 // coulomb 1/r or fast force
00245 // t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
00246 {
00247 // BigReal table_a = fast_table[4*table_i];
00248 BigReal table_b = fast_table[4*table_i+1];
00249 BigReal table_c = fast_table[4*table_i+2];
00250 BigReal table_d = fast_table[4*table_i+3];
00251 BigReal grad =
00252 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00253 t[i].x = 2. * grad;
00254 }
00255
00256
00257 // pme correction for slow force
00258 // t[i].w = 0.;
00259 {
00260 // BigReal table_a = scor_table[4*table_i];
00261 BigReal table_b = scor_table[4*table_i+1];
00262 BigReal table_c = scor_table[4*table_i+2];
00263 BigReal table_d = scor_table[4*table_i+3];
00264 BigReal grad =
00265 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00266 t[i].w = 2. * grad;
00267 }
00268
00269
00270 // vdw 1/r^6
00271 // t[i].y = 6. / (r8); // -1/r * d/dr r^-6
00272 {
00273 // BigReal table_a = vdwb_table[4*table_i];
00274 BigReal table_b = vdwb_table[4*table_i+1];
00275 BigReal table_c = vdwb_table[4*table_i+2];
00276 BigReal table_d = vdwb_table[4*table_i+3];
00277 BigReal grad =
00278 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00279 t[i].y = 2. * -1. * grad;
00280 }
00281
00282
00283 // vdw 1/r^12
00284 // t[i].z = 12e / (r8 * r4 * r2); // -1/r * d/dr r^-12
00285 {
00286 // BigReal table_a = vdwa_table[4*table_i];
00287 BigReal table_b = vdwa_table[4*table_i+1];
00288 BigReal table_c = vdwa_table[4*table_i+2];
00289 BigReal table_d = vdwa_table[4*table_i+3];
00290 BigReal grad =
00291 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00292 t[i].z = 2. * grad;
00293 }
00294
00295 // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00296 // t[i].x, t[i].y, t[i].z, t[i].w);
00297
00298 /*
00299 double r2 = r * r;
00300 double r4 = r2 * r2;
00301 double r8 = r4 * r4;
00302
00303 t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
00304 t[i].y = 6. / (r8); // -1/r * d/dr r^-6
00305 t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
00306 t[i].w = 0.;
00307 */
00308 }
00309
00310 t[0].x = 0.f;
00311 t[0].y = 0.f;
00312 t[0].z = 0.f;
00313 t[0].w = 0.f;
00314
00315 cuda_bind_force_table(t);
00316
00317 if ( ! CkMyPe() ) {
00318 CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00319 }
00320 }
|
|
|
Reimplemented from Compute. Definition at line 581 of file ComputeNonbondedCUDA.C. References activePatches, atom_params, CompAtomExt::atomFixed, atoms, atomsChanged, Molecule::atomvdwtype(), ResizeArray< Elem >::begin(), ccd_index_local_download, ccd_index_remote_download, PatchMap::center(), CompAtom::charge, COLOUMB, computeRecords, computesChanged, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_check_remote_progress(), CUDA_CONDITION, cuda_load_forces(), cuda_nonbonded_forces(), cuda_stream_finished(), cudaCompute, end_local_calc, end_local_download, end_remote_calc, end_remote_download, exclusionsByAtom, finishWork(), Patch::flags, force_lists, forces, Parameters::get_vdw_params(), Patch::getCompAtomExtInfo(), Patch::getNumAtoms(), CompAtomExt::id, j, kernel_launch_state, kernel_time, Flags::lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, MAX_ATOMS_PER_PATCH, Node::molecule, NAMD_die(), num_atom_records, num_atom_records_allocated, num_force_records, num_local_atom_records, num_remote_atom_records, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, Box< Owner, Data >::open(), SimParameters::outputCudaTiming, ComputeNonbondedCUDA::patch_record::p, Node::parameters, patch_pairs, ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::compute_record::pid, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, Real, recvYieldDevice(), remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, stream, Lattice::unscale(), SimParameters::useFlexibleCell, workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z. 00581 {
00582
00583 // CkPrintf("Pe %d doWork %d\n", CkMyPe(), workStarted);
00584
00585 if ( workStarted ) {
00586 if ( finishWork() ) { // finished
00587 workStarted = 0;
00588 basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
00589 } else { // need to call again
00590 workStarted = 2;
00591 basePriority = COMPUTE_HOME_PRIORITY; // lower for local
00592 if ( kernel_launch_state > 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00593 }
00594 return;
00595 }
00596 workStarted = 1;
00597 basePriority = COMPUTE_PROXY_PRIORITY;
00598
00599 Molecule *mol = Node::Object()->molecule;
00600 Parameters *params = Node::Object()->parameters;
00601 SimParameters *simParams = Node::Object()->simParameters;
00602
00603 if ( simParams->useFlexibleCell ) {
00604 NAMD_die("flexible-cell constant pressure not supported in CUDA");
00605 }
00606
00607 for ( int i=0; i<activePatches.size(); ++i ) {
00608 patch_record &pr = patchRecords[activePatches[i]];
00609 pr.x = pr.positionBox->open();
00610 pr.xExt = pr.p->getCompAtomExtInfo();
00611 }
00612
00613 if ( atomsChanged || computesChanged ) {
00614 int npatches = activePatches.size();
00615
00616 if ( computesChanged ) {
00617 computesChanged = 0;
00618
00619 int *ap = activePatches.begin();
00620 for ( int i=0; i<localActivePatches.size(); ++i ) {
00621 *(ap++) = localActivePatches[i];
00622 }
00623 for ( int i=0; i<remoteActivePatches.size(); ++i ) {
00624 *(ap++) = remoteActivePatches[i];
00625 }
00626
00627 int nlc = localComputeRecords.size();
00628 int nrc = remoteComputeRecords.size();
00629 computeRecords.resize(nlc+nrc);
00630 compute_record *cr = computeRecords.begin();
00631 for ( int i=0; i<nlc; ++i ) {
00632 *(cr++) = localComputeRecords[i];
00633 }
00634 for ( int i=0; i<nrc; ++i ) {
00635 *(cr++) = remoteComputeRecords[i];
00636 }
00637
00638 force_lists.resize(npatches);
00639 for ( int i=0; i<npatches; ++i ) {
00640 patchRecords[activePatches[i]].localIndex = i;
00641 force_lists[i].force_list_size = 0;
00642 }
00643
00644 int ncomputes = computeRecords.size();
00645 patch_pairs.resize(ncomputes);
00646 for ( int i=0; i<ncomputes; ++i ) {
00647 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00648 int lp1 = patchRecords[cr.pid[0]].localIndex;
00649 int lp2 = patchRecords[cr.pid[1]].localIndex;
00650 force_lists[lp1].force_list_size++;
00651 patch_pair &pp = patch_pairs[i];
00652 pp.offset.x = cr.offset.x;
00653 pp.offset.y = cr.offset.y;
00654 pp.offset.z = cr.offset.z;
00655 }
00656
00657 if ( simParams->outputCudaTiming ) {
00658 CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
00659 CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
00660 localComputeRecords.size(), remoteComputeRecords.size());
00661 }
00662 }
00663
00664 int istart = 0;
00665 int flstart = 0;
00666 int max_atoms_per_patch = 0;
00667 int i;
00668 for ( i=0; i<npatches; ++i ) {
00669 if ( i == localActivePatches.size() ) {
00670 num_local_atom_records = istart;
00671 }
00672 force_lists[i].force_list_start = flstart;
00673 force_lists[i].force_output_start = istart;
00674 patch_record &pr = patchRecords[activePatches[i]];
00675 pr.localStart = istart;
00676 int natoms = pr.p->getNumAtoms();
00677 int nfreeatoms = natoms;
00678 if ( fixedAtomsOn ) {
00679 const CompAtomExt *aExt = pr.xExt;
00680 for ( int j=0; j<natoms; ++j ) {
00681 if ( aExt[j].atomFixed ) --nfreeatoms;
00682 }
00683 }
00684 if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
00685 pr.numAtoms = natoms;
00686 pr.numFreeAtoms = nfreeatoms;
00687 if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
00688 if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
00689 force_lists[i].patch_size = nfreeatoms;
00690 flstart += nfreeatoms * force_lists[i].force_list_size;
00691 istart += natoms; // already rounded up
00692 force_lists[i].force_list_size = 0; // rebuild below
00693 }
00694 if ( i == localActivePatches.size() ) {
00695 num_local_atom_records = istart;
00696 }
00697 num_force_records = flstart;
00698 num_atom_records = istart;
00699 num_remote_atom_records = num_atom_records - num_local_atom_records;
00700 if ( num_atom_records > num_atom_records_allocated ) {
00701 if ( num_atom_records_allocated ) {
00702 cudaFreeHost(atom_params);
00703 cudaFreeHost(atoms);
00704 cudaFreeHost(forces);
00705 cudaFreeHost(slow_forces);
00706 }
00707 num_atom_records_allocated = 1.1 * num_atom_records + 1;
00708 cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
00709 cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
00710 cudaMallocHost((void**)&forces,sizeof(float4)*num_atom_records_allocated);
00711 cudaMallocHost((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated);
00712 }
00713
00714 #if 0
00715 if ( max_atoms_per_patch > MAX_ATOMS_PER_PATCH ) {
00716 char errstr[1024];
00717 sprintf(errstr,"Found patch with %d atoms; limit is %d for CUDA."
00718 " Try enabling twoAwayX, twoAwayY, and/or twoAwayZ to fix this.",
00719 max_atoms_per_patch, MAX_ATOMS_PER_PATCH);
00720 NAMD_die(errstr);
00721 }
00722 #endif
00723
00724 int ncomputes = computeRecords.size();
00725 for ( int i=0; i<ncomputes; ++i ) {
00726 ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00727 int p1 = cr.pid[0];
00728 int p2 = cr.pid[1];
00729 int lp1 = patchRecords[p1].localIndex;
00730 int lp2 = patchRecords[p2].localIndex;
00731 patch_pair &pp = patch_pairs[i];
00732 pp.patch1_atom_start = patchRecords[p1].localStart;
00733 pp.patch1_force_start = force_lists[lp1].force_list_start +
00734 force_lists[lp1].patch_size * force_lists[lp1].force_list_size;
00735 pp.patch1_size = patchRecords[p1].numAtoms;
00736 pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
00737 // istart += pp.patch1_size;
00738 // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00739 pp.patch2_atom_start = patchRecords[p2].localStart;
00740 // pp.patch2_force_start = force_lists[lp2].force_list_start +
00741 // force_lists[lp2].patch_size * force_lists[lp2].force_list_size;
00742 pp.patch2_size = patchRecords[p2].numAtoms;
00743 // pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
00744 // this must happen at end to get self computes right
00745 force_lists[lp1].force_list_size++;
00746 // if ( lp1 != lp2 || cr.t[0] != cr.t[1] ) {
00747 // force_lists[lp2].force_list_size++;
00748 // }
00749 // istart += pp.patch2_size;
00750 // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00751 }
00752
00753 #if 0
00754 CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
00755 patch_pairs.size(), force_lists.size(),
00756 num_atom_records, num_force_records,
00757 max_atoms_per_patch);
00758 #endif
00759
00760 int totalmem = patch_pairs.size() * sizeof(patch_pair) +
00761 force_lists.size() * sizeof(force_list) +
00762 num_force_records * sizeof(float4) +
00763 num_atom_records * sizeof(atom) +
00764 num_atom_records * sizeof(atom_param) +
00765 num_atom_records * sizeof(float4);
00766 int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
00767 /*
00768 CkPrintf("Pe %d allocating %d MB of GPU memory, will copy %d kB per step\n",
00769 CkMyPe(), totalmem >> 20, totalcopy >> 10);
00770 */
00771
00772 cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
00773 force_lists.begin(), force_lists.size(),
00774 num_atom_records, num_force_records,
00775 max_atoms_per_patch);
00776
00777 } // atomsChanged || computesChanged
00778
00779 double charge_scaling = sqrt(COLOUMB * scaling * dielectric_1);
00780
00781
00782 for ( int i=0; i<activePatches.size(); ++i ) {
00783 patch_record &pr = patchRecords[activePatches[i]];
00784
00785 int start = pr.localStart;
00786 int n = pr.numAtoms;
00787 const CompAtom *a = pr.x;
00788 const CompAtomExt *aExt = pr.xExt;
00789 if ( atomsChanged ) {
00790 atom_param *ap = atom_params + start;
00791 int k = 0;
00792 for ( int j=0; j<n; ++j ) {
00793 // put free atoms first
00794 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00795 ap[k].index = aExt[j].id;
00796 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00797 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00798 int vdwtype = mol->atomvdwtype(aExt[j].id);
00799 Real sig, eps, sig14, eps14;
00800 params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00801 ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00802 ap[k].half_sigma = 0.5 * sig;
00803 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00804 ++k;
00805 }
00806 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00807 // put fixed atoms at end
00808 if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00809 ap[k].index = aExt[j].id;
00810 ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00811 ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00812 int vdwtype = mol->atomvdwtype(aExt[j].id);
00813 Real sig, eps, sig14, eps14;
00814 params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00815 ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00816 ap[k].half_sigma = 0.5 * sig;
00817 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00818 ++k;
00819 }
00820 }
00821 {
00822 Vector center =
00823 pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
00824 atom *ap = atoms + start;
00825 int k = 0;
00826 for ( int j=0; j<n; ++j ) {
00827 // put free atoms first
00828 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00829 ap[k].position.x = a[j].position.x - center.x;
00830 ap[k].position.y = a[j].position.y - center.y;
00831 ap[k].position.z = a[j].position.z - center.z;
00832 ap[k].charge = charge_scaling * a[j].charge;
00833 ++k;
00834 }
00835 if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00836 // put fixed atoms at end
00837 if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00838 ap[k].position.x = a[j].position.x - center.x;
00839 ap[k].position.y = a[j].position.y - center.y;
00840 ap[k].position.z = a[j].position.z - center.z;
00841 ap[k].charge = charge_scaling * a[j].charge;
00842 ++k;
00843 }
00844 }
00845 }
00846
00847 kernel_time = -1. * CkWallTimer();
00848 #if 0
00849 kernel_launch_state = 3;
00850
00851 cudaEventRecord(start_upload, stream);
00852
00853 if ( atomsChanged ) {
00854 cuda_bind_atom_params(atom_params);
00855 }
00856
00857 XXX - THIS PATH NOT UPDATED FOR SLOW FORCES - DO NOT COMPILE
00858 cuda_bind_atoms(atoms);
00859 cudaEventRecord(start_calc, stream);
00860 cuda_nonbonded_forces(cutoff2,
00861 localComputeRecords.size(),remoteComputeRecords.size(),
00862 localActivePatches.size(),remoteActivePatches.size());
00863 cudaEventRecord(end_remote_calc, stream);
00864 cuda_load_forces(forces,num_local_atom_records,num_remote_atom_records);
00865 cudaEventRecord(end_remote_download, stream);
00866 cuda_nonbonded_forces(cutoff2,
00867 0,localComputeRecords.size(),
00868 0,localActivePatches.size());
00869 cudaEventRecord(end_local_calc, stream);
00870 cuda_load_forces(forces,0,num_local_atom_records);
00871 cudaEventRecord(end_local_download, stream);
00872
00873 if ( cuda_stream_finished() ) {
00874 CkPrintf("CUDA not overlapping with CPU work.\n");
00875 }
00876
00877 // finishWork();
00878
00879 ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00880 #else
00881
00882 kernel_launch_state = 1;
00883 if ( gpu_is_mine ) recvYieldDevice(-1);
00884
00885 #endif
00886 }
|
|
|
Definition at line 985 of file ComputeNonbondedCUDA.C. References activePatches, CompAtomExt::atomFixed, atomsChanged, Box< Owner, Data >::close(), cuda_errcheck(), cuda_timer_count, cuda_timer_total, Flags::doFullElectrostatics, end_local_calc, end_local_download, end_remote_calc, end_remote_download, Results::f, ComputeNonbondedCUDA::patch_record::f, Force, ComputeNonbondedCUDA::patch_record::forceBox, forces, Molecule::get_full_exclusions_for_atom(), SubmitReduction::item(), j, localActivePatches, ComputeNonbondedCUDA::patch_record::localStart, Node::molecule, ComputeNonbondedCUDA::patch_record::numAtoms, Node::Object(), Box< Owner, Data >::open(), SimParameters::outputCudaTiming, patchRecords, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, reduction, remoteActivePatches, Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, SubmitReduction::submit(), workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z. Referenced by doWork(). 00985 {
00986
00987 cuda_errcheck("cuda stream completed");
00988
00989 Molecule *mol = Node::Object()->molecule;
00990 SimParameters *simParams = Node::Object()->simParameters;
00991
00992 Flags &flags = patchRecords[activePatches[0]].p->flags;
00993 int doSlow = flags.doFullElectrostatics;
00994
00995 ResizeArray<int> &patches( workStarted == 1 ?
00996 remoteActivePatches : localActivePatches );
00997
00998 for ( int i=0; i<patches.size(); ++i ) {
00999 patch_record &pr = patchRecords[patches[i]];
01000 pr.r = pr.forceBox->open();
01001 pr.f = pr.r->f[Results::nbond];
01002 }
01003
01004 // long long int wcount = 0;
01005 double virial = 0.;
01006 double virial_slow = 0.;
01007
01008 for ( int i=0; i<patches.size(); ++i ) {
01009 patch_record &pr = patchRecords[patches[i]];
01010 int start = pr.localStart;
01011 int n = pr.numAtoms;
01012 Force *f = pr.f;
01013 Force *f_slow = pr.r->f[Results::slow];
01014 const CompAtom *a = pr.x;
01015 const CompAtomExt *aExt = pr.xExt;
01016 float4 *af = forces + start;
01017 float4 *af_slow = slow_forces + start;
01018 int k = 0;
01019 for ( int j=0; j<n; ++j ) {
01020 // only free atoms return forces
01021 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01022 f[j].x += af[k].x;
01023 f[j].y += af[k].y;
01024 f[j].z += af[k].z;
01025 // wcount += af[k].w;
01026 virial += af[k].w;
01027 if ( doSlow ) {
01028 f_slow[j].x += af_slow[k].x;
01029 f_slow[j].y += af_slow[k].y;
01030 f_slow[j].z += af_slow[k].z;
01031 virial_slow += af_slow[k].w;
01032 }
01033 ++k;
01034 }
01035
01036 #if 0
01037 // check exclusions reported as w
01038 if ( CkNumPes() == 1 ) {
01039 const CompAtomExt *aExt = pr.xExt;
01040 int k = 0;
01041 for ( int j=0; j<n; ++j ) {
01042 // only free atoms return forces
01043 if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01044 int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01045 if ( af[k].w != excl_expected ) {
01046 CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01047 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01048 }
01049 ++k;
01050 }
01051 }
01052 #endif
01053
01054
01055 #if 0
01056 if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01057 CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01058 j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01059 af[j].w);
01060 }
01061 #endif
01062 pr.positionBox->close(&(pr.x));
01063 pr.forceBox->close(&(pr.r));
01064 }
01065
01066 virial *= (-1./6.);
01067 reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01068 reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01069 reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01070 if ( doSlow ) {
01071 virial_slow *= (-1./6.);
01072 reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01073 reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01074 reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01075 }
01076
01077 if ( workStarted == 1 ) return 0; // not finished, call again
01078
01079 atomsChanged = 0;
01080 reduction->submit();
01081
01082 // int natoms = mol->numAtoms;
01083 // double wpa = wcount; wpa /= natoms;
01084
01085 // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
01086 // kernel_time * 1.e3, time * 1.e3, wpa);
01087
01088 float upload_ms, remote_calc_ms, remote_download_ms;
01089 float local_calc_ms, local_download_ms, total_ms;
01090 cuda_errcheck("before event timers");
01091 cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
01092 cuda_errcheck("event timer 1");
01093 cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_calc);
01094 cuda_errcheck("event timer 2");
01095 cudaEventElapsedTime(&remote_download_ms, end_remote_calc, end_remote_download);
01096 cuda_errcheck("event timer 3");
01097 cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_calc);
01098 cuda_errcheck("event timer 4");
01099 cudaEventElapsedTime(&local_download_ms, end_local_calc, end_local_download);
01100 cuda_errcheck("event timer 5");
01101 cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
01102 cuda_errcheck("event timer 6");
01103 cuda_errcheck("event timers");
01104
01105 cuda_timer_total += kernel_time;
01106 if ( simParams->outputCudaTiming &&
01107 cuda_timer_count % simParams->outputCudaTiming == 0 ) {
01108 cuda_timer_total /= (cuda_timer_count + 1);
01109 CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f %f %f\n",
01110 CkMyPe(), upload_ms, remote_calc_ms, remote_download_ms,
01111 local_calc_ms, local_download_ms, total_ms);
01112 CkPrintf("CUDA TIMING: %f ms/step on node %d\n",
01113 cuda_timer_total * 1.e3, CkMyPe());
01114 cuda_timer_count = 0;
01115 cuda_timer_total = 0;
01116 }
01117 cuda_timer_count++;
01118
01119 return 1; // finished and ready for next step
01120 }
|
|
|
Definition at line 916 of file ComputeNonbondedCUDA.C. References Lattice::a(), activePatches, atom_params, atoms, Lattice::b(), Lattice::c(), ccd_index_local_calc, ccd_index_local_download, ccd_index_remote_calc, ccd_index_remote_download, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), CUDA_CONDITION, cuda_load_forces(), cuda_nonbonded_forces(), Flags::doFullElectrostatics, end_local_calc, end_local_download, end_remote_calc, end_remote_download, forces, gpu_is_mine, Flags::lattice, localActivePatches, localComputeRecords, num_local_atom_records, num_remote_atom_records, patchRecords, remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, stream, workStarted, Vector::x, Vector::y, and Vector::z. Referenced by doWork(), and ComputeMgr::recvYieldDevice(). 00916 {
00917
00918 // CkPrintf("Pe %d entering state %d via yield from pe %d\n",
00919 // CkMyPe(), kernel_launch_state, pe);
00920
00921 Flags &flags = patchRecords[activePatches[0]].p->flags;
00922 int doSlow = flags.doFullElectrostatics;
00923
00924 Lattice &lattice = flags.lattice;
00925 float3 lata, latb, latc;
00926 lata.x = lattice.a().x;
00927 lata.y = lattice.a().y;
00928 lata.z = lattice.a().z;
00929 latb.x = lattice.b().x;
00930 latb.y = lattice.b().y;
00931 latb.z = lattice.b().z;
00932 latc.x = lattice.c().x;
00933 latc.y = lattice.c().y;
00934 latc.z = lattice.c().z;
00935
00936 switch ( kernel_launch_state ) {
00937 case 1:
00938 ++kernel_launch_state;
00939 gpu_is_mine = 0;
00940 cudaEventRecord(start_upload, stream);
00941
00942 if ( atomsChanged ) {
00943 cuda_bind_atom_params(atom_params);
00944 }
00945
00946 cuda_bind_atoms(atoms);
00947 cudaEventRecord(start_calc, stream);
00948 cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00949 localComputeRecords.size(),remoteComputeRecords.size(),
00950 localActivePatches.size(),remoteActivePatches.size(), doSlow);
00951 cudaEventRecord(end_remote_calc, stream);
00952 cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00953 num_local_atom_records,num_remote_atom_records);
00954 cudaEventRecord(end_remote_download, stream);
00955 ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00956 if ( shared_gpu ) {
00957 ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_calc,this);
00958 break;
00959 }
00960
00961 case 2:
00962 ++kernel_launch_state;
00963 gpu_is_mine = 0;
00964 cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00965 0,localComputeRecords.size(),
00966 0,localActivePatches.size(), doSlow);
00967 cudaEventRecord(end_local_calc, stream);
00968 cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00969 0,num_local_atom_records);
00970 cudaEventRecord(end_local_download, stream);
00971 if ( workStarted == 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00972 if ( shared_gpu ) {
00973 ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_calc,this);
00974 break;
00975 }
00976
00977 default:
00978 gpu_is_mine = 1;
00979 break;
00980 }
00981
00982 }
|
|
|
|
Definition at line 53 of file ComputeNonbondedCUDA.h. Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch(). |
|
|
Definition at line 64 of file ComputeNonbondedCUDA.h. Referenced by ComputeNonbondedCUDA(). |
|
|
Definition at line 55 of file ComputeNonbondedCUDA.h. Referenced by doWork(). |
|
|
Definition at line 67 of file ComputeNonbondedCUDA.h. |
|
|
Definition at line 53 of file ComputeNonbondedCUDA.h. Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch(). |
|
|
Definition at line 56 of file ComputeNonbondedCUDA.h. Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self(). |
|
|
Definition at line 58 of file ComputeNonbondedCUDA.h. Referenced by doWork(). |
|
|
Definition at line 61 of file ComputeNonbondedCUDA.h. Referenced by doWork(). |
|
|
Definition at line 59 of file ComputeNonbondedCUDA.h. Referenced by doWork(), and recvYieldDevice(). |
|
|
Definition at line 60 of file ComputeNonbondedCUDA.h. Referenced by doWork(), and recvYieldDevice(). |
|
|
Definition at line 63 of file ComputeNonbondedCUDA.h. Referenced by ComputeNonbondedCUDA(), doWork(), register_cuda_compute_pair(), register_cuda_compute_self(), and requirePatch(). |
|
|
Definition at line 54 of file ComputeNonbondedCUDA.h. Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch(). |
|
|
Definition at line 65 of file ComputeNonbondedCUDA.h. Referenced by ComputeNonbondedCUDA(), finishWork(), and requirePatch(). |
|
|
Definition at line 53 of file ComputeNonbondedCUDA.h. Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch(). |
|
|
Definition at line 56 of file ComputeNonbondedCUDA.h. Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self(). |
|
|
Definition at line 45 of file ComputeNonbondedCUDA.h. Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice(). |
1.3.9.1