Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | File List | Class Members | File Members

ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA:

Compute ComputeNonbondedUtil List of all members.

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr)
 ~ComputeNonbondedCUDA ()
void atomUpdate ()
void doWork ()
void recvYieldDevice (int pe)
int finishWork ()
void build_exclusions ()
void requirePatch (int pid)

Static Public Member Functions

void build_force_table ()

Public Attributes

int workStarted
ResizeArray< int > activePatches
ResizeArray< int > localActivePatches
ResizeArray< int > remoteActivePatches
ResizeArray< patch_recordpatchRecords
ResizeArray< compute_recordcomputeRecords
ResizeArray< compute_recordlocalComputeRecords
ResizeArray< compute_recordremoteComputeRecords
int num_atom_records
int num_local_atom_records
int num_remote_atom_records
int num_force_records
PatchMappatchMap
AtomMapatomMap
SubmitReductionreduction
ComputeNonbondedCUDAKernel * kernel

Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ComputeID  c,
ComputeMgr mgr
 

Definition at line 472 of file ComputeNonbondedCUDA.C.

References atomMap, atomsChanged, build_exclusions(), computeMgr, computesChanged, cudaCompute, end_local_calc, end_local_download, end_remote_calc, end_remote_download, NAMD_die(), Node::Object(), AtomMap::Object(), PatchMap::Object(), patchMap, SimParameters::pressureProfileOn, reduction, Node::simParameters, start_calc, start_upload, and workStarted.

00472                                                                        : Compute(c) {
00473   // CkPrintf("create ComputeNonbondedCUDA\n");
00474   cudaCompute = this;
00475   computeMgr = mgr;
00476   patchMap = PatchMap::Object();
00477   atomMap = AtomMap::Object();
00478   reduction = 0;
00479   build_exclusions();
00480 
00481   SimParameters *params = Node::Object()->simParameters;
00482   if (params->pressureProfileOn) {
00483     NAMD_die("pressure profile not supported in CUDA");
00484   }
00485 
00486   atomsChanged = 1;
00487   computesChanged = 1;
00488   workStarted = 0;
00489   basePriority = PROXY_DATA_PRIORITY;
00490 
00491   cudaEventCreate(&start_upload);
00492   cudaEventCreate(&start_calc);
00493   cudaEventCreate(&end_remote_calc);
00494   cudaEventCreate(&end_remote_download);
00495   cudaEventCreate(&end_local_calc);
00496   cudaEventCreate(&end_local_download);
00497 }

ComputeNonbondedCUDA::~ComputeNonbondedCUDA  ) 
 

Definition at line 500 of file ComputeNonbondedCUDA.C.

00500 { ; }


Member Function Documentation

void ComputeNonbondedCUDA::atomUpdate  )  [virtual]
 

Reimplemented from Compute.

Definition at line 577 of file ComputeNonbondedCUDA.C.

References atomsChanged.

00577 { atomsChanged = 1; }

void ComputeNonbondedCUDA::build_exclusions  ) 
 

Definition at line 327 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), cuda_bind_exclusions(), exclusionsByAtom, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), int32, j, Node::molecule, NAMD_bug(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< Elem >::size(), and SortableResizeArray< Elem >::sort().

Referenced by ComputeNonbondedCUDA().

00327                                             {
00328   Molecule *mol = Node::Object()->molecule;
00329   int natoms = mol->numAtoms; 
00330   exclusionsByAtom = new ushort2[natoms];
00331 
00332   // create unique sorted lists
00333 
00334   ObjectArena<int32> listArena;
00335   ResizeArray<int32*> unique_lists;
00336   SortableResizeArray<int32> curList;
00337   int totalbits = 0;
00338   for ( int i=0; i<natoms; ++i ) {
00339     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00340     curList.resize(0);
00341     int n = mol_list[0] + 1;
00342     curList.add(0);  // always excluded from self
00343     for ( int j=1; j<n; ++j ) { curList.add(mol_list[j] - i); }
00344     curList.sort();
00345 
00346     int j;
00347     for ( j=0; j<unique_lists.size(); ++j ) {
00348       if ( n != unique_lists[j][0] ) continue;  // no match
00349       int k;
00350       for ( k=0; k<n; ++k ) {
00351         if ( unique_lists[j][k+3] != curList[k] ) break;
00352       }
00353       if ( k == n ) break;  // found match
00354     }
00355     if ( j == unique_lists.size() ) {  // no match
00356       int32 *list = listArena.getNewArray(n+3);
00357       list[0] = n;
00358       int maxdiff = 0;
00359       maxdiff = -1 * curList[0];
00360       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00361       list[1] = maxdiff;
00362       list[2] = totalbits + maxdiff;
00363       totalbits += 2*maxdiff + 1;
00364       for ( int k=0; k<n; ++k ) {
00365         list[k+3] = curList[k];
00366       }
00367       unique_lists.add(list);
00368     }
00369     exclusionsByAtom[i].x = unique_lists[j][1];  // maxdiff
00370     exclusionsByAtom[i].y = unique_lists[j][2];  // start
00371   }
00372 
00373   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00374 
00375   if ( ! CkMyPe() ) {
00376   CkPrintf("Info: Found %d unique exclusion lists needing %d bytes\n",
00377                 unique_lists.size(), totalbits / 8);
00378   }
00379 
00380 #define SET_EXCL(EXCL,BASE,DIFF) \
00381          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00382 
00383   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00384   memset(exclusion_bits, 0, totalbits/8);
00385 
00386   int base = 0;
00387   for ( int i=0; i<unique_lists.size(); ++i ) {
00388     base += unique_lists[i][1];
00389     if ( base != unique_lists[i][2] ) {
00390       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00391     }
00392     int n = unique_lists[i][0];
00393     for ( int j=0; j<n; ++j ) {
00394       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00395     }
00396     base += unique_lists[i][1] + 1;
00397   }
00398 
00399   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00400 
00401   delete [] exclusion_bits;
00402 }

void ComputeNonbondedCUDA::build_force_table  )  [static]
 

Definition at line 211 of file ComputeNonbondedCUDA.C.

References BigReal, cuda_bind_force_table(), diffa, FORCE_TABLE_SIZE, int32, and table_i.

Referenced by build_cuda_force_table().

00211                                              {  // static
00212 
00213   float4 t[FORCE_TABLE_SIZE];
00214 
00215   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00216   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00217   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00218   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00219 
00220   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00221   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00222     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00223     r2list[i] = r*r + r2_delta;
00224   }
00225 
00226   union { double f; int32 i[2]; } byte_order_test;
00227   byte_order_test.f = 1.0;  // should occupy high-order bits only
00228   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00229 
00230   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00231     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00232     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00233 
00234     if ( r > cutoff ) {
00235       t[i].x = 0.;
00236       t[i].y = 0.;
00237       t[i].z = 0.;
00238       t[i].w = 0.;
00239       continue;
00240     }
00241 
00242     BigReal diffa = r2list[i] - r2_table[table_i];
00243 
00244     // coulomb 1/r or fast force
00245     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00246     {
00247       // BigReal table_a = fast_table[4*table_i];
00248       BigReal table_b = fast_table[4*table_i+1];
00249       BigReal table_c = fast_table[4*table_i+2];
00250       BigReal table_d = fast_table[4*table_i+3];
00251       BigReal grad =
00252                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00253       t[i].x = 2. * grad;
00254     }
00255 
00256 
00257     // pme correction for slow force
00258     // t[i].w = 0.;
00259     {
00260       // BigReal table_a = scor_table[4*table_i];
00261       BigReal table_b = scor_table[4*table_i+1];
00262       BigReal table_c = scor_table[4*table_i+2];
00263       BigReal table_d = scor_table[4*table_i+3];
00264       BigReal grad =
00265                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00266       t[i].w = 2. * grad;
00267     }
00268 
00269 
00270     // vdw 1/r^6
00271     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00272     {
00273       // BigReal table_a = vdwb_table[4*table_i];
00274       BigReal table_b = vdwb_table[4*table_i+1];
00275       BigReal table_c = vdwb_table[4*table_i+2];
00276       BigReal table_d = vdwb_table[4*table_i+3];
00277       BigReal grad =
00278                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00279       t[i].y = 2. * -1. * grad;
00280     }
00281 
00282 
00283     // vdw 1/r^12
00284     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00285     {
00286       // BigReal table_a = vdwa_table[4*table_i];
00287       BigReal table_b = vdwa_table[4*table_i+1];
00288       BigReal table_c = vdwa_table[4*table_i+2];
00289       BigReal table_d = vdwa_table[4*table_i+3];
00290       BigReal grad =
00291                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00292       t[i].z = 2. * grad;
00293     }
00294 
00295     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00296     //   t[i].x, t[i].y, t[i].z, t[i].w);
00297 
00298 /*
00299     double r2 = r * r;
00300     double r4 = r2 * r2;
00301     double r8 = r4 * r4;
00302 
00303     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00304     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00305     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00306     t[i].w = 0.;
00307 */
00308   }
00309 
00310   t[0].x = 0.f;
00311   t[0].y = 0.f;
00312   t[0].z = 0.f;
00313   t[0].w = 0.f;
00314 
00315   cuda_bind_force_table(t);
00316 
00317   if ( ! CkMyPe() ) {
00318     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00319   }
00320 }

void ComputeNonbondedCUDA::doWork  )  [virtual]
 

Reimplemented from Compute.

Definition at line 581 of file ComputeNonbondedCUDA.C.

References activePatches, atom_params, CompAtomExt::atomFixed, atoms, atomsChanged, Molecule::atomvdwtype(), ResizeArray< Elem >::begin(), ccd_index_local_download, ccd_index_remote_download, PatchMap::center(), CompAtom::charge, COLOUMB, computeRecords, computesChanged, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_check_remote_progress(), CUDA_CONDITION, cuda_load_forces(), cuda_nonbonded_forces(), cuda_stream_finished(), cudaCompute, end_local_calc, end_local_download, end_remote_calc, end_remote_download, exclusionsByAtom, finishWork(), Patch::flags, force_lists, forces, Parameters::get_vdw_params(), Patch::getCompAtomExtInfo(), Patch::getNumAtoms(), CompAtomExt::id, j, kernel_launch_state, kernel_time, Flags::lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, MAX_ATOMS_PER_PATCH, Node::molecule, NAMD_die(), num_atom_records, num_atom_records_allocated, num_force_records, num_local_atom_records, num_remote_atom_records, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, Box< Owner, Data >::open(), SimParameters::outputCudaTiming, ComputeNonbondedCUDA::patch_record::p, Node::parameters, patch_pairs, ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::compute_record::pid, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, Real, recvYieldDevice(), remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::resize(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, stream, Lattice::unscale(), SimParameters::useFlexibleCell, workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

00581                                   {
00582 
00583 // CkPrintf("Pe %d doWork %d\n", CkMyPe(), workStarted);
00584 
00585   if ( workStarted ) {
00586     if ( finishWork() ) {  // finished
00587       workStarted = 0;
00588       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
00589     } else {  // need to call again
00590       workStarted = 2;
00591       basePriority = COMPUTE_HOME_PRIORITY;  // lower for local
00592       if ( kernel_launch_state > 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00593     }
00594     return;
00595   }
00596   workStarted = 1;
00597   basePriority = COMPUTE_PROXY_PRIORITY;
00598 
00599   Molecule *mol = Node::Object()->molecule;
00600   Parameters *params = Node::Object()->parameters;
00601   SimParameters *simParams = Node::Object()->simParameters;
00602 
00603   if ( simParams->useFlexibleCell ) {
00604     NAMD_die("flexible-cell constant pressure not supported in CUDA");
00605   }
00606 
00607   for ( int i=0; i<activePatches.size(); ++i ) {
00608     patch_record &pr = patchRecords[activePatches[i]];
00609     pr.x = pr.positionBox->open();
00610     pr.xExt = pr.p->getCompAtomExtInfo();
00611   }
00612 
00613  if ( atomsChanged || computesChanged ) {
00614   int npatches = activePatches.size();
00615 
00616   if ( computesChanged ) {
00617     computesChanged = 0;
00618 
00619     int *ap = activePatches.begin();
00620     for ( int i=0; i<localActivePatches.size(); ++i ) {
00621       *(ap++) = localActivePatches[i];
00622     }
00623     for ( int i=0; i<remoteActivePatches.size(); ++i ) {
00624       *(ap++) = remoteActivePatches[i];
00625     }
00626 
00627     int nlc = localComputeRecords.size();
00628     int nrc = remoteComputeRecords.size();
00629     computeRecords.resize(nlc+nrc);
00630     compute_record *cr = computeRecords.begin();
00631     for ( int i=0; i<nlc; ++i ) {
00632       *(cr++) = localComputeRecords[i];
00633     }
00634     for ( int i=0; i<nrc; ++i ) {
00635       *(cr++) = remoteComputeRecords[i];
00636     }
00637 
00638     force_lists.resize(npatches);
00639     for ( int i=0; i<npatches; ++i ) {
00640       patchRecords[activePatches[i]].localIndex = i;
00641       force_lists[i].force_list_size = 0;
00642     }
00643 
00644     int ncomputes = computeRecords.size();
00645     patch_pairs.resize(ncomputes);
00646     for ( int i=0; i<ncomputes; ++i ) {
00647       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00648       int lp1 = patchRecords[cr.pid[0]].localIndex;
00649       int lp2 = patchRecords[cr.pid[1]].localIndex;
00650       force_lists[lp1].force_list_size++;
00651       patch_pair &pp = patch_pairs[i];
00652       pp.offset.x = cr.offset.x;
00653       pp.offset.y = cr.offset.y;
00654       pp.offset.z = cr.offset.z;
00655     }
00656 
00657    if ( simParams->outputCudaTiming ) {
00658     CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
00659         CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
00660         localComputeRecords.size(), remoteComputeRecords.size());
00661    }
00662   }
00663 
00664   int istart = 0;
00665   int flstart = 0;
00666   int max_atoms_per_patch = 0;
00667   int i;
00668   for ( i=0; i<npatches; ++i ) {
00669     if ( i == localActivePatches.size() ) {
00670        num_local_atom_records = istart;
00671     }
00672     force_lists[i].force_list_start = flstart;
00673     force_lists[i].force_output_start = istart;
00674     patch_record &pr = patchRecords[activePatches[i]];
00675     pr.localStart = istart;
00676     int natoms = pr.p->getNumAtoms();
00677     int nfreeatoms = natoms;
00678     if ( fixedAtomsOn ) {
00679       const CompAtomExt *aExt = pr.xExt;
00680       for ( int j=0; j<natoms; ++j ) {
00681         if ( aExt[j].atomFixed ) --nfreeatoms;
00682       }
00683     }
00684     if ( natoms > max_atoms_per_patch ) max_atoms_per_patch = natoms;
00685     pr.numAtoms = natoms;
00686     pr.numFreeAtoms = nfreeatoms;
00687     if ( natoms & 15 ) { natoms += 16 - (natoms & 15); }
00688     if ( nfreeatoms & 15 ) { nfreeatoms += 16 - (nfreeatoms & 15); }
00689     force_lists[i].patch_size = nfreeatoms;
00690     flstart += nfreeatoms * force_lists[i].force_list_size;
00691     istart += natoms;  // already rounded up
00692     force_lists[i].force_list_size = 0;  // rebuild below
00693   }
00694   if ( i == localActivePatches.size() ) {
00695      num_local_atom_records = istart;
00696   }
00697   num_force_records = flstart;
00698   num_atom_records = istart;
00699   num_remote_atom_records = num_atom_records - num_local_atom_records;
00700   if ( num_atom_records > num_atom_records_allocated ) {
00701     if ( num_atom_records_allocated ) {
00702       cudaFreeHost(atom_params);
00703       cudaFreeHost(atoms);
00704       cudaFreeHost(forces);
00705       cudaFreeHost(slow_forces);
00706     }
00707     num_atom_records_allocated = 1.1 * num_atom_records + 1;
00708     cudaMallocHost((void**)&atom_params,sizeof(atom_param)*num_atom_records_allocated);
00709     cudaMallocHost((void**)&atoms,sizeof(atom)*num_atom_records_allocated);
00710     cudaMallocHost((void**)&forces,sizeof(float4)*num_atom_records_allocated);
00711     cudaMallocHost((void**)&slow_forces,sizeof(float4)*num_atom_records_allocated);
00712   }
00713 
00714 #if 0
00715   if ( max_atoms_per_patch > MAX_ATOMS_PER_PATCH ) {
00716     char errstr[1024];
00717     sprintf(errstr,"Found patch with %d atoms; limit is %d for CUDA."
00718         "  Try enabling twoAwayX, twoAwayY, and/or twoAwayZ to fix this.",
00719         max_atoms_per_patch, MAX_ATOMS_PER_PATCH);
00720     NAMD_die(errstr);
00721   }
00722 #endif
00723 
00724   int ncomputes = computeRecords.size();
00725   for ( int i=0; i<ncomputes; ++i ) {
00726     ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
00727     int p1 = cr.pid[0];
00728     int p2 = cr.pid[1];
00729     int lp1 = patchRecords[p1].localIndex;
00730     int lp2 = patchRecords[p2].localIndex;
00731     patch_pair &pp = patch_pairs[i];
00732     pp.patch1_atom_start = patchRecords[p1].localStart;
00733     pp.patch1_force_start = force_lists[lp1].force_list_start +
00734         force_lists[lp1].patch_size * force_lists[lp1].force_list_size;
00735     pp.patch1_size = patchRecords[p1].numAtoms;
00736     pp.patch1_force_size = patchRecords[p1].numFreeAtoms;
00737     // istart += pp.patch1_size;
00738     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00739     pp.patch2_atom_start = patchRecords[p2].localStart;
00740     // pp.patch2_force_start = force_lists[lp2].force_list_start +
00741 //      force_lists[lp2].patch_size * force_lists[lp2].force_list_size;
00742     pp.patch2_size = patchRecords[p2].numAtoms;
00743     // pp.patch2_force_size = patchRecords[p2].numFreeAtoms;
00744     // this must happen at end to get self computes right
00745     force_lists[lp1].force_list_size++;
00746     // if ( lp1 != lp2 || cr.t[0] != cr.t[1] ) {
00747     //   force_lists[lp2].force_list_size++;
00748     // }
00749     // istart += pp.patch2_size;
00750     // if ( istart & 15 ) { istart += 16 - (istart & 15); }
00751   }
00752 
00753 #if 0
00754   CkPrintf("Pe %d cuda_bind_patch_pairs %d %d %d %d %d\n", CkMyPe(),
00755         patch_pairs.size(), force_lists.size(),
00756         num_atom_records, num_force_records,
00757         max_atoms_per_patch);
00758 #endif
00759 
00760   int totalmem = patch_pairs.size() * sizeof(patch_pair) +
00761                 force_lists.size() * sizeof(force_list) +
00762                 num_force_records * sizeof(float4) +
00763                 num_atom_records * sizeof(atom) +
00764                 num_atom_records * sizeof(atom_param) +
00765                 num_atom_records * sizeof(float4);
00766   int totalcopy = num_atom_records * ( sizeof(atom) + sizeof(float4) );
00767 /*
00768   CkPrintf("Pe %d allocating %d MB of GPU memory, will copy %d kB per step\n",
00769                         CkMyPe(), totalmem >> 20, totalcopy >> 10);
00770 */
00771 
00772   cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
00773                         force_lists.begin(), force_lists.size(),
00774                         num_atom_records, num_force_records,
00775                         max_atoms_per_patch);
00776 
00777  }  // atomsChanged || computesChanged
00778 
00779   double charge_scaling = sqrt(COLOUMB * scaling * dielectric_1);
00780 
00781 
00782   for ( int i=0; i<activePatches.size(); ++i ) {
00783     patch_record &pr = patchRecords[activePatches[i]];
00784 
00785     int start = pr.localStart;
00786     int n = pr.numAtoms;
00787     const CompAtom *a = pr.x;
00788     const CompAtomExt *aExt = pr.xExt;
00789     if ( atomsChanged ) {
00790       atom_param *ap = atom_params + start;
00791       int k = 0;
00792       for ( int j=0; j<n; ++j ) {
00793         // put free atoms first
00794         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00795         ap[k].index = aExt[j].id;
00796         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00797         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00798         int vdwtype = mol->atomvdwtype(aExt[j].id);
00799         Real sig, eps, sig14, eps14;
00800         params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00801         ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00802         ap[k].half_sigma = 0.5 * sig;
00803 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00804         ++k;
00805       }
00806       if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00807         // put fixed atoms at end
00808         if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00809         ap[k].index = aExt[j].id;
00810         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
00811         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
00812         int vdwtype = mol->atomvdwtype(aExt[j].id);
00813         Real sig, eps, sig14, eps14;
00814         params->get_vdw_params(&sig,&eps,&sig14,&eps14,vdwtype);
00815         ap[k].sqrt_epsilon = sqrt(4.0 * scaling * eps);
00816         ap[k].half_sigma = 0.5 * sig;
00817 // CkPrintf("%d %d %f %f\n", k, vdwtype, sig, eps);
00818         ++k;
00819       }
00820     }
00821     {
00822       Vector center =
00823         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
00824       atom *ap = atoms + start;
00825       int k = 0;
00826       for ( int j=0; j<n; ++j ) {
00827         // put free atoms first
00828         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
00829         ap[k].position.x = a[j].position.x - center.x;
00830         ap[k].position.y = a[j].position.y - center.y;
00831         ap[k].position.z = a[j].position.z - center.z;
00832         ap[k].charge = charge_scaling * a[j].charge;
00833         ++k;
00834       }
00835       if ( fixedAtomsOn ) for ( int j=0; j<n; ++j ) {
00836         // put fixed atoms at end
00837         if ( ! (fixedAtomsOn && aExt[j].atomFixed) ) continue;
00838         ap[k].position.x = a[j].position.x - center.x;
00839         ap[k].position.y = a[j].position.y - center.y;
00840         ap[k].position.z = a[j].position.z - center.z;
00841         ap[k].charge = charge_scaling * a[j].charge;
00842         ++k;
00843       }
00844     }
00845   }
00846 
00847   kernel_time = -1. * CkWallTimer();
00848 #if 0
00849   kernel_launch_state = 3;
00850 
00851   cudaEventRecord(start_upload, stream);
00852 
00853   if ( atomsChanged ) {
00854     cuda_bind_atom_params(atom_params);
00855   }
00856 
00857   XXX - THIS PATH NOT UPDATED FOR SLOW FORCES - DO NOT COMPILE
00858   cuda_bind_atoms(atoms);
00859   cudaEventRecord(start_calc, stream);
00860   cuda_nonbonded_forces(cutoff2,
00861         localComputeRecords.size(),remoteComputeRecords.size(),
00862         localActivePatches.size(),remoteActivePatches.size());
00863   cudaEventRecord(end_remote_calc, stream);
00864   cuda_load_forces(forces,num_local_atom_records,num_remote_atom_records);
00865   cudaEventRecord(end_remote_download, stream);
00866   cuda_nonbonded_forces(cutoff2,
00867         0,localComputeRecords.size(),
00868         0,localActivePatches.size());
00869   cudaEventRecord(end_local_calc, stream);
00870   cuda_load_forces(forces,0,num_local_atom_records);
00871   cudaEventRecord(end_local_download, stream);
00872 
00873   if ( cuda_stream_finished() ) {
00874     CkPrintf("CUDA not overlapping with CPU work.\n");
00875   }
00876 
00877   // finishWork();
00878 
00879   ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00880 #else
00881 
00882   kernel_launch_state = 1;
00883   if ( gpu_is_mine ) recvYieldDevice(-1);
00884 
00885 #endif
00886 }

int ComputeNonbondedCUDA::finishWork  ) 
 

Definition at line 985 of file ComputeNonbondedCUDA.C.

References activePatches, CompAtomExt::atomFixed, atomsChanged, Box< Owner, Data >::close(), cuda_errcheck(), cuda_timer_count, cuda_timer_total, Flags::doFullElectrostatics, end_local_calc, end_local_download, end_remote_calc, end_remote_download, Results::f, ComputeNonbondedCUDA::patch_record::f, Force, ComputeNonbondedCUDA::patch_record::forceBox, forces, Molecule::get_full_exclusions_for_atom(), SubmitReduction::item(), j, localActivePatches, ComputeNonbondedCUDA::patch_record::localStart, Node::molecule, ComputeNonbondedCUDA::patch_record::numAtoms, Node::Object(), Box< Owner, Data >::open(), SimParameters::outputCudaTiming, patchRecords, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, reduction, remoteActivePatches, Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, SubmitReduction::submit(), workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

Referenced by doWork().

00985                                      {
00986 
00987   cuda_errcheck("cuda stream completed");
00988 
00989   Molecule *mol = Node::Object()->molecule;
00990   SimParameters *simParams = Node::Object()->simParameters;
00991 
00992   Flags &flags = patchRecords[activePatches[0]].p->flags;
00993   int doSlow = flags.doFullElectrostatics;
00994 
00995   ResizeArray<int> &patches( workStarted == 1 ?
00996                                 remoteActivePatches : localActivePatches );
00997 
00998   for ( int i=0; i<patches.size(); ++i ) {
00999     patch_record &pr = patchRecords[patches[i]];
01000     pr.r = pr.forceBox->open();
01001     pr.f = pr.r->f[Results::nbond];
01002   }
01003 
01004   // long long int wcount = 0;
01005   double virial = 0.;
01006   double virial_slow = 0.;
01007 
01008   for ( int i=0; i<patches.size(); ++i ) {
01009     patch_record &pr = patchRecords[patches[i]];
01010     int start = pr.localStart;
01011     int n = pr.numAtoms;
01012     Force *f = pr.f;
01013     Force *f_slow = pr.r->f[Results::slow];
01014     const CompAtom *a = pr.x;
01015     const CompAtomExt *aExt = pr.xExt;
01016     float4 *af = forces + start;
01017     float4 *af_slow = slow_forces + start;
01018     int k = 0;
01019     for ( int j=0; j<n; ++j ) {
01020       // only free atoms return forces
01021       if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01022       f[j].x += af[k].x;
01023       f[j].y += af[k].y;
01024       f[j].z += af[k].z;
01025       // wcount += af[k].w;
01026       virial += af[k].w;
01027       if ( doSlow ) {
01028         f_slow[j].x += af_slow[k].x;
01029         f_slow[j].y += af_slow[k].y;
01030         f_slow[j].z += af_slow[k].z;
01031         virial_slow += af_slow[k].w;
01032       }
01033       ++k;
01034     }
01035 
01036 #if 0
01037     // check exclusions reported as w
01038     if ( CkNumPes() == 1 ) {
01039       const CompAtomExt *aExt = pr.xExt;
01040       int k = 0;
01041       for ( int j=0; j<n; ++j ) {
01042         // only free atoms return forces
01043         if ( fixedAtomsOn && aExt[j].atomFixed ) continue;
01044         int excl_expected = mol->get_full_exclusions_for_atom(aExt[j].id)[0] + 1;
01045         if ( af[k].w != excl_expected ) {
01046           CkPrintf("%d:%d(%d) atom %d found %d exclusions but expected %d\n",
01047                 i, j, k, aExt[j].id, (int)af[k].w, excl_expected );
01048         }
01049         ++k;
01050       }
01051     }
01052 #endif
01053     
01054 
01055 #if 0
01056     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01057       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01058         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01059         af[j].w);
01060     }
01061 #endif
01062     pr.positionBox->close(&(pr.x));
01063     pr.forceBox->close(&(pr.r));
01064   }
01065 
01066   virial *= (-1./6.);
01067   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01068   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01069   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01070   if ( doSlow ) {
01071     virial_slow *= (-1./6.);
01072     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01073     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01074     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01075   }
01076 
01077   if ( workStarted == 1 ) return 0;  // not finished, call again
01078 
01079   atomsChanged = 0;
01080   reduction->submit();
01081 
01082   // int natoms = mol->numAtoms; 
01083   // double wpa = wcount;  wpa /= natoms;
01084 
01085   // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
01086         //      kernel_time * 1.e3, time * 1.e3, wpa);
01087 
01088   float upload_ms, remote_calc_ms, remote_download_ms;
01089   float local_calc_ms, local_download_ms, total_ms;
01090   cuda_errcheck("before event timers");
01091   cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
01092   cuda_errcheck("event timer 1");
01093   cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_calc);
01094   cuda_errcheck("event timer 2");
01095   cudaEventElapsedTime(&remote_download_ms, end_remote_calc, end_remote_download);
01096   cuda_errcheck("event timer 3");
01097   cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_calc);
01098   cuda_errcheck("event timer 4");
01099   cudaEventElapsedTime(&local_download_ms, end_local_calc, end_local_download);
01100   cuda_errcheck("event timer 5");
01101   cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
01102   cuda_errcheck("event timer 6");
01103   cuda_errcheck("event timers");
01104 
01105   cuda_timer_total += kernel_time;
01106   if ( simParams->outputCudaTiming &&
01107         cuda_timer_count % simParams->outputCudaTiming == 0 ) {
01108     cuda_timer_total /= (cuda_timer_count + 1);
01109     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f %f %f\n",
01110         CkMyPe(), upload_ms, remote_calc_ms, remote_download_ms,
01111                         local_calc_ms, local_download_ms, total_ms);
01112     CkPrintf("CUDA TIMING: %f ms/step on node %d\n",
01113                         cuda_timer_total * 1.e3, CkMyPe());
01114     cuda_timer_count = 0;
01115     cuda_timer_total = 0;
01116   }
01117   cuda_timer_count++;
01118 
01119   return 1;  // finished and ready for next step
01120 }

void ComputeNonbondedCUDA::recvYieldDevice int  pe  ) 
 

Definition at line 916 of file ComputeNonbondedCUDA.C.

References Lattice::a(), activePatches, atom_params, atoms, Lattice::b(), Lattice::c(), ccd_index_local_calc, ccd_index_local_download, ccd_index_remote_calc, ccd_index_remote_download, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), CUDA_CONDITION, cuda_load_forces(), cuda_nonbonded_forces(), Flags::doFullElectrostatics, end_local_calc, end_local_download, end_remote_calc, end_remote_download, forces, gpu_is_mine, Flags::lattice, localActivePatches, localComputeRecords, num_local_atom_records, num_remote_atom_records, patchRecords, remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::size(), slow_forces, start_calc, start_upload, stream, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by doWork(), and ComputeMgr::recvYieldDevice().

00916                                                  {
00917 
00918 // CkPrintf("Pe %d entering state %d via yield from pe %d\n",
00919 //           CkMyPe(), kernel_launch_state, pe);
00920 
00921   Flags &flags = patchRecords[activePatches[0]].p->flags;
00922   int doSlow = flags.doFullElectrostatics;
00923 
00924   Lattice &lattice = flags.lattice;
00925   float3 lata, latb, latc;
00926   lata.x = lattice.a().x;
00927   lata.y = lattice.a().y;
00928   lata.z = lattice.a().z;
00929   latb.x = lattice.b().x;
00930   latb.y = lattice.b().y;
00931   latb.z = lattice.b().z;
00932   latc.x = lattice.c().x;
00933   latc.y = lattice.c().y;
00934   latc.z = lattice.c().z;
00935 
00936   switch ( kernel_launch_state ) {
00937   case 1:
00938     ++kernel_launch_state;
00939     gpu_is_mine = 0;
00940     cudaEventRecord(start_upload, stream);
00941 
00942     if ( atomsChanged ) {
00943       cuda_bind_atom_params(atom_params);
00944     }
00945 
00946     cuda_bind_atoms(atoms);
00947     cudaEventRecord(start_calc, stream);
00948     cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00949         localComputeRecords.size(),remoteComputeRecords.size(),
00950         localActivePatches.size(),remoteActivePatches.size(), doSlow);
00951     cudaEventRecord(end_remote_calc, stream);
00952     cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00953         num_local_atom_records,num_remote_atom_records);
00954     cudaEventRecord(end_remote_download, stream);
00955     ccd_index_remote_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_progress,this);
00956     if ( shared_gpu ) {
00957       ccd_index_remote_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_remote_calc,this);
00958       break;
00959     }
00960  
00961   case 2:
00962     ++kernel_launch_state;
00963     gpu_is_mine = 0;
00964     cuda_nonbonded_forces(lata, latb, latc, cutoff2,
00965         0,localComputeRecords.size(),
00966         0,localActivePatches.size(), doSlow);
00967     cudaEventRecord(end_local_calc, stream);
00968     cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
00969         0,num_local_atom_records);
00970     cudaEventRecord(end_local_download, stream);
00971     if ( workStarted == 2 ) ccd_index_local_download = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_progress,this);
00972     if ( shared_gpu ) {
00973       ccd_index_local_calc = CcdCallOnCondition(CUDA_CONDITION,cuda_check_local_calc,this);
00974       break;
00975     }
00976 
00977   default:
00978     gpu_is_mine = 1;
00979     break;
00980   }
00981 
00982 }

void ComputeNonbondedCUDA::requirePatch int  pid  ) 
 

Definition at line 502 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), computesChanged, ComputeNonbondedCUDA::patch_record::f, ComputeNonbondedCUDA::patch_record::forceBox, ComputeNonbondedCUDA::patch_record::isLocal, ResizeArray< Elem >::item(), localActivePatches, PatchMap::node(), ReductionMgr::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, reduction, REDUCTIONS_BASIC, ComputeNonbondedCUDA::patch_record::refCount, Patch::registerForceDeposit(), Patch::registerPositionPickup(), remoteActivePatches, Compute::setNumPatches(), ResizeArray< Elem >::size(), ReductionMgr::willSubmit(), ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

00502                                                {
00503 
00504   if ( ! reduction ) {
00505     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00506   }
00507   computesChanged = 1;
00508   patch_record &pr = patchRecords.item(pid);
00509   if ( pr.refCount == 0 ) {
00510     pr.isLocal = ( patchMap->node(pid) == CkMyPe() );
00511     if ( pr.isLocal ) {
00512       localActivePatches.add(pid);
00513     } else {
00514       remoteActivePatches.add(pid);
00515     }
00516     activePatches.add(pid);
00517     setNumPatches(activePatches.size());
00518     pr.patchID = pid;
00519     pr.p = patchMap->patch(pid);
00520     pr.positionBox = pr.p->registerPositionPickup(cid);
00521     pr.forceBox = pr.p->registerForceDeposit(cid);
00522     pr.x = NULL;
00523     pr.xExt = NULL;
00524     pr.r = NULL;
00525     pr.f = NULL;
00526   }
00527   pr.refCount += 1;
00528 }


Member Data Documentation

ResizeArray<int> ComputeNonbondedCUDA::activePatches
 

Definition at line 53 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch().

AtomMap* ComputeNonbondedCUDA::atomMap
 

Definition at line 64 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

ResizeArray<compute_record> ComputeNonbondedCUDA::computeRecords
 

Definition at line 55 of file ComputeNonbondedCUDA.h.

Referenced by doWork().

ComputeNonbondedCUDAKernel* ComputeNonbondedCUDA::kernel
 

Definition at line 67 of file ComputeNonbondedCUDA.h.

ResizeArray<int> ComputeNonbondedCUDA::localActivePatches
 

Definition at line 53 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::localComputeRecords
 

Definition at line 56 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

int ComputeNonbondedCUDA::num_atom_records
 

Definition at line 58 of file ComputeNonbondedCUDA.h.

Referenced by doWork().

int ComputeNonbondedCUDA::num_force_records
 

Definition at line 61 of file ComputeNonbondedCUDA.h.

Referenced by doWork().

int ComputeNonbondedCUDA::num_local_atom_records
 

Definition at line 59 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::num_remote_atom_records
 

Definition at line 60 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), and recvYieldDevice().

PatchMap* ComputeNonbondedCUDA::patchMap
 

Definition at line 63 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), register_cuda_compute_pair(), register_cuda_compute_self(), and requirePatch().

ResizeArray<patch_record> ComputeNonbondedCUDA::patchRecords
 

Definition at line 54 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch().

SubmitReduction* ComputeNonbondedCUDA::reduction
 

Definition at line 65 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), finishWork(), and requirePatch().

ResizeArray<int> ComputeNonbondedCUDA::remoteActivePatches
 

Definition at line 53 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::remoteComputeRecords
 

Definition at line 56 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

int ComputeNonbondedCUDA::workStarted
 

Definition at line 45 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().


The documentation for this class was generated from the following files:
Generated on Mon Nov 23 04:59:35 2009 for NAMD by  doxygen 1.3.9.1