ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA:

Inherits Compute and ComputeNonbondedUtil.

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr, ComputeNonbondedCUDA *m=0, int idx=-1)
 ~ComputeNonbondedCUDA ()
void atomUpdate ()
void doWork ()
int noWork ()
void skip ()
void recvYieldDevice (int pe)
int finishWork ()
void finishReductions ()
void finishPatch (int)
void messageFinishPatch (int)
void requirePatch (int pid)
void assignPatches ()
void registerPatches ()

Static Public Member Functions

static void build_lj_table ()
static void build_force_table ()
static void build_exclusions ()

Public Attributes

LocalWorkMsg * localWorkMsg2
int workStarted
Lattice lattice
int doSlow
int doEnergy
int step
ResizeArray< int > activePatches
ResizeArray< int > localActivePatches
ResizeArray< int > remoteActivePatches
ResizeArray< int > hostedPatches
ResizeArray< int > localHostedPatches
ResizeArray< int > remoteHostedPatches
ResizeArray< patch_record > patchRecords
ResizeArray< compute_record > computeRecords
ResizeArray< compute_record > localComputeRecords
ResizeArray< compute_record > remoteComputeRecords
int forces_size
float4 * forces
int slow_forces_size
float4 * slow_forces
int psiSumH_size
GBReal * psiSumH
int dEdaSumH_size
GBReal * dEdaSumH
int deviceID
PatchMap * patchMap
AtomMap * atomMap
SubmitReduction * reduction
ComputeNonbondedCUDA * master
int masterPe
int slaveIndex
ComputeNonbondedCUDA ** slaves
int * slavePes
int numSlaves
int atomsChanged
int computesChanged
int patchPairsReordered
int pairlistsValid
float pairlistTolerance
int usePairlists
int savePairlists
float plcutoff2
int atoms_size
CudaAtom * atoms

Classes

struct  compute_record
struct  patch_record

Detailed Description

Definition at line 20 of file ComputeNonbondedCUDA.h.


Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ( ComputeID  c,
ComputeMgr *  mgr,
ComputeNonbondedCUDA *  m = 0,
int  idx = -1 
)

Definition at line 437 of file ComputeNonbondedCUDA.C.

References atomMap, atoms, atoms_size, atomsChanged, Compute::basePriority, build_exclusions(), computeMgr, computesChanged, cuda_errcheck(), cuda_init(), cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, end_local_download, end_remote_download, forces, forces_size, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), init_arrays(), localWorkMsg2, master, masterPe, max_grid_size, NAMD_bug(), NAMD_die(), PatchMap::numPatches(), Node::Object(), AtomMap::Object(), PatchMap::Object(), pairlistsValid, pairlistTolerance, patch_pair_num_ptr, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, SimParameters::pressureProfileOn, PRIORITY_SIZE, PROXY_DATA_PRIORITY, psiSumH, psiSumH_size, reduction, registerPatches(), savePairlists, DeviceCUDA::setGpuIsMine(), DeviceCUDA::setMergeGrids(), Node::simParameters, slaveIndex, slavePes, slaves, slow_forces, slow_forces_size, start_calc, stream, stream2, usePairlists, SimParameters::usePMECUDA, and workStarted.

00438                                                                              : Compute(c), slaveIndex(idx) {
00439 #ifdef PRINT_GBIS
00440    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00441 #endif
00442 
00443   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00444   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00445   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00446 
00447   // CkPrintf("create ComputeNonbondedCUDA\n");
00448   master = m ? m : this;
00449   cudaCompute = this;
00450   computeMgr = mgr;
00451   patchMap = PatchMap::Object();
00452   atomMap = AtomMap::Object();
00453   reduction = 0;
00454 
00455   SimParameters *params = Node::Object()->simParameters;
00456   if (params->pressureProfileOn) {
00457     NAMD_die("pressure profile not supported in CUDA");
00458   }
00459 
00460   atomsChanged = 1;
00461   computesChanged = 1;
00462   patchPairsReordered = 1;
00463 
00464   pairlistsValid = 0;
00465   pairlistTolerance = 0.;
00466   usePairlists = 0;
00467   savePairlists = 0;
00468   plcutoff2 = 0.;
00469 
00470   workStarted = 0;
00471   basePriority = PROXY_DATA_PRIORITY;
00472   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00473 
00474   // Zero array sizes and pointers
00475   init_arrays();
00476 
00477   atoms_size = 0;
00478   atoms = NULL;
00479 
00480   forces_size = 0;
00481   forces = NULL;
00482   
00483   slow_forces_size = 0;
00484   slow_forces = NULL;
00485 
00486   psiSumH_size = 0;
00487   psiSumH = NULL;
00488 
00489   dEdaSumH_size = 0;
00490   dEdaSumH = NULL;
00491 
00492   if ( master != this ) { // I am slave
00493     masterPe = master->masterPe;
00494     master->slaves[slaveIndex] = this;
00495     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00496       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00497     }
00498     deviceID = master->deviceID;
00499     registerPatches();
00500     return;
00501   }
00502   masterPe = CkMyPe();
00503 
00504   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00505   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00506     deviceCUDA->setMergeGrids(1);
00507 
00508   // Sanity checks for New CUDA kernel
00509   if (params->GBISOn) {
00510     // GBIS
00511     if (deviceCUDA->getNoMergeGrids()) {
00512       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00513     }
00514     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00515     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00516     // Final sanity check
00517     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00518   } else {
00519     // non-GBIS
00520     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00521       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00522     }
00523   }
00524 
00525 #if CUDA_VERSION >= 5050
00526   int leastPriority, greatestPriority;
00527   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00528   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00529   if ( leastPriority != greatestPriority ) {
00530     if ( CkMyNode() == 0 ) {
00531       int dev = deviceCUDA->getDeviceID();
00532       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00533     }
00534     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00535       greatestPriority = leastPriority;
00536     }
00537     if (params->usePMECUDA) greatestPriority = leastPriority;
00538     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00539     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00540   } else
00541 #endif
00542   {
00543     cudaStreamCreate(&stream);
00544     cuda_errcheck("cudaStreamCreate");
00545     int dev = deviceCUDA->getDeviceID();
00546     cudaDeviceProp deviceProp;
00547     cudaGetDeviceProperties(&deviceProp, dev);
00548     cuda_errcheck("cudaGetDeviceProperties");
00549     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00550       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00551       cudaStreamCreate(&stream2);
00552     } else {
00553       stream2 = stream;
00554     }
00555   }
00556   cuda_errcheck("cudaStreamCreate");
00557 
00558   // Get GPU device ID
00559   deviceID = deviceCUDA->getDeviceID();
00560 
00561   cuda_init();
00562   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00563   build_exclusions();
00564   // cudaEventCreate(&start_upload);
00565   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00566   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00567   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00568 
00569   patchRecords.resize(patchMap->numPatches());
00570   patch_pairs_ptr = new ResizeArray<patch_pair>;
00571   patch_pair_num_ptr = new ResizeArray<int>;
00572 
00573   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00574 }
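
A minimal usage sketch (an assumption for illustration, not code taken from ComputeMgr): the master object is constructed with m = 0 on the pe that owns the device, while each slave is constructed on another pe sharing the device, carrying a pointer to the master and its index into master->slavePes, matching the branch at line 492 of the listing above. The cid and mgr variables are hypothetical stand-ins.

  ComputeNonbondedCUDA *masterObj =
      new ComputeNonbondedCUDA(cid, mgr);                 // master: m defaults to 0
  ComputeNonbondedCUDA *slaveObj =
      new ComputeNonbondedCUDA(cid, mgr, masterObj, k);   // slave k, created on master->slavePes[k]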

ComputeNonbondedCUDA::~ComputeNonbondedCUDA (  ) 

Definition at line 577 of file ComputeNonbondedCUDA.C.

00577 { ; }


Member Function Documentation

void ComputeNonbondedCUDA::assignPatches (  ) 

Definition at line 661 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), computeMgr, deviceCUDA, DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputeNonbondedCUDA::patch_record::hostPe, if(), j, PatchMap::node(), npatches, numSlaves, ReductionMgr::Object(), PatchMap::ObjectOnPe(), patchMap, patchRecords, reduction, REDUCTIONS_BASIC, registerPatches(), ComputeMgr::sendCreateNonbondedCUDASlave(), ResizeArray< Elem >::size(), slavePes, slaves, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00661                                          {
00662 
00663   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00664   int numPesOnNodeSharingDevice = 0;
00665   int masterIndex = -1;
00666   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00667     int pe = deviceCUDA->getPesSharingDevice(i);
00668     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00669     if ( CkNodeOf(pe) == CkMyNode() ) {
00670       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00671     }
00672   }
00673 
00674   int npatches = activePatches.size();
00675 
00676   if ( npatches ) {
00677     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00678   }
00679 
00680   // calculate priority rank of local home patch within pe
00681   {
00682     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
00683     for ( int i=0; i<npatches; ++i ) {
00684       int pid = activePatches[i];
00685       int homePe = patchMap->node(pid);
00686       if ( CkNodeOf(homePe) == CkMyNode() ) {
00687         homePatchByRank[CkRankOf(homePe)].add(pid);
00688       }
00689     }
00690     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00691       pid_sortop_reverse_priority so;
00692       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00693       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00694       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00695         int pid = homePatchByRank[i][j];
00696         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
00697       }
00698     }
00699   }
00700 
00701   int *count = new int[npatches];
00702   memset(count, 0, sizeof(int)*npatches);
00703   int *pcount = new int[numPesOnNodeSharingDevice];
00704   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
00705   int *rankpcount = new int[CkMyNodeSize()];
00706   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
00707   char *table = new char[npatches*numPesOnNodeSharingDevice];
00708   memset(table, 0, npatches*numPesOnNodeSharingDevice);
00709 
00710   int unassignedpatches = npatches;
00711 
00712   if ( 0 ) { // assign all to device pe
00713     for ( int i=0; i<npatches; ++i ) {
00714       int pid = activePatches[i];
00715       patch_record &pr = patchRecords[pid];
00716       pr.hostPe = CkMyPe();
00717     }
00718     unassignedpatches = 0;
00719     pcount[masterIndex] = npatches;
00720   } else 
00721 
00722   // assign if home pe and build table of natural proxies
00723   for ( int i=0; i<npatches; ++i ) {
00724     int pid = activePatches[i];
00725     patch_record &pr = patchRecords[pid];
00726     int homePe = patchMap->node(pid);
00727     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00728       int pe = pesOnNodeSharingDevice[j];
00729       if ( pe == homePe ) {
00730         pr.hostPe = pe;  --unassignedpatches;
00731         pcount[j] += 1;
00732       }
00733       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00734         table[i*numPesOnNodeSharingDevice+j] = 1;
00735       }
00736     }
00737     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00738       pr.hostPe = homePe;  --unassignedpatches;
00739       rankpcount[CkRankOf(homePe)] += 1;
00740     }
00741   }
00742   // assign if only one pe has a required proxy
00743   int assignj = 0;
00744   for ( int i=0; i<npatches; ++i ) {
00745     int pid = activePatches[i];
00746     patch_record &pr = patchRecords[pid];
00747     if ( pr.hostPe != -1 ) continue;
00748     int c = 0;
00749     int lastj;
00750     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00751       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
00752     }
00753     count[i] = c;
00754     if ( c == 1 ) {
00755       pr.hostPe = pesOnNodeSharingDevice[lastj];
00756       --unassignedpatches;
00757       pcount[lastj] += 1;
00758     }
00759   }
00760   while ( unassignedpatches ) {
00761     int i;
00762     for ( i=0; i<npatches; ++i ) {
00763       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
00764       int pid = activePatches[i];
00765       patch_record &pr = patchRecords[pid];
00766       if ( pr.hostPe != -1 ) continue;
00767       pr.hostPe = pesOnNodeSharingDevice[assignj];
00768       --unassignedpatches;
00769       pcount[assignj] += 1;
00770       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00771       break;
00772     }
00773     if ( i<npatches ) continue;  // start search again
00774     for ( i=0; i<npatches; ++i ) {
00775       int pid = activePatches[i];
00776       patch_record &pr = patchRecords[pid];
00777       if ( pr.hostPe != -1 ) continue;
00778       if ( count[i] ) continue;
00779       pr.hostPe = pesOnNodeSharingDevice[assignj];
00780       --unassignedpatches;
00781       pcount[assignj] += 1;
00782       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00783       break;
00784     }
00785     if ( i<npatches ) continue;  // start search again
00786     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00787   }
00788 
00789   for ( int i=0; i<npatches; ++i ) {
00790     int pid = activePatches[i];
00791     patch_record &pr = patchRecords[pid];
00792     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
00793   }
00794 
00795   slavePes = new int[CkMyNodeSize()];
00796   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00797   numSlaves = 0;
00798   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00799     int pe = pesOnNodeSharingDevice[j];
00800     int rank = pe - CkNodeFirst(CkMyNode());
00801     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00802     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00803     if ( pe == CkMyPe() ) continue;
00804     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00805     rankpcount[rank] = 0;  // skip in rank loop below
00806     slavePes[numSlaves] = pe;
00807     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00808     ++numSlaves;
00809   }
00810   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00811     int pe = CkNodeFirst(CkMyNode()) + j;
00812     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00813     //          CkMyPe(),j,pe,rankpcount[j]);
00814     if ( ! rankpcount[j] ) continue;
00815     if ( pe == CkMyPe() ) continue;
00816     slavePes[numSlaves] = pe;
00817     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00818     ++numSlaves;
00819   }
00820   registerPatches();
00821 
00822   delete [] pesOnNodeSharingDevice;
00823   delete [] count;
00824   delete [] pcount;
00825   delete [] rankpcount;
00826   delete [] table;
00827 }

void ComputeNonbondedCUDA::atomUpdate (  )  [virtual]

Reimplemented from Compute.

Definition at line 1039 of file ComputeNonbondedCUDA.C.

References atomsChanged.

01039                                       {
01040   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01041   atomsChanged = 1;
01042 }

void ComputeNonbondedCUDA::build_exclusions (  )  [static]

Definition at line 259 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), cuda_bind_exclusions(), ResizeArray< Elem >::end(), exclusionsByAtom, ExclusionSignature::fullExclCnt, ExclusionSignature::fullOffset, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), j, MAX_EXCLUSIONS, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), NAMD_die(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), and x.

Referenced by build_cuda_exclusions(), and ComputeNonbondedCUDA().

00259                                             {
00260   Molecule *mol = Node::Object()->molecule;
00261 
00262 #ifdef MEM_OPT_VERSION
00263   int natoms = mol->exclSigPoolSize;
00264 #else
00265   int natoms = mol->numAtoms; 
00266 #endif
00267 
00268   delete [] exclusionsByAtom;
00269   exclusionsByAtom = new int2[natoms];
00270 
00271   // create unique sorted lists
00272 
00273   ObjectArena<int32> listArena;
00274   ResizeArray<int32*> unique_lists;
00275   int32 **listsByAtom = new int32*[natoms];
00276   SortableResizeArray<int32> curList;
00277   for ( int i=0; i<natoms; ++i ) {
00278     curList.resize(0);
00279     curList.add(0);  // always excluded from self
00280 #ifdef MEM_OPT_VERSION
00281     const ExclusionSignature *sig = mol->exclSigPool + i;
00282     int n = sig->fullExclCnt;
00283     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00284     n += 1;
00285 #else
00286     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00287     int n = mol_list[0] + 1;
00288     for ( int j=1; j<n; ++j ) {
00289       curList.add(mol_list[j] - i);
00290     }
00291 #endif
00292     curList.sort();
00293 
00294     int j;
00295     for ( j=0; j<unique_lists.size(); ++j ) {
00296       if ( n != unique_lists[j][0] ) continue;  // no match
00297       int k;
00298       for ( k=0; k<n; ++k ) {
00299         if ( unique_lists[j][k+3] != curList[k] ) break;
00300       }
00301       if ( k == n ) break;  // found match
00302     }
00303     if ( j == unique_lists.size() ) {  // no match
00304       int32 *list = listArena.getNewArray(n+3);
00305       list[0] = n;
00306       int maxdiff = 0;
00307       maxdiff = -1 * curList[0];
00308       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00309       list[1] = maxdiff;
00310       for ( int k=0; k<n; ++k ) {
00311         list[k+3] = curList[k];
00312       }
00313       unique_lists.add(list);
00314     }
00315     listsByAtom[i] = unique_lists[j];
00316   }
00317   // sort lists by maxdiff
00318   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00319   long int totalbits = 0;
00320   int nlists = unique_lists.size();
00321   for ( int j=0; j<nlists; ++j ) {
00322     int32 *list = unique_lists[j];
00323     int maxdiff = list[1];
00324     list[2] = totalbits + maxdiff;
00325     totalbits += 2*maxdiff + 1;
00326   }
00327   for ( int i=0; i<natoms; ++i ) {
00328     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00329     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00330   }
00331   delete [] listsByAtom;
00332 
00333   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00334 
00335   {
00336     long int bytesneeded = totalbits / 8;
00337     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00338     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00339                 unique_lists.size(), bytesneeded);
00340     }
00341 
00342     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00343     if ( bytesneeded > bytesavail ) {
00344       char errmsg[512];
00345       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00346                      "but only %ld bytes can be addressed with 32-bit int.",
00347                      unique_lists.size(), bytesneeded, bytesavail);
00348       NAMD_die(errmsg);
00349     }
00350   }
00351 
00352 #define SET_EXCL(EXCL,BASE,DIFF) \
00353          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00354 
00355   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00356   memset(exclusion_bits, 0, totalbits/8);
00357 
00358   long int base = 0;
00359   for ( int i=0; i<unique_lists.size(); ++i ) {
00360     base += unique_lists[i][1];
00361     if ( unique_lists[i][2] != (int32)base ) {
00362       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00363     }
00364     int n = unique_lists[i][0];
00365     for ( int j=0; j<n; ++j ) {
00366       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00367     }
00368     base += unique_lists[i][1] + 1;
00369   }
00370 
00371   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00372 
00373   delete [] exclusion_bits;
00374 }
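
A sketch (not part of the class) of how the packed bits built above might be consulted at pairlist or force time; excl_bits stands for the array passed to cuda_bind_exclusions(), and excl_start / excl_maxdiff for the per-atom exclusionsByAtom[].y and .x values.

  static inline bool is_excluded(const unsigned int *excl_bits,
                                 int excl_start, int excl_maxdiff,
                                 int id_i, int id_j) {
    int diff = id_j - id_i;                              // signed index difference
    if ( diff < -excl_maxdiff || diff > excl_maxdiff ) return false;
    unsigned int bit = excl_start + diff;                // inverse of SET_EXCL(EXCL,BASE,DIFF)
    return ( excl_bits[bit >> 5] >> (bit & 31) ) & 1;
  }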

void ComputeNonbondedCUDA::build_force_table (  )  [static]

Definition at line 114 of file ComputeNonbondedCUDA.C.

References cuda_bind_force_table(), ComputeNonbondedUtil::cutoff, diffa, f, ComputeNonbondedUtil::fast_table, FORCE_TABLE_SIZE, ComputeNonbondedUtil::r2_delta, ComputeNonbondedUtil::r2_delta_exp, ComputeNonbondedUtil::r2_table, ComputeNonbondedUtil::scor_table, table_i, ComputeNonbondedUtil::vdwa_table, ComputeNonbondedUtil::vdwb_table, and x.

Referenced by build_cuda_force_table().

00114                                              {  // static
00115 
00116   float4 t[FORCE_TABLE_SIZE];
00117   float4 et[FORCE_TABLE_SIZE];  // energy table
00118 
00119   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00120   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00121   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00122   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00123 
00124   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00125   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00126     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00127     r2list[i] = r*r + r2_delta;
00128   }
00129 
00130   union { double f; int32 i[2]; } byte_order_test;
00131   byte_order_test.f = 1.0;  // should occupy high-order bits only
00132   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00133 
00134   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00135     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00136     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00137 
00138     if ( r > cutoff ) {
00139       t[i].x = 0.;
00140       t[i].y = 0.;
00141       t[i].z = 0.;
00142       t[i].w = 0.;
00143       et[i].x = 0.;
00144       et[i].y = 0.;
00145       et[i].z = 0.;
00146       et[i].w = 0.;
00147       continue;
00148     }
00149 
00150     BigReal diffa = r2list[i] - r2_table[table_i];
00151 
00152     // coulomb 1/r or fast force
00153     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00154     {
00155       BigReal table_a = fast_table[4*table_i];
00156       BigReal table_b = fast_table[4*table_i+1];
00157       BigReal table_c = fast_table[4*table_i+2];
00158       BigReal table_d = fast_table[4*table_i+3];
00159       BigReal grad =
00160                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00161       t[i].x = 2. * grad;
00162       BigReal ener = table_a + diffa *
00163                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00164       et[i].x = ener;
00165     }
00166 
00167 
00168     // pme correction for slow force
00169     // t[i].w = 0.;
00170     {
00171       BigReal table_a = scor_table[4*table_i];
00172       BigReal table_b = scor_table[4*table_i+1];
00173       BigReal table_c = scor_table[4*table_i+2];
00174       BigReal table_d = scor_table[4*table_i+3];
00175       BigReal grad =
00176                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00177       t[i].w = 2. * grad;
00178       BigReal ener = table_a + diffa *
00179                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00180       et[i].w = ener;
00181     }
00182 
00183 
00184     // vdw 1/r^6
00185     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00186     {
00187       BigReal table_a = vdwb_table[4*table_i];
00188       BigReal table_b = vdwb_table[4*table_i+1];
00189       BigReal table_c = vdwb_table[4*table_i+2];
00190       BigReal table_d = vdwb_table[4*table_i+3];
00191       BigReal grad =
00192                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00193       t[i].y = 2. * -1. * grad;
00194       BigReal ener = table_a + diffa *
00195                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00196       et[i].y = -1. * ener;
00197     }
00198 
00199 
00200     // vdw 1/r^12
00201     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00202     {
00203       BigReal table_a = vdwa_table[4*table_i];
00204       BigReal table_b = vdwa_table[4*table_i+1];
00205       BigReal table_c = vdwa_table[4*table_i+2];
00206       BigReal table_d = vdwa_table[4*table_i+3];
00207       BigReal grad =
00208                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00209       t[i].z = 2. * grad;
00210       BigReal ener = table_a + diffa *
00211                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00212       et[i].z = ener;
00213     }
00214 
00215     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00216     //   t[i].x, t[i].y, t[i].z, t[i].w);
00217 
00218 /*
00219     double r2 = r * r;
00220     double r4 = r2 * r2;
00221     double r8 = r4 * r4;
00222 
00223     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00224     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00225     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00226     t[i].w = 0.;
00227 */
00228   }
00229 
00230   t[0].x = 0.f;
00231   t[0].y = 0.f;
00232   t[0].z = 0.f;
00233   t[0].w = 0.f;
00234   et[0].x = et[1].x;
00235   et[0].y = et[1].y;
00236   et[0].z = et[1].z;
00237   et[0].w = et[1].w;
00238 
00239   cuda_bind_force_table(t,et);
00240 
00241   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00242     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00243   }
00244 }
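
All four table slots above evaluate the same cubic segment; a minimal sketch of that pattern, where (a,b,c,d) are four consecutive coefficients from one of the *_table arrays and diffa = r2list[i] - r2_table[table_i]. The loop then stores 2*grad in t[i] (negated for the 1/r^6 term) and ener in et[i].

  static inline void eval_cubic_segment(double a, double b, double c, double d,
                                        double diffa, double &grad, double &ener) {
    grad = ( 3. * d * diffa + 2. * c ) * diffa + b;        // derivative with respect to r2
    ener = a + diffa * ( ( d * diffa + c ) * diffa + b );  // interpolated value
  }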

void ComputeNonbondedCUDA::build_lj_table (  )  [static]

Definition at line 87 of file ComputeNonbondedCUDA.C.

References cuda_bind_lj_table(), LJTable::get_table_dim(), j, ComputeNonbondedUtil::ljTable, NAMD_bug(), ComputeNonbondedUtil::scaling, and LJTable::table_val().

Referenced by build_cuda_force_table().

00087                                           {  // static
00088 
00089   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00090   const int dim = ljTable->get_table_dim();
00091 
00092   // round dim up to odd multiple of 16
00093   int tsize = (((dim+16+31)/32)*32)-16;
00094   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00095 
00096   float2 *t = new float2[tsize*tsize];
00097   float2 *row = t;
00098   for ( int i=0; i<dim; ++i, row += tsize ) {
00099     for ( int j=0; j<dim; ++j ) {
00100       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00101       row[j].x = e->A * scaling;
00102       row[j].y = e->B * scaling;
00103     }
00104   }
00105 
00106   cuda_bind_lj_table(t,tsize);
00107   delete [] t;
00108 
00109   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00110     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00111   }
00112 }
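
A lookup sketch, assuming only the row-major layout written by the loop above: the entry for vdW types (i,j) sits at row i, column j of the padded tsize x tsize table bound by cuda_bind_lj_table().

  static inline float2 lj_entry(const float2 *table, int tsize, int type_i, int type_j) {
    return table[type_i * tsize + type_j];   // .x = A * scaling, .y = B * scaling
  }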

void ComputeNonbondedCUDA::doWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 1173 of file ComputeNonbondedCUDA.C.

References activePatches, atom_params, atom_params_size, atoms, atoms_size, atomsChanged, Compute::basePriority, ResizeArray< Elem >::begin(), block_order, block_order_size, bornRadH, bornRadH_size, PatchMap::center(), CompAtom::charge, COMPUTE_PROXY_PRIORITY, computeRecords, computesChanged, COULOMB, cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_errcheck(), cudaCheck, cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, dHdrPrefixH, dHdrPrefixH_size, ComputeNonbondedUtil::dielectric_1, dummy_dev, dummy_size, energy_gbis, energy_gbis_size, exclusionsByAtom, f, finishWork(), ComputeNonbondedUtil::fixedAtomsOn, force_ready_queue, force_ready_queue_len, force_ready_queue_size, forces, forces_size, GBISP, Compute::gbisPhase, DeviceCUDA::getNoStreaming(), Patch::getNumAtoms(), hostedPatches, CompAtomExt::id, if(), intRad0H, intRad0H_size, intRadSH, intRadSH_size, SubmitReduction::item(), j, lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, master, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), npatches, num_atoms, num_local_atoms, num_remote_atoms, num_virials, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patch_record::p, pairlistsValid, pairlistTolerance, Node::parameters, patch_pair_num_ptr, patch_pairs, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, ComputeNonbondedCUDA::compute_record::pid, CompAtom::position, PROXY_DATA_PRIORITY, PROXY_RESULTS_PRIORITY, psiSumH, psiSumH_size, reduction, REDUCTION_PAIRLIST_WARNINGS, remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::resize(), Flags::savePairlists, savePairlists, ComputeNonbondedUtil::scaling, Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, slow_forces_size, slow_virials, usePairlists, vdw_types, vdw_types_size, CompAtom::vdwType, virials, virials_size, WARPSIZE, workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

01173                                   {
01174 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01175 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01176 
01177   // Set GPU device ID
01178   cudaCheck(cudaSetDevice(deviceID));
01179 
01180   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01181   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01182 
01183   if ( workStarted ) { //if work already started, check if finished
01184     if ( finishWork() ) {  // finished
01185       workStarted = 0;
01186       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01187     } else {  // need to call again
01188       workStarted = 2;
01189       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01190       if ( master == this && kernel_launch_state > 2 ) {
01191         cuda_check_local_progress(this,0.);  // launches polling
01192       }
01193     }
01194     return;
01195   }
01196 
01197   workStarted = 1;
01198   basePriority = COMPUTE_PROXY_PRIORITY;
01199 
01200   Molecule *mol = Node::Object()->molecule;
01201   Parameters *params = Node::Object()->parameters;
01202   SimParameters *simParams = Node::Object()->simParameters;
01203 
01204   //execute only during GBIS phase 1, or if not using GBIS
01205   if (!simParams->GBISOn || gbisPhase == 1) {
01206 
01207     //bind new patches to GPU
01208     if ( atomsChanged || computesChanged ) {
01209       int npatches = activePatches.size();
01210 
01211       pairlistsValid = 0;
01212       pairlistTolerance = 0.;
01213 
01214       if ( computesChanged ) {
01215         computesChanged = 0;
01216 
01217         if ( ! dummy_size ) {
01218           dummy_size = 1024*1024;
01219           cudaMalloc((void**)&dummy_dev,dummy_size);
01220           cuda_errcheck("in cudaMalloc dummy_dev");
01221         }
01222 
01223         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01224           1.2f, cudaHostAllocMapped);
01225         if (did_realloc) {
01226           for (int k=0; k < force_ready_queue_size; ++k)
01227             force_ready_queue[k] = -1;
01228         }
01229         force_ready_queue_len = npatches;
01230         reallocate_host<int>(&block_order, &block_order_size,
01231           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01232           1.2f, cudaHostAllocMapped);
01233     
01234         num_virials = npatches;
01235         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01236           1.2f, cudaHostAllocMapped);
01237         slow_virials = virials + 16*num_virials;
01238 
01239         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01240           1.2f, cudaHostAllocMapped);
01241         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01242 
01243         int *ap = activePatches.begin();
01244         for ( int i=0; i<localActivePatches.size(); ++i ) {
01245           *(ap++) = localActivePatches[i];
01246         }
01247         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01248           *(ap++) = remoteActivePatches[i];
01249         }
01250 
01251         // sort computes by distance between patches
01252         cr_sortop_distance so(lattice);
01253         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01254         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01255 
01256         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01257 
01258         if ( streaming ) {
01259           // iout << "Reverse-priority sorting...\n" << endi;
01260           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01261           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01262           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01263           patchPairsReordered = 0;
01264           //patchPairsReordered = 1;
01265       // int len = remoteComputeRecords.size();
01266       // for ( int i=0; i<len; ++i ) {
01267       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01268       // }
01269       // int len2 = localComputeRecords.size();
01270       // for ( int i=0; i<len2; ++i ) {
01271       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01272       // }
01273       // iout << endi;
01274         } else {
01275           patchPairsReordered = 1;
01276         }
01277  
01278         int nlc = localComputeRecords.size();
01279         int nrc = remoteComputeRecords.size();
01280         computeRecords.resize(nlc+nrc);
01281         compute_record *cr = computeRecords.begin();
01282         for ( int i=0; i<nrc; ++i ) {
01283           *(cr++) = remoteComputeRecords[i];
01284         }
01285         for ( int i=0; i<nlc; ++i ) {
01286           *(cr++) = localComputeRecords[i];
01287         }
01288 
01289         // patch_pair_num[i] = number of patch pairs that involve patch i
01290         patch_pair_num.resize(npatches);
01291         for ( int i=0; i<npatches; ++i ) {
01292           patchRecords[activePatches[i]].localIndex = i;
01293           patch_pair_num[i] = 0;
01294         }
01295 
01296         int ncomputes = computeRecords.size();
01297         patch_pairs.resize(ncomputes);
01298         for ( int i=0; i<ncomputes; ++i ) {
01299           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01300           int lp1 = patchRecords[cr.pid[0]].localIndex;
01301           int lp2 = patchRecords[cr.pid[1]].localIndex;
01302           patch_pair_num[lp1]++;
01303           if (lp1 != lp2) patch_pair_num[lp2]++;
01304           patch_pair &pp = patch_pairs[i];
01305           pp.offset.x = cr.offset.x;
01306           pp.offset.y = cr.offset.y;
01307           pp.offset.z = cr.offset.z;
01308         }
01309 
01310         for ( int i=0; i<ncomputes; ++i ) {
01311           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01312           int lp1 = patchRecords[cr.pid[0]].localIndex;
01313           int lp2 = patchRecords[cr.pid[1]].localIndex;
01314           patch_pair &pp = patch_pairs[i];
01315           pp.patch1_ind = lp1;
01316           pp.patch2_ind = lp2;
01317           pp.patch1_num_pairs = patch_pair_num[lp1];
01318           pp.patch2_num_pairs = patch_pair_num[lp2];
01319         }
01320 
01321       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01322         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01323                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01324                  localComputeRecords.size(), remoteComputeRecords.size());
01325       }
01326     }  // computesChanged
01327     else if ( ! patchPairsReordered ) {
01328       patchPairsReordered = 1;
01329       int len = patch_pairs.size();
01330       int nlc = localComputeRecords.size();
01331       int nrc = remoteComputeRecords.size();
01332       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01333       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01334       ResizeArray<patch_pair> new_patch_pairs(len);
01335       int irc=nrc;
01336       int ilc=len;
01337       for ( int i=0; i<len; ++i ) {
01338         int boi = block_order[i];
01339         int dest;
01340         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01341         new_computeRecords[dest] = computeRecords[boi];
01342         new_patch_pairs[dest] = patch_pairs[boi];
01343       }
01344       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01345       computeRecords.swap(new_computeRecords);
01346       patch_pairs.swap(new_patch_pairs);
01347     }
01348 
01349     int istart = 0;
01350     int i;
01351     for ( i=0; i<npatches; ++i ) {
01352       if ( i == localActivePatches.size() ) {
01353         num_local_atoms = istart;
01354       }
01355       patch_record &pr = patchRecords[activePatches[i]];
01356       pr.localStart = istart;
01357       int natoms = pr.p->getNumAtoms();
01358       int nfreeatoms = natoms;
01359       if ( fixedAtomsOn ) {
01360         const CompAtomExt *aExt = pr.xExt;
01361         for ( int j=0; j<natoms; ++j ) {
01362           if ( aExt[j].atomFixed ) --nfreeatoms;
01363         }
01364       }
01365       pr.numAtoms = natoms;
01366       pr.numFreeAtoms = nfreeatoms;
01367       istart += natoms;
01368       istart += 16 - (natoms & 15);
01369     }
01370     if ( i == localActivePatches.size() ) {
01371       num_local_atoms = istart;
01372     }
01373     num_atoms = istart;
01374     num_remote_atoms = num_atoms - num_local_atoms;
01375     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01376     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01377     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01378     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01379     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01380     if (simParams->GBISOn) {
01381       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01382       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01383       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01384       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01385       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01386       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01387     }
01388 
01389     int bfstart = 0;
01390     int exclmask_start = 0;
01391     int ncomputes = computeRecords.size();
01392     for ( int i=0; i<ncomputes; ++i ) {
01393       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01394       int p1 = cr.pid[0];
01395       int p2 = cr.pid[1];
01396       patch_pair &pp = patch_pairs[i];
01397       pp.patch1_start = patchRecords[p1].localStart;
01398       pp.patch1_size  = patchRecords[p1].numAtoms;
01399       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01400       pp.patch2_start = patchRecords[p2].localStart;
01401       pp.patch2_size  = patchRecords[p2].numAtoms;
01402       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01403       pp.plist_start = bfstart;
01404       // size1*size2 = number of patch pairs
01405       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01406       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01407       pp.plist_size = (size1*size2-1)/32+1;
01408       bfstart += pp.plist_size;
01409       pp.exclmask_start = exclmask_start;
01410       exclmask_start += size1*size2;
01411     } //for ncomputes
01412 
01413     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01414       activePatches.size(), num_atoms, bfstart, 
01415       exclmask_start);
01416 
01417   }  // atomsChanged || computesChanged
01418 
01419   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01420 
01421   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01422   float maxAtomMovement = 0.;
01423   float maxPatchTolerance = 0.;
01424 
01425   for ( int i=0; i<activePatches.size(); ++i ) {
01426     patch_record &pr = patchRecords[activePatches[i]];
01427 
01428     float maxMove = pr.p->flags.maxAtomMovement;
01429     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01430 
01431     float maxTol = pr.p->flags.pairlistTolerance;
01432     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01433 
01434     int start = pr.localStart;
01435     int n = pr.numAtoms;
01436     const CompAtom *a = pr.x;
01437     const CompAtomExt *aExt = pr.xExt;
01438     if ( atomsChanged ) {
01439 
01440       atom_param *ap = atom_params + start;
01441       for ( int k=0; k<n; ++k ) {
01442         int j = aExt[k].sortOrder;
01443         ap[k].vdw_type = a[j].vdwType;
01444         vdw_types[start + k] = a[j].vdwType;
01445         ap[k].index = aExt[j].id;
01446 #ifdef MEM_OPT_VERSION
01447         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01448         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01449 #else // ! MEM_OPT_VERSION
01450         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01451         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01452 #endif // MEM_OPT_VERSION
01453       }
01454     }
01455     {
01456 #if 1
01457       const CudaAtom *ac = pr.p->getCudaAtomList();
01458       CudaAtom *ap = atoms + start;
01459       memcpy(ap, ac, sizeof(atom)*n);
01460 #else
01461       Vector center =
01462         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01463       atom *ap = atoms + start;
01464       for ( int k=0; k<n; ++k ) {
01465         int j = aExt[k].sortOrder;
01466         ap[k].position.x = a[j].position.x - center.x;
01467         ap[k].position.y = a[j].position.y - center.y;
01468         ap[k].position.z = a[j].position.z - center.z;
01469         ap[k].charge = charge_scaling * a[j].charge;
01470       }
01471 #endif
01472     }
01473   }
01474 
01475   savePairlists = 0;
01476   usePairlists = 0;
01477   if ( flags.savePairlists ) {
01478     savePairlists = 1;
01479     usePairlists = 1;
01480   } else if ( flags.usePairlists ) {
01481     if ( ! pairlistsValid ||
01482          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01483       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01484     } else {
01485       usePairlists = 1;
01486     }
01487   }
01488   if ( ! usePairlists ) {
01489     pairlistsValid = 0;
01490   }
01491   float plcutoff = cutoff;
01492   if ( savePairlists ) {
01493     pairlistsValid = 1;
01494     pairlistTolerance = 2. * maxPatchTolerance;
01495     plcutoff += pairlistTolerance;
01496   }
01497   plcutoff2 = plcutoff * plcutoff;
01498 
01499   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01500   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01501 
01502 #if 0
01503   // calculate warp divergence
01504   if ( 1 ) {
01505     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01506     Lattice &lattice = flags.lattice;
01507     float3 lata, latb, latc;
01508     lata.x = lattice.a().x;
01509     lata.y = lattice.a().y;
01510     lata.z = lattice.a().z;
01511     latb.x = lattice.b().x;
01512     latb.y = lattice.b().y;
01513     latb.z = lattice.b().z;
01514     latc.x = lattice.c().x;
01515     latc.y = lattice.c().y;
01516     latc.z = lattice.c().z;
01517 
01518     int ncomputes = computeRecords.size();
01519     for ( int ic=0; ic<ncomputes; ++ic ) {
01520       patch_pair &pp = patch_pairs[ic];
01521       atom *a1 = atoms + pp.patch1_atom_start;
01522       int n1 = pp.patch1_size;
01523       atom *a2 = atoms + pp.patch2_atom_start;
01524       int n2 = pp.patch2_size;
01525       float offx = pp.offset.x * lata.x
01526                + pp.offset.y * latb.x
01527                + pp.offset.z * latc.x;
01528       float offy = pp.offset.x * lata.y
01529                + pp.offset.y * latb.y
01530                + pp.offset.z * latc.y;
01531       float offz = pp.offset.x * lata.z
01532                + pp.offset.y * latb.z
01533                + pp.offset.z * latc.z;
01534       // CkPrintf("%f %f %f\n", offx, offy, offz);
01535       int atoms_tried = 0;
01536       int blocks_tried = 0;
01537       int atoms_used = 0;
01538       int blocks_used = 0;
01539       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01540         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01541           int block_used = 0;
01542           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01543             int atom_used = 0;
01544             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01545               float dx = offx + a1[i].position.x - a2[j].position.x;
01546               float dy = offy + a1[i].position.y - a2[j].position.y;
01547               float dz = offz + a1[i].position.z - a2[j].position.z;
01548               float r2 = dx*dx + dy*dy + dz*dz;
01549               if ( r2 < cutoff2 ) atom_used = 1;
01550             }
01551             ++atoms_tried;
01552             if ( atom_used ) { block_used = 1; ++atoms_used; }
01553           }
01554           ++blocks_tried;
01555           if ( block_used ) { ++blocks_used; }
01556         }
01557       }
01558       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01559                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01560                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01561     }
01562   }
01563 #endif
01564 
01565   } // !GBISOn || gbisPhase == 1
01566 
01567   //Do GBIS
01568   if (simParams->GBISOn) {
01569     //open GBIS boxes depending on phase
01570     for ( int i=0; i<activePatches.size(); ++i ) {
01571       patch_record &pr = master->patchRecords[activePatches[i]];
01572       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01573       if (gbisPhase == 1) {
01574         //Copy GBIS intRadius to Host
01575         if (atomsChanged) {
01576           float *intRad0 = intRad0H + pr.localStart;
01577           float *intRadS = intRadSH + pr.localStart;
01578           for ( int k=0; k<pr.numAtoms; ++k ) {
01579             int j = pr.xExt[k].sortOrder;
01580             intRad0[k] = pr.intRad[2*j+0];
01581             intRadS[k] = pr.intRad[2*j+1];
01582           }
01583         }
01584       } else if (gbisPhase == 2) {
01585         float *bornRad = bornRadH + pr.localStart;
01586         for ( int k=0; k<pr.numAtoms; ++k ) {
01587           int j = pr.xExt[k].sortOrder;
01588           bornRad[k] = pr.bornRad[j];
01589         }
01590       } else if (gbisPhase == 3) {
01591         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01592         for ( int k=0; k<pr.numAtoms; ++k ) {
01593           int j = pr.xExt[k].sortOrder;
01594           dHdrPrefix[k] = pr.dHdrPrefix[j];
01595         }
01596       } // end phases
01597     } // end for patches
01598   } // if GBISOn
01599 
01600   kernel_time = CkWallTimer();
01601   kernel_launch_state = 1;
01602   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01603   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01604 }
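
A sketch of the tile bookkeeping in the ncomputes loop above: each patch is split into WARPSIZE-wide tiles, every tile pair of a patch pair gets one exclusion-mask slot, and pairlist bits are packed 32 per unsigned int (the names below are local to the sketch).

  int tiles1 = (patch1_size - 1) / WARPSIZE + 1;         // tiles covering patch 1
  int tiles2 = (patch2_size - 1) / WARPSIZE + 1;         // tiles covering patch 2
  int plist_words    = (tiles1 * tiles2 - 1) / 32 + 1;   // becomes pp.plist_size
  int exclmask_slots =  tiles1 * tiles2;                 // advances exclmask_start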

void ComputeNonbondedCUDA::finishPatch ( int   )  [virtual]

Reimplemented from Compute.

Definition at line 1865 of file ComputeNonbondedCUDA.C.

References activePatches, Box< Owner, Data >::close(), ComputeNonbondedCUDA::patch_record::forceBox, master, Box< Owner, Data >::open(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, and ComputeNonbondedCUDA::patch_record::x.

Referenced by finishWork().

01865                                                   {
01866   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
01867   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
01868   pr.r = pr.forceBox->open();
01869   finishPatch(pr);
01870   pr.positionBox->close(&(pr.x));
01871   pr.forceBox->close(&(pr.r));
01872 }

void ComputeNonbondedCUDA::finishReductions (  ) 

Definition at line 2029 of file ComputeNonbondedCUDA.C.

References ADD_TENSOR_OBJECT, block_order, endi(), energy_gbis, if(), iout, num_virials, Node::Object(), PROXY_DATA_PRIORITY, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_VIRIAL_NBOND, Node::simParameters, simParams, ResizeArray< Elem >::size(), virials, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

02029                                             {
02030   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02031 
02032   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02033 
02034   SimParameters *simParams = Node::Object()->simParameters;
02035 
02036   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02037     if ( patchPairsReordered++ == 2) {
02038       int patch_len = patchRecords.size();
02039       ResizeArray<int> plast(patch_len);
02040       for ( int i=0; i<patch_len; ++i ) {
02041         plast[i] = -1;
02042       }
02043       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02044       for ( int i=0; i<order_len; ++i ) {
02045         plast[computeRecords[block_order[i]].pid[0]] = i;
02046         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02047       }
02048       iout << endi;
02049       for ( int i=0; i<patch_len; ++i ) {
02050         iout << "patch_last " << i << " " << plast[i] << "\n";
02051       }
02052       iout << endi;
02053     }
02054   }
02055 
02056   {
02057     Tensor virial_tensor;
02058     BigReal energyv = 0.;
02059     BigReal energye = 0.;
02060     BigReal energys = 0.;
02061     int nexcluded = 0;
02062     for ( int i = 0; i < num_virials; ++i ) {
02063       virial_tensor.xx += virials[16*i];
02064       virial_tensor.xy += virials[16*i+1];
02065       virial_tensor.xz += virials[16*i+2];
02066       virial_tensor.yx += virials[16*i+3];
02067       virial_tensor.yy += virials[16*i+4];
02068       virial_tensor.yz += virials[16*i+5];
02069       virial_tensor.zx += virials[16*i+6];
02070       virial_tensor.zy += virials[16*i+7];
02071       virial_tensor.zz += virials[16*i+8];
02072       energyv += virials[16*i+9];
02073       energye += virials[16*i+10];
02074       energys += virials[16*i+11];
02075       nexcluded += ((int *)virials)[16*i+12];
02076       if (simParams->GBISOn) {
02077         energye += energy_gbis[i];
02078       }
02079     }
02080     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02081     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02082     if ( doEnergy ) {
02083       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02084       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02085       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02086     }
02087   }
02088   if ( doSlow ) {
02089     Tensor virial_slow_tensor;
02090     for ( int i = 0; i < num_virials; ++i ) {
02091       virial_slow_tensor.xx += slow_virials[16*i];
02092       virial_slow_tensor.xy += slow_virials[16*i+1];
02093       virial_slow_tensor.xz += slow_virials[16*i+2];
02094       virial_slow_tensor.yx += slow_virials[16*i+3];
02095       virial_slow_tensor.yy += slow_virials[16*i+4];
02096       virial_slow_tensor.yz += slow_virials[16*i+5];
02097       virial_slow_tensor.zx += slow_virials[16*i+6];
02098       virial_slow_tensor.zy += slow_virials[16*i+7];
02099       virial_slow_tensor.zz += slow_virials[16*i+8];
02100     }
02101     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02102   }
02103 
02104   reduction->submit();
02105 
02106   cuda_timer_count++;
02107   if ( simParams->outputCudaTiming &&
02108         step % simParams->outputCudaTiming == 0 ) {
02109 
02110     // int natoms = mol->numAtoms; 
02111     // double wpa = wcount;  wpa /= natoms;
02112 
02113     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02114     //  kernel_time * 1.e3, time * 1.e3, wpa);
02115 
02116 #if 0
02117     float upload_ms, remote_calc_ms;
02118     float local_calc_ms, total_ms;
02119     cuda_errcheck("before event timers");
02120     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02121     cuda_errcheck("in event timer 1");
02122     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02123     cuda_errcheck("in event timer 2");
02124     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02125     cuda_errcheck("in event timer 3");
02126     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02127     cuda_errcheck("in event timer 4");
02128     cuda_errcheck("in event timers");
02129 
02130     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02131              CkMyPe(), upload_ms, remote_calc_ms,
02132              local_calc_ms, total_ms);
02133 #endif
02134 
02135     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02136       cuda_timer_total /= cuda_timer_count;
02137       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02138                step, cuda_timer_total * 1.e3, CkMyPe());
02139     }
02140     cuda_timer_count = 0;
02141     cuda_timer_total = 0;
02142   }
02143 
02144 }
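
The loops above assume a fixed per-patch layout in the mapped virials buffer: each of the num_virials patches owns a 16-float slot, and slow_virials uses the same stride. A hypothetical view of one slot (not a type used by NAMD), matching the indices read in finishReductions():

  struct virial_slot {
    float vxx, vxy, vxz;            // indices 0-2
    float vyx, vyy, vyz;            // indices 3-5
    float vzx, vzy, vzz;            // indices 6-8
    float energy_vdw;               // index 9,  added to REDUCTION_LJ_ENERGY
    float energy_elect;             // index 10, added to REDUCTION_ELECT_ENERGY
    float energy_slow;              // index 11, added to REDUCTION_ELECT_ENERGY_SLOW
    float excl_checksum;            // index 12, read back as an int
    float pad[3];                   // indices 13-15
  };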

int ComputeNonbondedCUDA::finishWork (  ) 

Definition at line 1903 of file ComputeNonbondedCUDA.C.

References computeMgr, dEdaSumH, finishPatch(), ComputeNonbondedCUDA::patch_record::forceBox, GBISP, Compute::gbisPhase, if(), j, localHostedPatches, master, ComputeNonbondedUtil::mol, Node::molecule, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::patchID, patchRecords, Compute::priority(), psiSumH, ComputeNonbondedCUDA::patch_record::r, remoteHostedPatches, ComputeMgr::sendNonbondedCUDASlaveEnqueue(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slavePes, slaves, and workStarted.

Referenced by doWork().

01903                                      {
01904   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
01905 
01906   if ( master == this ) {
01907     for ( int i = 0; i < numSlaves; ++i ) {
01908       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
01909     }
01910   }
01911 
01912 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
01913 CkMyPe(), workStarted, gbisPhase)
01914 
01915   Molecule *mol = Node::Object()->molecule;
01916   SimParameters *simParams = Node::Object()->simParameters;
01917 
01918   ResizeArray<int> &patches( workStarted == 1 ?
01919                                 remoteHostedPatches : localHostedPatches );
01920 
01921   // long long int wcount = 0;
01922   // double virial = 0.;
01923   // double virial_slow = 0.;
01924   for ( int i=0; i<patches.size(); ++i ) {
01925     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
01926     patch_record &pr = master->patchRecords[patches[i]];
01927 
01928     if ( !simParams->GBISOn || gbisPhase == 1 ) {
01929       patch_record &pr = master->patchRecords[patches[i]];
01930 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
01931       pr.r = pr.forceBox->open();
01932     } // !GBISOn || gbisPhase==1
01933 
01934     int start = pr.localStart;
01935     const CompAtomExt *aExt = pr.xExt;
01936     if ( !simParams->GBISOn || gbisPhase == 3 ) {
01937       finishPatch(pr);
01938     } // !GBISOn || gbisPhase == 3
01939 
01940 #if 0
01941     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
01942       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
01943         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
01944         af[j].w);
01945     }
01946 #endif
01947 
01948     //Close Boxes depending on Phase
01949     if (simParams->GBISOn) {
01950       if (gbisPhase == 1) {
01951         //Copy psiSum from Host to Patch Box
01952         GBReal *psiSumMaster = master->psiSumH + start;
01953         for ( int k=0; k<pr.numAtoms; ++k ) {
01954           int j = aExt[k].sortOrder;
01955           pr.psiSum[j] += psiSumMaster[k];
01956         }
01957 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
01958         pr.psiSumBox->close(&(pr.psiSum));
01959 
01960       } else if (gbisPhase == 2) {
01961         //Copy dEdaSum from Host to Patch Box
01962         GBReal *dEdaSumMaster = master->dEdaSumH + start;
01963         for ( int k=0; k<pr.numAtoms; ++k ) {
01964           int j = aExt[k].sortOrder;
01965           pr.dEdaSum[j] += dEdaSumMaster[k];
01966         }
01967 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
01968         pr.dEdaSumBox->close(&(pr.dEdaSum));
01969 
01970       } else if (gbisPhase == 3) {
01971 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
01972         pr.intRadBox->close(&(pr.intRad)); //box 6
01973         pr.bornRadBox->close(&(pr.bornRad)); //box 7
01974         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
01975         pr.positionBox->close(&(pr.x)); //box 0
01976         pr.forceBox->close(&(pr.r));
01977       } //end phases
01978     } else { //not GBIS
01979 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
01980       pr.positionBox->close(&(pr.x));
01981       pr.forceBox->close(&(pr.r));
01982     }
01983   }// end for
01984 
01985 #if 0
01986   virial *= (-1./6.);
01987   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
01988   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
01989   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
01990   if ( doSlow ) {
01991     virial_slow *= (-1./6.);
01992     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
01993     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
01994     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
01995   }
01996 #endif
01997 
01998   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
01999        ( localHostedPatches.size() || master == this ) ) {
02000     GBISP("not finished, call again\n");
02001     return 0;  // not finished, call again
02002   }
02003 
02004   if ( master != this ) {  // finished
02005     GBISP("finished\n");
02006     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02007     atomsChanged = 0;
02008     return 1;
02009   }
02010 
02011   cuda_timer_total += kernel_time;
02012 
02013   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02014 
02015     atomsChanged = 0;
02016     finishReductions();
02017 
02018   } // !GBISOn || gbisPhase==3  
02019 
02020   // Next GBIS Phase
02021 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02022     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02023 
02024   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02025   return 1;  // finished and ready for next step
02026 }
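
In GBIS phases 1 and 2 the loop above scatters the staged per-atom sums (psiSumMaster, dEdaSumMaster) back through aExt[k].sortOrder before closing the corresponding boxes, and the phase counter then advances as gbisPhase = 1 + (gbisPhase % 3), cycling 1 -> 2 -> 3 -> 1. The following stand-alone sketch shows just the sortOrder scatter, with simplified stand-ins for NAMD's GBReal and CompAtomExt types.

    #include <cstdio>

    struct AtomExtStub { int sortOrder; };   // stand-in for CompAtomExt::sortOrder

    static void scatterBySortOrder(const float *staged,     // host staging array (e.g. psiSumMaster)
                                   const AtomExtStub *aExt, // per-atom mapping, as pr.xExt above
                                   float *dest,             // destination array (e.g. pr.psiSum)
                                   int numAtoms) {
      for ( int k = 0; k < numAtoms; ++k ) {
        int j = aExt[k].sortOrder;   // slot of entry k in the destination ordering
        dest[j] += staged[k];        // mirrors pr.psiSum[j] += psiSumMaster[k]
      }
    }

    int main() {
      AtomExtStub aExt[3] = { {2}, {0}, {1} };
      float staged[3] = { 10.f, 20.f, 30.f };
      float dest[3]   = { 0.f, 0.f, 0.f };
      scatterBySortOrder(staged, aExt, dest, 3);
      printf("%f %f %f\n", dest[0], dest[1], dest[2]);   // prints 20 30 10
      return 0;
    }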

void ComputeNonbondedCUDA::messageFinishPatch ( int  flindex ) 

Definition at line 1858 of file ComputeNonbondedCUDA.C.

References activePatches, computeMgr, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::msg, patchRecords, PROXY_DATA_PRIORITY, ComputeMgr::sendNonbondedCUDASlaveEnqueuePatch(), Compute::sequence(), and ComputeNonbondedCUDA::patch_record::slave.

01858                                                          {
01859   int pid = activePatches[flindex];
01860   patch_record &pr = patchRecords[pid];
01861   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
01862   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
01863 }

int ComputeNonbondedCUDA::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 1109 of file ComputeNonbondedCUDA.C.

References atomsChanged, computeMgr, Flags::doEnergy, doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, doSlow, GBISP, Compute::gbisPhase, hostedPatches, Flags::lattice, lattice, master, masterPe, numSlaves, Node::Object(), patchRecords, reduction, ComputeMgr::sendNonbondedCUDASlaveReady(), ComputeMgr::sendNonbondedCUDASlaveSkip(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), skip(), slavePes, slaves, Flags::step, step, SubmitReduction::submit(), and workStarted.

01109                                  {
01110 
01111   SimParameters *simParams = Node::Object()->simParameters;
01112   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01113   lattice = flags.lattice;
01114   doSlow = flags.doFullElectrostatics;
01115   doEnergy = flags.doEnergy;
01116   step = flags.step;
01117 
01118   if ( ! flags.doNonbonded ) {
01119     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01120     if ( master != this ) {
01121       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01122                         hostedPatches.size(),atomsChanged,sequence());
01123     } else {
01124       for ( int i = 0; i < numSlaves; ++i ) {
01125         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01126       }
01127       skip();
01128     }
01129     if ( reduction ) {
01130        reduction->submit();
01131     }
01132 
01133     return 1;
01134   }
01135 
01136   for ( int i=0; i<hostedPatches.size(); ++i ) {
01137     patch_record &pr = master->patchRecords[hostedPatches[i]];
01138     if (!simParams->GBISOn || gbisPhase == 1) {
01139       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01140       pr.x = pr.positionBox->open();
01141       pr.xExt = pr.p->getCompAtomExtInfo();
01142     }
01143 
01144     if (simParams->GBISOn) {
01145       if (gbisPhase == 1) {
01146         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01147         pr.intRad     = pr.intRadBox->open();
01148         pr.psiSum     = pr.psiSumBox->open();
01149       } else if (gbisPhase == 2) {
01150         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01151         pr.bornRad    = pr.bornRadBox->open();
01152         pr.dEdaSum    = pr.dEdaSumBox->open();
01153       } else if (gbisPhase == 3) {
01154         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01155         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01156       }
01157       GBISP("opened GBIS boxes");
01158     }
01159   }
01160 
01161   if ( master == this ) return 0; //work to do, enqueue as usual
01162 
01163   // message masterPe
01164   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01165                         hostedPatches.size(),atomsChanged,sequence());
01166   atomsChanged = 0;
01167 
01168   workStarted = 1;
01169 
01170   return 1;
01171 }
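
Together with finishWork() above, noWork() implements the per-step box discipline: positions (and, under GBIS, the phase-specific boxes) are opened here before any work is enqueued, the force and sum boxes are closed in finishWork() once results are written back, and skip() (documented below) consumes steps with no nonbonded work. The following stand-alone sketch condenses that open/close/skip cycle; MockBox is a simplified stand-in for NAMD's Box<Owner,Data>, not the real type.

    #include <cstdio>

    struct MockBox {
      float data[3] = { 0.f, 0.f, 0.f };
      float *open()          { return data; }    // claim the data for this step
      void close(float **p)  { *p = nullptr; }   // release it so the patch can advance
      void skip()            { /* consume the step without touching the data */ }
    };

    int main() {
      MockBox positionBox, forceBox;
      // noWork():     positions are picked up before any work is enqueued
      float *x = positionBox.open();
      // finishWork(): the force deposit is opened, filled, and both boxes are closed
      float *r = forceBox.open();
      r[0] = 1.0f;                     // stand-in for finishPatch() copying per-atom forces
      positionBox.close(&x);
      forceBox.close(&r);
      // skip():       on steps with no nonbonded work, every registered box is skipped instead
      positionBox.skip();
      forceBox.skip();
      printf("deposited force: %f (positions released: %s)\n", forceBox.data[0], x ? "no" : "yes");
      return 0;
    }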

void ComputeNonbondedCUDA::recvYieldDevice ( int  pe  ) 

Definition at line 1634 of file ComputeNonbondedCUDA.C.

References Lattice::a(), atom_params, atoms, atomsChanged, Lattice::b(), block_order, bornRadH, Lattice::c(), CcdCallBacksReset(), cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_vdw_types(), cuda_bind_virials(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), CUDA_POLL, ComputeNonbondedUtil::cutoff2, dEdaSumH, deviceCUDA, deviceID, dHdrPrefixH, doEnergy, doSlow, dummy_dev, dummy_size, end_local_download, end_remote_download, energy_gbis, force_ready_queue, force_ready_queue_next, forces, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), intRad0H, intRadSH, lattice, localActivePatches, localComputeRecords, Node::Object(), patchPairsReordered, plcutoff2, psiSumH, remote_submit_time, remoteActivePatches, remoteComputeRecords, savePairlists, Compute::sequence(), DeviceCUDA::setGpuIsMine(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, stream, stream2, usePairlists, vdw_types, virials, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by ComputeMgr::recvYieldDevice().

01634                                                  {
01635 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01636 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01637 workStarted, gbisPhase, kernel_launch_state, pe)
01638 
01639   float3 lata, latb, latc;
01640   lata.x = lattice.a().x;
01641   lata.y = lattice.a().y;
01642   lata.z = lattice.a().z;
01643   latb.x = lattice.b().x;
01644   latb.y = lattice.b().y;
01645   latb.z = lattice.b().z;
01646   latc.x = lattice.c().x;
01647   latc.y = lattice.c().y;
01648   latc.z = lattice.c().z;
01649   SimParameters *simParams = Node::Object()->simParameters;
01650 
01651   // Set GPU device ID
01652   cudaSetDevice(deviceID);
01653 
01654   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01655 
01656   double walltime;
01657   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01658     walltime = CkWallTimer();
01659     CcdCallBacksReset(0,walltime);  // fix Charm++
01660   }
01661 
01662   switch ( kernel_launch_state ) {
01664 // Remote
01665   case 1:
01666 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01667     ++kernel_launch_state;
01668     //gpu_is_mine = 0;
01669     deviceCUDA->setGpuIsMine(0);
01670     remote_submit_time = walltime;
01671 
01672     if (!simParams->GBISOn || gbisPhase == 1) {
01673       // cudaEventRecord(start_upload, stream);
01674       if ( atomsChanged ) {
01675         cuda_bind_atom_params(atom_params);
01676         cuda_bind_vdw_types(vdw_types);
01677       }
01678       if ( simParams->GBISOn) {
01679         cuda_bind_GBIS_psiSum(psiSumH);
01681         if ( atomsChanged ) {
01681           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
01682         }
01683       }
01684       atomsChanged = 0;
01685       cuda_bind_atoms((const atom *)atoms);
01686       cuda_bind_forces(forces, slow_forces);
01687       cuda_bind_virials(virials, force_ready_queue, block_order);
01688       if ( simParams->GBISOn) {
01689         cuda_bind_GBIS_energy(energy_gbis);
01690       }
01691       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01692       //call CUDA Kernels
01693 
01694       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01695                             0,remoteComputeRecords.size(),
01696                             remoteComputeRecords.size()+localComputeRecords.size(),
01697                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
01698       if (simParams->GBISOn) {
01699         cuda_GBIS_P1(
01700           0,remoteComputeRecords.size(),
01701                       localActivePatches.size(),remoteActivePatches.size(),
01702                       simParams->alpha_cutoff-simParams->fsMax,
01703                       simParams->coulomb_radius_offset,
01704                       lata, latb, latc, stream);
01705       }
01706       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01707       //    num_local_atom_records,num_remote_atom_records);
01708       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01709         cudaEventRecord(end_remote_download, stream);
01710       }
01711       if ( streaming ) {
01712         force_ready_queue_next = 0;
01713         CUDA_POLL(cuda_check_progress,this);
01714       } else {
01715         CUDA_POLL(cuda_check_remote_progress,this);
01716       }
01717       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01718         CUDA_POLL(cuda_check_remote_calc,this);
01719         break;
01720       }
01721     } // !GBIS or gbisPhase==1
01722     if (simParams->GBISOn) {
01723       if (gbisPhase == 1) {
01724         //GBIS P1 Kernel launched in previous code block
01725       } else if (gbisPhase == 2) {
01726 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
01727         // cudaEventRecord(start_upload, stream);
01728         cuda_bind_GBIS_bornRad(bornRadH);
01729         cuda_bind_GBIS_dEdaSum(dEdaSumH);
01730         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01731         cuda_GBIS_P2(
01732           0,remoteComputeRecords.size(),
01733           localActivePatches.size(),remoteActivePatches.size(),
01734           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01735           simParams->nonbondedScaling, simParams->kappa,
01736           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01737           simParams->dielectric, simParams->solvent_dielectric,
01738           lata, latb, latc,
01739           doEnergy, doSlow, stream
01740           );
01741         cudaEventRecord(end_remote_download, stream);
01742         CUDA_POLL(cuda_check_remote_progress,this);
01743       } else if (gbisPhase == 3) {
01744 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
01745         // cudaEventRecord(start_upload, stream);
01746         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
01747         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
01748         if (doSlow)
01749         cuda_GBIS_P3(
01750           0,remoteComputeRecords.size(),
01751           localActivePatches.size(),remoteActivePatches.size(),
01752           (simParams->alpha_cutoff-simParams->fsMax),
01753           simParams->coulomb_radius_offset,
01754           simParams->nonbondedScaling,
01755           lata, latb, latc, stream
01756           );
01757         cudaEventRecord(end_remote_download, stream);
01758         CUDA_POLL(cuda_check_remote_progress,this);
01759       }
01760     }
01761 
01763 // Local
01764   case 2:
01765 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
01766     ++kernel_launch_state;
01767     //gpu_is_mine = 0;
01768     deviceCUDA->setGpuIsMine(0);
01769 
01770     if ( stream2 != stream ) {
01771       // needed to ensure that upload is finished
01772       cudaStreamWaitEvent(stream2, start_calc, 0);
01773       // priorities do not prevent local from launching ahead
01774       // of remote, so delay local stream with a small memset
01775       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
01776     }
01777 
01778     if (!simParams->GBISOn || gbisPhase == 1) {
01779 
01780       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
01781                             remoteComputeRecords.size(),localComputeRecords.size(),
01782                             remoteComputeRecords.size()+localComputeRecords.size(),
01783                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
01784       if (simParams->GBISOn) {
01785         cuda_GBIS_P1(
01786                      remoteComputeRecords.size(),localComputeRecords.size(),
01787                      0,localActivePatches.size(),
01788                      simParams->alpha_cutoff-simParams->fsMax,
01789                      simParams->coulomb_radius_offset,
01790                      lata, latb, latc, stream2 );
01791       }
01792       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
01793       //    0,num_local_atom_records);
01794       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
01795       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
01796         cudaEventRecord(end_local_download, stream2);
01797       }
01798       if ( ! streaming && workStarted == 2 ) {
01799         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01800 cuda_check_local_progress\n", CkMyPe())
01801         CUDA_POLL(cuda_check_local_progress,this);
01802       }
01803       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
01804         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
01805 cuda_check_local_calc\n", CkMyPe())
01806           CUDA_POLL(cuda_check_local_calc,this);
01807         break;
01808       }
01809 
01810     } // !GBIS or gbisPhase==1
01811     if (simParams->GBISOn) {
01812       if (gbisPhase == 1) {
01813         //GBIS P1 Kernel launched in previous code block
01814       } else if (gbisPhase == 2) {
01815 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
01816         cuda_GBIS_P2(
01817           remoteComputeRecords.size(),localComputeRecords.size(),
01818           0,localActivePatches.size(),
01819           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
01820           simParams->nonbondedScaling, simParams->kappa,
01821           (simParams->switchingActive ? simParams->switchingDist : -1.0),
01822           simParams->dielectric, simParams->solvent_dielectric,
01823           lata, latb, latc,
01824           doEnergy, doSlow, stream2
01825           );
01826         cudaEventRecord(end_local_download, stream2);
01827         if ( workStarted == 2 ) {
01828           CUDA_POLL(cuda_check_local_progress,this);
01829         }
01830       } else if (gbisPhase == 3) {
01831 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
01832         if (doSlow)
01833         cuda_GBIS_P3(
01834           remoteComputeRecords.size(),localComputeRecords.size(),
01835           0,localActivePatches.size(),
01836           (simParams->alpha_cutoff-simParams->fsMax),
01837           simParams->coulomb_radius_offset,
01838           simParams->nonbondedScaling,
01839           lata, latb, latc, stream2
01840           );
01841         cudaEventRecord(end_local_download, stream2);
01842         if ( workStarted == 2 ) {
01843           CUDA_POLL(cuda_check_local_progress,this);
01844         }
01845       } // phases
01846     } // GBISOn
01847     if ( simParams->PMEOn && simParams->PMEOffload  && !simParams->usePMECUDA) break;
01848 
01849   default:
01850 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
01851     //gpu_is_mine = 1;
01852     deviceCUDA->setGpuIsMine(1);
01853     break;
01854   } // switch
01855 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
01856 }
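
Case 2 above orders the local stream behind the shared upload: start_calc is recorded on stream in case 1, stream2 waits on it with cudaStreamWaitEvent, and a small cudaMemsetAsync on dummy_dev further delays the local launch, since stream priorities alone do not keep local kernels from starting ahead of remote ones. Below is a minimal sketch of that two-stream ordering; the stream, event, and buffer names are illustrative rather than NAMD's.

    #include <cuda_runtime.h>

    // Orders local-stream work behind the upload issued on the remote stream.
    static void orderLocalAfterUpload(cudaStream_t remote_stream, cudaStream_t local_stream,
                                      void *scratch_dev, size_t scratch_size) {
      cudaEvent_t upload_done;
      cudaEventCreate(&upload_done);

      // ... atom/parameter uploads are enqueued on remote_stream here ...
      cudaEventRecord(upload_done, remote_stream);        // marks the end of the shared upload
      // ... remote kernels are enqueued on remote_stream after the mark ...

      cudaStreamWaitEvent(local_stream, upload_done, 0);  // local stream waits for the upload mark
      cudaMemsetAsync(scratch_dev, 0, scratch_size, local_stream);  // tiny op delays the local launch
      // ... local kernels are enqueued on local_stream from this point on ...

      cudaEventDestroy(upload_done);
    }

    int main() {
      cudaStream_t remote_stream, local_stream;
      cudaStreamCreate(&remote_stream);
      cudaStreamCreate(&local_stream);
      void *scratch_dev = 0;
      cudaMalloc(&scratch_dev, 128);
      orderLocalAfterUpload(remote_stream, local_stream, scratch_dev, 128);
      cudaDeviceSynchronize();
      cudaFree(scratch_dev);
      cudaStreamDestroy(remote_stream);
      cudaStreamDestroy(local_stream);
      return 0;
    }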

void ComputeNonbondedCUDA::registerPatches (  ) 

Definition at line 615 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), ComputeNonbondedCUDA::patch_record::bornRadBox, ProxyMgr::createProxy(), ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, hostedPatches, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::intRadBox, ComputeNonbondedCUDA::patch_record::isLocal, localHostedPatches, master, masterPe, ComputeNonbondedCUDA::patch_record::msg, npatches, ProxyMgr::Object(), Node::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, PRIORITY_SIZE, ComputeNonbondedCUDA::patch_record::psiSumBox, Patch::registerBornRadPickup(), Patch::registerDEdaSumDeposit(), Patch::registerDHdrPrefixPickup(), Patch::registerForceDeposit(), Patch::registerIntRadPickup(), Patch::registerPositionPickup(), Patch::registerPsiSumDeposit(), remoteHostedPatches, Compute::setNumPatches(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and ComputeNonbondedCUDA::patch_record::slave.

Referenced by assignPatches(), and ComputeNonbondedCUDA().

00615                                            {
00616 
00617   SimParameters *simParams = Node::Object()->simParameters;
00618   int npatches = master->activePatches.size();
00619   int *pids = master->activePatches.begin();
00620   patch_record *recs = master->patchRecords.begin();
00621   for ( int i=0; i<npatches; ++i ) {
00622     int pid = pids[i];
00623     patch_record &pr = recs[pid];
00624     if ( pr.hostPe == CkMyPe() ) {
00625       pr.slave = this;
00626       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00627       hostedPatches.add(pid);
00628       if ( pr.isLocal ) {
00629         localHostedPatches.add(pid);
00630       } else {
00631         remoteHostedPatches.add(pid);
00632       }
00633       ProxyMgr::Object()->createProxy(pid);
00634       pr.p = patchMap->patch(pid);
00635       pr.positionBox = pr.p->registerPositionPickup(this);
00636       pr.forceBox = pr.p->registerForceDeposit(this);
00637       if (simParams->GBISOn) {
00638         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00639         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00640         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00641         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00642         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00643       }
00644     }
00645   }
00646   if ( master == this ) setNumPatches(activePatches.size());
00647   else setNumPatches(hostedPatches.size());
00648   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00649   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00650 }

void ComputeNonbondedCUDA::requirePatch ( int  pid  ) 

Definition at line 579 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ComputeNonbondedCUDA::patch_record::bornRad, computesChanged, ComputeNonbondedCUDA::patch_record::dEdaSum, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::f, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::index_a(), PatchMap::index_b(), PatchMap::index_c(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::isLocal, ComputeNonbondedCUDA::patch_record::isSameNode, ComputeNonbondedCUDA::patch_record::isSamePhysicalNode, localActivePatches, PatchMap::node(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::r, ComputeNonbondedCUDA::patch_record::refCount, remoteActivePatches, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

00579                                                {
00580 
00581   computesChanged = 1;
00582   patch_record &pr = patchRecords[pid];
00583   if ( pr.refCount == 0 ) {
00584     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00585     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00586     if ( deviceCUDA->getMergeGrids() ) {
00587       pr.isLocal = 0;
00588     } else if ( CkNumNodes() < 2 ) {
00589       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00590          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00591     } else {
00592       pr.isLocal = pr.isSameNode;
00593     }
00594     if ( pr.isLocal ) {
00595       localActivePatches.add(pid);
00596     } else {
00597       remoteActivePatches.add(pid);
00598     }
00599     activePatches.add(pid);
00600     pr.patchID = pid;
00601     pr.hostPe = -1;
00602     pr.x = NULL;
00603     pr.xExt = NULL;
00604     pr.r = NULL;
00605     pr.f = NULL;
00606     pr.intRad      = NULL;
00607     pr.psiSum      = NULL;
00608     pr.bornRad     = NULL;
00609     pr.dEdaSum     = NULL;
00610     pr.dHdrPrefix  = NULL;
00611   }
00612   pr.refCount += 1;
00613 }
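
For a single-node run without merged grids, the expression 1 & ( 1 ^ index_a ^ index_b ^ index_c ) above marks a patch as local exactly when the sum of its grid indices is even, splitting the patch grid into a three-dimensional checkerboard between localActivePatches and remoteActivePatches. A small worked example of that parity test, with arbitrary example indices:

    #include <cstdio>

    int main() {
      int index_a = 2, index_b = 1, index_c = 1;               // example patch indices
      int isLocal = 1 & ( 1 ^ index_a ^ index_b ^ index_c );   // same expression as above
      printf("patch (2,1,1) -> %s\n", isLocal ? "localActivePatches" : "remoteActivePatches");
      // 2 + 1 + 1 = 4 is even, so isLocal == 1; changing any single index flips the result.
      return 0;
    }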

void ComputeNonbondedCUDA::skip (  ) 

Definition at line 1092 of file ComputeNonbondedCUDA.C.

References hostedPatches, master, Node::Object(), patchRecords, Node::simParameters, simParams, and ResizeArray< Elem >::size().

Referenced by noWork(), and ComputeMgr::recvNonbondedCUDASlaveSkip().

01092                                 {
01093   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01094   SimParameters *simParams = Node::Object()->simParameters;
01095   for ( int i=0; i<hostedPatches.size(); ++i ) {
01096     patch_record &pr = master->patchRecords[hostedPatches[i]];
01097     pr.positionBox->skip();
01098     pr.forceBox->skip();
01099     if (simParams->GBISOn) {
01100       pr.intRadBox->skip();
01101       pr.psiSumBox->skip();
01102       pr.bornRadBox->skip();
01103       pr.dEdaSumBox->skip();
01104       pr.dHdrPrefixBox->skip();
01105     }
01106   }
01107 }


Member Data Documentation

ResizeArray<int> ComputeNonbondedCUDA::activePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), doWork(), finishPatch(), messageFinishPatch(), registerPatches(), and requirePatch().

AtomMap* ComputeNonbondedCUDA::atomMap

Definition at line 119 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

CudaAtom* ComputeNonbondedCUDA::atoms

Definition at line 142 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::atoms_size

Definition at line 141 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::atomsChanged

Definition at line 131 of file ComputeNonbondedCUDA.h.

Referenced by atomUpdate(), ComputeNonbondedCUDA(), doWork(), noWork(), and recvYieldDevice().

ResizeArray<compute_record> ComputeNonbondedCUDA::computeRecords

Definition at line 98 of file ComputeNonbondedCUDA.h.

Referenced by doWork().

int ComputeNonbondedCUDA::computesChanged

Definition at line 132 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and requirePatch().

GBReal* ComputeNonbondedCUDA::dEdaSumH

Definition at line 111 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::dEdaSumH_size

Definition at line 110 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::deviceID

Definition at line 116 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doEnergy

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by noWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doSlow

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by noWork(), and recvYieldDevice().

float4* ComputeNonbondedCUDA::forces

Definition at line 102 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::forces_size

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

ResizeArray<int> ComputeNonbondedCUDA::hostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), registerPatches(), and skip().

Lattice ComputeNonbondedCUDA::lattice

Definition at line 79 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), and recvYieldDevice().

ResizeArray<int> ComputeNonbondedCUDA::localActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::localComputeRecords

Definition at line 99 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

ResizeArray<int> ComputeNonbondedCUDA::localHostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), registerPatches(), and ComputeMgr::sendNonbondedCUDASlaveEnqueue().

LocalWorkMsg* ComputeNonbondedCUDA::localWorkMsg2

Definition at line 76 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and ComputeMgr::sendNonbondedCUDASlaveEnqueue().

ComputeNonbondedCUDA* ComputeNonbondedCUDA::master

Definition at line 124 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishPatch(), finishWork(), noWork(), registerPatches(), and skip().

int ComputeNonbondedCUDA::masterPe

Definition at line 125 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), noWork(), and registerPatches().

int ComputeNonbondedCUDA::numSlaves

Definition at line 129 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), finishWork(), and noWork().

int ComputeNonbondedCUDA::pairlistsValid

Definition at line 135 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

float ComputeNonbondedCUDA::pairlistTolerance

Definition at line 136 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

PatchMap* ComputeNonbondedCUDA::patchMap

Definition at line 118 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), register_cuda_compute_pair(), registerPatches(), and requirePatch().

int ComputeNonbondedCUDA::patchPairsReordered

Definition at line 133 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

ResizeArray<patch_record> ComputeNonbondedCUDA::patchRecords

Definition at line 97 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), finishPatch(), finishWork(), messageFinishPatch(), noWork(), register_cuda_compute_pair(), register_cuda_compute_self(), registerPatches(), requirePatch(), and skip().

float ComputeNonbondedCUDA::plcutoff2

Definition at line 139 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and recvYieldDevice().

GBReal* ComputeNonbondedCUDA::psiSumH

Definition at line 108 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::psiSumH_size

Definition at line 107 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

SubmitReduction* ComputeNonbondedCUDA::reduction

Definition at line 120 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), and noWork().

ResizeArray<int> ComputeNonbondedCUDA::remoteActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::remoteComputeRecords

Definition at line 99 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

ResizeArray<int> ComputeNonbondedCUDA::remoteHostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), and registerPatches().

int ComputeNonbondedCUDA::savePairlists

Definition at line 138 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slaveIndex

Definition at line 126 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

int* ComputeNonbondedCUDA::slavePes

Definition at line 128 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

ComputeNonbondedCUDA** ComputeNonbondedCUDA::slaves

Definition at line 127 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

float4* ComputeNonbondedCUDA::slow_forces

Definition at line 105 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slow_forces_size

Definition at line 104 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::step

Definition at line 81 of file ComputeNonbondedCUDA.h.

Referenced by noWork().

int ComputeNonbondedCUDA::usePairlists

Definition at line 137 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::workStarted

Definition at line 78 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), noWork(), and recvYieldDevice().


The documentation for this class was generated from the following files:
ComputeNonbondedCUDA.h
ComputeNonbondedCUDA.C
Generated on Sun Feb 25 01:17:19 2018 for NAMD by  doxygen 1.4.7