ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA: inherits Compute and ComputeNonbondedUtil.

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr, ComputeNonbondedCUDA *m=0, int idx=-1)
 ~ComputeNonbondedCUDA ()
void atomUpdate ()
void doWork ()
int noWork ()
void skip ()
void recvYieldDevice (int pe)
int finishWork ()
void finishReductions ()
void finishPatch (int)
void messageFinishPatch (int)
void requirePatch (int pid)
void assignPatches ()
void registerPatches ()

Static Public Member Functions

static void build_lj_table ()
static void build_force_table ()
static void build_exclusions ()

Public Attributes

LocalWorkMsg * localWorkMsg2
int workStarted
Lattice lattice
int doSlow
int doEnergy
int step
ResizeArray< int > activePatches
ResizeArray< int > localActivePatches
ResizeArray< int > remoteActivePatches
ResizeArray< int > hostedPatches
ResizeArray< int > localHostedPatches
ResizeArray< int > remoteHostedPatches
ResizeArray< patch_record > patchRecords
ResizeArray< compute_record > computeRecords
ResizeArray< compute_record > localComputeRecords
ResizeArray< compute_record > remoteComputeRecords
int forces_size
float4 * forces
int slow_forces_size
float4 * slow_forces
int psiSumH_size
GBReal * psiSumH
int dEdaSumH_size
GBReal * dEdaSumH
int deviceID
PatchMap * patchMap
AtomMap * atomMap
SubmitReduction * reduction
ComputeNonbondedCUDA * master
int masterPe
int slaveIndex
ComputeNonbondedCUDA ** slaves
int * slavePes
int numSlaves
int atomsChanged
int computesChanged
int patchPairsReordered
int pairlistsValid
float pairlistTolerance
int usePairlists
int savePairlists
float plcutoff2
int atoms_size
CudaAtom * atoms

Classes

struct  compute_record
struct  patch_record

Detailed Description

Definition at line 20 of file ComputeNonbondedCUDA.h.


Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ( ComputeID  c,
ComputeMgr *  mgr,
ComputeNonbondedCUDA *  m = 0,
int  idx = -1 
)

Definition at line 761 of file ComputeNonbondedCUDA.C.

References atomMap, atoms, atoms_size, atomsChanged, Compute::basePriority, build_exclusions(), computeMgr, computesChanged, cuda_errcheck(), cuda_init(), cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, end_local_download, end_remote_download, forces, forces_size, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), init_arrays(), localWorkMsg2, master, masterPe, max_grid_size, NAMD_bug(), NAMD_die(), PatchMap::numPatches(), Node::Object(), AtomMap::Object(), PatchMap::Object(), pairlistsValid, pairlistTolerance, patch_pair_num_ptr, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, SimParameters::pressureProfileOn, PRIORITY_SIZE, PROXY_DATA_PRIORITY, psiSumH, psiSumH_size, reduction, registerPatches(), savePairlists, DeviceCUDA::setGpuIsMine(), DeviceCUDA::setMergeGrids(), Node::simParameters, slaveIndex, slavePes, slaves, slow_forces, slow_forces_size, start_calc, stream, stream2, usePairlists, SimParameters::usePMECUDA, and workStarted.

00762                                                                              : Compute(c), slaveIndex(idx) {
00763 #ifdef PRINT_GBIS
00764    CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
00765 #endif
00766 
00767   if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
00768   if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
00769   if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
00770 
00771   // CkPrintf("create ComputeNonbondedCUDA\n");
00772   master = m ? m : this;
00773   cudaCompute = this;
00774   computeMgr = mgr;
00775   patchMap = PatchMap::Object();
00776   atomMap = AtomMap::Object();
00777   reduction = 0;
00778 
00779   SimParameters *params = Node::Object()->simParameters;
00780   if (params->pressureProfileOn) {
00781     NAMD_die("pressure profile not supported in CUDA");
00782   }
00783 
00784   atomsChanged = 1;
00785   computesChanged = 1;
00786   patchPairsReordered = 1;
00787 
00788   pairlistsValid = 0;
00789   pairlistTolerance = 0.;
00790   usePairlists = 0;
00791   savePairlists = 0;
00792   plcutoff2 = 0.;
00793 
00794   workStarted = 0;
00795   basePriority = PROXY_DATA_PRIORITY;
00796   localWorkMsg2 = new (PRIORITY_SIZE) LocalWorkMsg;
00797 
00798   // Zero array sizes and pointers
00799   init_arrays();
00800 
00801   atoms_size = 0;
00802   atoms = NULL;
00803 
00804   forces_size = 0;
00805   forces = NULL;
00806   
00807   slow_forces_size = 0;
00808   slow_forces = NULL;
00809 
00810   psiSumH_size = 0;
00811   psiSumH = NULL;
00812 
00813   dEdaSumH_size = 0;
00814   dEdaSumH = NULL;
00815 
00816   if ( master != this ) { // I am slave
00817     masterPe = master->masterPe;
00818     master->slaves[slaveIndex] = this;
00819     if ( master->slavePes[slaveIndex] != CkMyPe() ) {
00820       NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
00821     }
00822     deviceID = master->deviceID;
00823     registerPatches();
00824     return;
00825   }
00826   masterPe = CkMyPe();
00827 
00828   const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
00829   if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
00830     deviceCUDA->setMergeGrids(1);
00831 
00832   // Sanity checks for New CUDA kernel
00833   if (params->GBISOn) {
00834     // GBIS
00835     if (deviceCUDA->getNoMergeGrids()) {
00836       NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
00837     }
00838     // Set mergegrids ON as long as user hasn't defined +nomergegrids
00839     if (!deviceCUDA->getNoMergeGrids()) deviceCUDA->setMergeGrids(1);
00840     // Final sanity check
00841     if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
00842   } else {
00843     // non-GBIS
00844     if (deviceCUDA->getNoStreaming() && !deviceCUDA->getMergeGrids()) {
00845       NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
00846     }
00847   }
00848 
00849 #if CUDA_VERSION >= 5050
00850   int leastPriority, greatestPriority;
00851   cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
00852   cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
00853   if ( leastPriority != greatestPriority ) {
00854     if ( CkMyNode() == 0 ) {
00855       int dev = deviceCUDA->getDeviceID();
00856       CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
00857     }
00858     if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
00859       greatestPriority = leastPriority;
00860     }
00861     if (params->usePMECUDA) greatestPriority = leastPriority;
00862     cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
00863     cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
00864   } else
00865 #endif
00866   {
00867     cudaStreamCreate(&stream);
00868     cuda_errcheck("cudaStreamCreate");
00869     int dev = deviceCUDA->getDeviceID();
00870     cudaDeviceProp deviceProp;
00871     cudaGetDeviceProperties(&deviceProp, dev);
00872     cuda_errcheck("cudaGetDeviceProperties");
00873     if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
00874       if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
00875       cudaStreamCreate(&stream2);
00876     } else {
00877       stream2 = stream;
00878     }
00879   }
00880   cuda_errcheck("cudaStreamCreate");
00881 
00882   // Get GPU device ID
00883   deviceID = deviceCUDA->getDeviceID();
00884 
00885   cuda_init();
00886   if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
00887   build_exclusions();
00888   // cudaEventCreate(&start_upload);
00889   cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
00890   cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
00891   cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
00892 
00893   patchRecords.resize(patchMap->numPatches());
00894   patch_pairs_ptr = new ResizeArray<patch_pair>;
00895   patch_pair_num_ptr = new ResizeArray<int>;
00896 
00897   if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
00898 }
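
The constructor's stream setup (CUDA 5.5 and later) asks the device for its stream priority range and, when priorities are available, creates its two streams at opposite ends of that range so work queued on one stream is scheduled ahead of the other. Below is a minimal standalone sketch of the same pattern; the function and variable names are illustrative and not part of the NAMD API.

#include <cuda_runtime.h>
#include <cstdio>

// Sketch only: create one high-priority and one low-priority stream when the
// device exposes a priority range, otherwise fall back to a single stream.
static void check(cudaError_t err, const char *what) {
  if (err != cudaSuccess)
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
}

void create_streams(cudaStream_t &hi, cudaStream_t &lo) {
  int least, greatest;  // note: numerically smaller values mean higher priority
  check(cudaDeviceGetStreamPriorityRange(&least, &greatest),
        "cudaDeviceGetStreamPriorityRange");
  if (least != greatest) {
    check(cudaStreamCreateWithPriority(&hi, cudaStreamDefault, greatest),
          "cudaStreamCreateWithPriority(hi)");
    check(cudaStreamCreateWithPriority(&lo, cudaStreamDefault, least),
          "cudaStreamCreateWithPriority(lo)");
  } else {
    check(cudaStreamCreate(&hi), "cudaStreamCreate");
    lo = hi;  // no priority support: share one stream in this simplified sketch
  }
}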

ComputeNonbondedCUDA::~ComputeNonbondedCUDA (  ) 

Definition at line 901 of file ComputeNonbondedCUDA.C.

00901 { ; }


Member Function Documentation

void ComputeNonbondedCUDA::assignPatches (  ) 

Definition at line 985 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), computeMgr, deviceCUDA, DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputeNonbondedCUDA::patch_record::hostPe, if(), j, PatchMap::node(), npatches, numSlaves, ReductionMgr::Object(), PatchMap::ObjectOnPe(), patchMap, patchRecords, reduction, REDUCTIONS_BASIC, registerPatches(), ComputeMgr::sendCreateNonbondedCUDASlave(), ResizeArray< Elem >::size(), slavePes, slaves, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00985                                          {
00986 
00987   int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
00988   int numPesOnNodeSharingDevice = 0;
00989   int masterIndex = -1;
00990   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00991     int pe = deviceCUDA->getPesSharingDevice(i);
00992     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00993     if ( CkNodeOf(pe) == CkMyNode() ) {
00994       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00995     }
00996   }
00997 
00998   int npatches = activePatches.size();
00999 
01000   if ( npatches ) {
01001     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
01002   }
01003 
01004   // calculate priority rank of local home patch within pe
01005   {
01006     ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
01007     for ( int i=0; i<npatches; ++i ) {
01008       int pid = activePatches[i];
01009       int homePe = patchMap->node(pid);
01010       if ( CkNodeOf(homePe) == CkMyNode() ) {
01011         homePatchByRank[CkRankOf(homePe)].add(pid);
01012       }
01013     }
01014     for ( int i=0; i<CkMyNodeSize(); ++i ) {
01015       pid_sortop_reverse_priority so;
01016       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
01017       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
01018       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
01019         int pid = homePatchByRank[i][j];
01020         patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
01021       }
01022     }
01023   }
01024 
01025   int *count = new int[npatches];
01026   memset(count, 0, sizeof(int)*npatches);
01027   int *pcount = new int[numPesOnNodeSharingDevice];
01028   memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
01029   int *rankpcount = new int[CkMyNodeSize()];
01030   memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
01031   char *table = new char[npatches*numPesOnNodeSharingDevice];
01032   memset(table, 0, npatches*numPesOnNodeSharingDevice);
01033 
01034   int unassignedpatches = npatches;
01035 
01036   if ( 0 ) { // assign all to device pe
01037     for ( int i=0; i<npatches; ++i ) {
01038       int pid = activePatches[i];
01039       patch_record &pr = patchRecords[pid];
01040       pr.hostPe = CkMyPe();
01041     }
01042     unassignedpatches = 0;
01043     pcount[masterIndex] = npatches;
01044   } else 
01045 
01046   // assign if home pe and build table of natural proxies
01047   for ( int i=0; i<npatches; ++i ) {
01048     int pid = activePatches[i];
01049     patch_record &pr = patchRecords[pid];
01050     int homePe = patchMap->node(pid);
01051     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01052       int pe = pesOnNodeSharingDevice[j];
01053       if ( pe == homePe ) {
01054         pr.hostPe = pe;  --unassignedpatches;
01055         pcount[j] += 1;
01056       }
01057       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
01058         table[i*numPesOnNodeSharingDevice+j] = 1;
01059       }
01060     }
01061     if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
01062       pr.hostPe = homePe;  --unassignedpatches;
01063       rankpcount[CkRankOf(homePe)] += 1;
01064     }
01065   }
01066   // assign if only one pe has a required proxy
01067   int assignj = 0;
01068   for ( int i=0; i<npatches; ++i ) {
01069     int pid = activePatches[i];
01070     patch_record &pr = patchRecords[pid];
01071     if ( pr.hostPe != -1 ) continue;
01072     int c = 0;
01073     int lastj;
01074     for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01075       if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
01076     }
01077     count[i] = c;
01078     if ( c == 1 ) {
01079       pr.hostPe = pesOnNodeSharingDevice[lastj];
01080       --unassignedpatches;
01081       pcount[lastj] += 1;
01082     }
01083   }
01084   while ( unassignedpatches ) {
01085     int i;
01086     for ( i=0; i<npatches; ++i ) {
01087       if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
01088       int pid = activePatches[i];
01089       patch_record &pr = patchRecords[pid];
01090       if ( pr.hostPe != -1 ) continue;
01091       pr.hostPe = pesOnNodeSharingDevice[assignj];
01092       --unassignedpatches;
01093       pcount[assignj] += 1;
01094       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01095       break;
01096     }
01097     if ( i<npatches ) continue;  // start search again
01098     for ( i=0; i<npatches; ++i ) {
01099       int pid = activePatches[i];
01100       patch_record &pr = patchRecords[pid];
01101       if ( pr.hostPe != -1 ) continue;
01102       if ( count[i] ) continue;
01103       pr.hostPe = pesOnNodeSharingDevice[assignj];
01104       --unassignedpatches;
01105       pcount[assignj] += 1;
01106       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01107       break;
01108     }
01109     if ( i<npatches ) continue;  // start search again
01110     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
01111   }
01112 
01113   for ( int i=0; i<npatches; ++i ) {
01114     int pid = activePatches[i];
01115     patch_record &pr = patchRecords[pid];
01116     // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
01117   }
01118 
01119   slavePes = new int[CkMyNodeSize()];
01120   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
01121   numSlaves = 0;
01122   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
01123     int pe = pesOnNodeSharingDevice[j];
01124     int rank = pe - CkNodeFirst(CkMyNode());
01125     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
01126     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
01127     if ( pe == CkMyPe() ) continue;
01128     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
01129     rankpcount[rank] = 0;  // skip in rank loop below
01130     slavePes[numSlaves] = pe;
01131     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
01132     ++numSlaves;
01133   }
01134   for ( int j=0; j<CkMyNodeSize(); ++j ) {
01135     int pe = CkNodeFirst(CkMyNode()) + j;
01136     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
01137     //          CkMyPe(),j,pe,rankpcount[j]);
01138     if ( ! rankpcount[j] ) continue;
01139     if ( pe == CkMyPe() ) continue;
01140     slavePes[numSlaves] = pe;
01141     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
01142     ++numSlaves;
01143   }
01144   registerPatches();
01145 
01146   delete [] pesOnNodeSharingDevice;
01147   delete [] count;
01148   delete [] pcount;
01149   delete [] rankpcount;
01150   delete [] table;
01151 }
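
assignPatches spreads the active patches across the PEs that share this GPU in three passes: a patch goes to its home PE when that PE shares the device, otherwise to the single PE that already holds a proxy for it, and whatever remains is dealt out round-robin. The simplified sketch below captures only that strategy; the types and helpers are placeholders, not NAMD structures, and the real code also handles home patches hosted elsewhere on the node and per-rank proxy counts.

#include <vector>

// Illustration only: simplified stand-ins for NAMD's patch records.
struct PatchInfo {
  int homePe = -1;               // PE that owns the home patch
  std::vector<int> proxyPes;     // PEs sharing the device that already hold a proxy
  int hostPe = -1;               // result: PE chosen to host this patch's work
};

// Three-pass assignment mirroring the strategy sketched above.
// Assumes pesSharingDevice is non-empty.
void assign_hosts(std::vector<PatchInfo> &patches,
                  const std::vector<int> &pesSharingDevice) {
  size_t rr = 0;
  for (PatchInfo &p : patches) {
    for (int pe : pesSharingDevice)                          // pass 1: home PE
      if (pe == p.homePe) { p.hostPe = pe; break; }
    if (p.hostPe == -1 && p.proxyPes.size() == 1)            // pass 2: unique proxy holder
      p.hostPe = p.proxyPes[0];
    if (p.hostPe == -1) {                                    // pass 3: round-robin
      p.hostPe = pesSharingDevice[rr];
      rr = (rr + 1) % pesSharingDevice.size();
    }
  }
}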

void ComputeNonbondedCUDA::atomUpdate (  )  [virtual]

Reimplemented from Compute.

Definition at line 1363 of file ComputeNonbondedCUDA.C.

References atomsChanged.

01363                                       {
01364   //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
01365   atomsChanged = 1;
01366 }

void ComputeNonbondedCUDA::build_exclusions (  )  [static]

Definition at line 583 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), cuda_bind_exclusions(), ResizeArray< Elem >::end(), exclusionsByAtom, ExclusionSignature::fullExclCnt, ExclusionSignature::fullOffset, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), j, MAX_EXCLUSIONS, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), NAMD_die(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< Elem >::size(), SortableResizeArray< Elem >::sort(), and x.

Referenced by build_cuda_exclusions(), and ComputeNonbondedCUDA().

00583                                             {
00584   Molecule *mol = Node::Object()->molecule;
00585 
00586 #ifdef MEM_OPT_VERSION
00587   int natoms = mol->exclSigPoolSize;
00588 #else
00589   int natoms = mol->numAtoms; 
00590 #endif
00591 
00592   delete [] exclusionsByAtom;
00593   exclusionsByAtom = new int2[natoms];
00594 
00595   // create unique sorted lists
00596 
00597   ObjectArena<int32> listArena;
00598   ResizeArray<int32*> unique_lists;
00599   int32 **listsByAtom = new int32*[natoms];
00600   SortableResizeArray<int32> curList;
00601   for ( int i=0; i<natoms; ++i ) {
00602     curList.resize(0);
00603     curList.add(0);  // always excluded from self
00604 #ifdef MEM_OPT_VERSION
00605     const ExclusionSignature *sig = mol->exclSigPool + i;
00606     int n = sig->fullExclCnt;
00607     for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
00608     n += 1;
00609 #else
00610     const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
00611     int n = mol_list[0] + 1;
00612     for ( int j=1; j<n; ++j ) {
00613       curList.add(mol_list[j] - i);
00614     }
00615 #endif
00616     curList.sort();
00617 
00618     int j;
00619     for ( j=0; j<unique_lists.size(); ++j ) {
00620       if ( n != unique_lists[j][0] ) continue;  // no match
00621       int k;
00622       for ( k=0; k<n; ++k ) {
00623         if ( unique_lists[j][k+3] != curList[k] ) break;
00624       }
00625       if ( k == n ) break;  // found match
00626     }
00627     if ( j == unique_lists.size() ) {  // no match
00628       int32 *list = listArena.getNewArray(n+3);
00629       list[0] = n;
00630       int maxdiff = 0;
00631       maxdiff = -1 * curList[0];
00632       if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
00633       list[1] = maxdiff;
00634       for ( int k=0; k<n; ++k ) {
00635         list[k+3] = curList[k];
00636       }
00637       unique_lists.add(list);
00638     }
00639     listsByAtom[i] = unique_lists[j];
00640   }
00641   // sort lists by maxdiff
00642   std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
00643   long int totalbits = 0;
00644   int nlists = unique_lists.size();
00645   for ( int j=0; j<nlists; ++j ) {
00646     int32 *list = unique_lists[j];
00647     int maxdiff = list[1];
00648     list[2] = totalbits + maxdiff;
00649     totalbits += 2*maxdiff + 1;
00650   }
00651   for ( int i=0; i<natoms; ++i ) {
00652     exclusionsByAtom[i].x = listsByAtom[i][1];  // maxdiff
00653     exclusionsByAtom[i].y = listsByAtom[i][2];  // start
00654   }
00655   delete [] listsByAtom;
00656 
00657   if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
00658 
00659   {
00660     long int bytesneeded = totalbits / 8;
00661     if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00662     CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
00663                 unique_lists.size(), bytesneeded);
00664     }
00665 
00666     long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
00667     if ( bytesneeded > bytesavail ) {
00668       char errmsg[512];
00669       sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
00670                      "but only %ld bytes can be addressed with 32-bit int.",
00671                      unique_lists.size(), bytesneeded, bytesavail);
00672       NAMD_die(errmsg);
00673     }
00674   }
00675 
00676 #define SET_EXCL(EXCL,BASE,DIFF) \
00677          (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
00678 
00679   unsigned int *exclusion_bits = new unsigned int[totalbits/32];
00680   memset(exclusion_bits, 0, totalbits/8);
00681 
00682   long int base = 0;
00683   for ( int i=0; i<unique_lists.size(); ++i ) {
00684     base += unique_lists[i][1];
00685     if ( unique_lists[i][2] != (int32)base ) {
00686       NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
00687     }
00688     int n = unique_lists[i][0];
00689     for ( int j=0; j<n; ++j ) {
00690       SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
00691     }
00692     base += unique_lists[i][1] + 1;
00693   }
00694 
00695   cuda_bind_exclusions(exclusion_bits, totalbits/32);
00696 
00697   delete [] exclusion_bits;
00698 }
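
The packed array built above stores each unique exclusion list as bits around a per-list base offset, so the pair (i, i+diff) maps to bit base+diff. A self-contained sketch of the SET_EXCL packing and the matching lookup is given below; the helper names are illustrative, not NAMD functions.

#include <cstring>

// Set and test one exclusion bit at position base+diff in a packed bit array,
// mirroring the SET_EXCL macro in the listing above.
inline void set_excl(unsigned int *bits, long base, int diff) {
  long pos = base + diff;
  bits[pos >> 5] |= (1u << (pos & 31));
}

inline bool test_excl(const unsigned int *bits, long base, int diff) {
  long pos = base + diff;
  return (bits[pos >> 5] >> (pos & 31)) & 1u;
}

int main() {
  unsigned int bits[4];
  std::memset(bits, 0, sizeof(bits));
  long base = 40;              // start offset assigned to one exclusion list
  set_excl(bits, base, -3);    // atom i excludes atom i-3
  set_excl(bits, base, 0);     // self exclusion
  return (test_excl(bits, base, -3) && test_excl(bits, base, 0)) ? 0 : 1;
}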

void ComputeNonbondedCUDA::build_force_table (  )  [static]

Definition at line 438 of file ComputeNonbondedCUDA.C.

References cuda_bind_force_table(), ComputeNonbondedUtil::cutoff, diffa, f, ComputeNonbondedUtil::fast_table, FORCE_TABLE_SIZE, ComputeNonbondedUtil::r2_delta, ComputeNonbondedUtil::r2_delta_exp, ComputeNonbondedUtil::r2_table, ComputeNonbondedUtil::scor_table, table_i, ComputeNonbondedUtil::vdwa_table, ComputeNonbondedUtil::vdwb_table, and x.

Referenced by build_cuda_force_table().

00438                                              {  // static
00439 
00440   float4 t[FORCE_TABLE_SIZE];
00441   float4 et[FORCE_TABLE_SIZE];  // energy table
00442 
00443   const BigReal r2_delta = ComputeNonbondedUtil:: r2_delta;
00444   const int r2_delta_exp = ComputeNonbondedUtil:: r2_delta_exp;
00445   // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
00446   const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
00447 
00448   double r2list[FORCE_TABLE_SIZE];  // double to match cpu code
00449   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00450     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00451     r2list[i] = r*r + r2_delta;
00452   }
00453 
00454   union { double f; int32 i[2]; } byte_order_test;
00455   byte_order_test.f = 1.0;  // should occupy high-order bits only
00456   int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
00457 
00458   for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
00459     double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
00460     int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc;  // table_i >= 0
00461 
00462     if ( r > cutoff ) {
00463       t[i].x = 0.;
00464       t[i].y = 0.;
00465       t[i].z = 0.;
00466       t[i].w = 0.;
00467       et[i].x = 0.;
00468       et[i].y = 0.;
00469       et[i].z = 0.;
00470       et[i].w = 0.;
00471       continue;
00472     }
00473 
00474     BigReal diffa = r2list[i] - r2_table[table_i];
00475 
00476     // coulomb 1/r or fast force
00477     // t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00478     {
00479       BigReal table_a = fast_table[4*table_i];
00480       BigReal table_b = fast_table[4*table_i+1];
00481       BigReal table_c = fast_table[4*table_i+2];
00482       BigReal table_d = fast_table[4*table_i+3];
00483       BigReal grad =
00484                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00485       t[i].x = 2. * grad;
00486       BigReal ener = table_a + diffa *
00487                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00488       et[i].x = ener;
00489     }
00490 
00491 
00492     // pme correction for slow force
00493     // t[i].w = 0.;
00494     {
00495       BigReal table_a = scor_table[4*table_i];
00496       BigReal table_b = scor_table[4*table_i+1];
00497       BigReal table_c = scor_table[4*table_i+2];
00498       BigReal table_d = scor_table[4*table_i+3];
00499       BigReal grad =
00500                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00501       t[i].w = 2. * grad;
00502       BigReal ener = table_a + diffa *
00503                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00504       et[i].w = ener;
00505     }
00506 
00507 
00508     // vdw 1/r^6
00509     // t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00510     {
00511       BigReal table_a = vdwb_table[4*table_i];
00512       BigReal table_b = vdwb_table[4*table_i+1];
00513       BigReal table_c = vdwb_table[4*table_i+2];
00514       BigReal table_d = vdwb_table[4*table_i+3];
00515       BigReal grad =
00516                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00517       t[i].y = 2. * -1. * grad;
00518       BigReal ener = table_a + diffa *
00519                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00520       et[i].y = -1. * ener;
00521     }
00522 
00523 
00524     // vdw 1/r^12
00525     // t[i].z = 12e / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00526     {
00527       BigReal table_a = vdwa_table[4*table_i];
00528       BigReal table_b = vdwa_table[4*table_i+1];
00529       BigReal table_c = vdwa_table[4*table_i+2];
00530       BigReal table_d = vdwa_table[4*table_i+3];
00531       BigReal grad =
00532                 ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
00533       t[i].z = 2. * grad;
00534       BigReal ener = table_a + diffa *
00535                 ( ( table_d * diffa + table_c ) * diffa + table_b);
00536       et[i].z = ener;
00537     }
00538 
00539     // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
00540     //   t[i].x, t[i].y, t[i].z, t[i].w);
00541 
00542 /*
00543     double r2 = r * r;
00544     double r4 = r2 * r2;
00545     double r8 = r4 * r4;
00546 
00547     t[i].x = 1. / (r2 * r);  // -1/r * d/dr r^-1
00548     t[i].y = 6. / (r8);  // -1/r * d/dr r^-6
00549     t[i].z = 12. / (r8 * r4 * r2);  // -1/r * d/dr r^-12
00550     t[i].w = 0.;
00551 */
00552   }
00553 
00554   t[0].x = 0.f;
00555   t[0].y = 0.f;
00556   t[0].z = 0.f;
00557   t[0].w = 0.f;
00558   et[0].x = et[1].x;
00559   et[0].y = et[1].y;
00560   et[0].z = et[1].z;
00561   et[0].w = et[1].w;
00562 
00563   cuda_bind_force_table(t,et);
00564 
00565   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00566     CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
00567   }
00568 }
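
Each table slot above is filled by evaluating a cubic in diffa, the offset of r² within its interpolation interval: the energy table stores the cubic's value and the force table stores twice its gradient. A minimal sketch of that per-interval evaluation, with the four coefficients passed in directly, follows.

// Evaluate one cubic table interval, as in the listing above:
// energy(x) = a + b*x + c*x^2 + d*x^3, gradient(x) = b + 2*c*x + 3*d*x^2.
struct TableEval { double energy, force; };

TableEval eval_interval(double a, double b, double c, double d, double x) {
  TableEval out;
  out.energy = a + x * ((d * x + c) * x + b);              // Horner form of the cubic
  out.force  = 2.0 * ((3.0 * d * x + 2.0 * c) * x + b);    // 2 * d(energy)/dx, as stored in t[i]
  return out;
}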

void ComputeNonbondedCUDA::build_lj_table (  )  [static]

Definition at line 411 of file ComputeNonbondedCUDA.C.

References cuda_bind_lj_table(), LJTable::get_table_dim(), j, ComputeNonbondedUtil::ljTable, NAMD_bug(), ComputeNonbondedUtil::scaling, and LJTable::table_val().

Referenced by build_cuda_force_table().

00411                                           {  // static
00412 
00413   const LJTable* const ljTable = ComputeNonbondedUtil:: ljTable;
00414   const int dim = ljTable->get_table_dim();
00415 
00416   // round dim up to odd multiple of 16
00417   int tsize = (((dim+16+31)/32)*32)-16;
00418   if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
00419 
00420   float2 *t = new float2[tsize*tsize];
00421   float2 *row = t;
00422   for ( int i=0; i<dim; ++i, row += tsize ) {
00423     for ( int j=0; j<dim; ++j ) {
00424       const LJTable::TableEntry *e = ljTable->table_val(i,j);
00425       row[j].x = e->A * scaling;
00426       row[j].y = e->B * scaling;
00427     }
00428   }
00429 
00430   cuda_bind_lj_table(t,tsize);
00431   delete [] t;
00432 
00433   if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
00434     CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
00435   }
00436 }
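
The padding expression tsize = (((dim+16+31)/32)*32)-16 rounds the LJ table dimension up to the next odd multiple of 16, matching the comment in the listing. Below is a small standalone check of that property, written as an assumption under test rather than NAMD code.

#include <cassert>

// Round dim up to the next odd multiple of 16, matching the expression in the
// listing above (integer division truncates toward zero).
int round_to_odd_multiple_of_16(int dim) {
  return (((dim + 16 + 31) / 32) * 32) - 16;
}

int main() {
  for (int dim = 1; dim <= 256; ++dim) {
    int tsize = round_to_odd_multiple_of_16(dim);
    assert(tsize >= dim);           // never shrinks the table
    assert(tsize % 16 == 0);        // multiple of 16 ...
    assert((tsize / 16) % 2 == 1);  // ... and an odd one
  }
  return 0;
}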

void ComputeNonbondedCUDA::doWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 1497 of file ComputeNonbondedCUDA.C.

References activePatches, atom_params, atom_params_size, atoms, atoms_size, atomsChanged, Compute::basePriority, ResizeArray< Elem >::begin(), block_order, block_order_size, bornRadH, bornRadH_size, PatchMap::center(), CompAtom::charge, COMPUTE_PROXY_PRIORITY, computeRecords, computesChanged, COULOMB, cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_errcheck(), cudaCheck, cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, dHdrPrefixH, dHdrPrefixH_size, ComputeNonbondedUtil::dielectric_1, dummy_dev, dummy_size, energy_gbis, energy_gbis_size, exclusionsByAtom, f, finishWork(), ComputeNonbondedUtil::fixedAtomsOn, force_ready_queue, force_ready_queue_len, force_ready_queue_size, forces, forces_size, GBISP, Compute::gbisPhase, DeviceCUDA::getNoStreaming(), Patch::getNumAtoms(), hostedPatches, CompAtomExt::id, if(), intRad0H, intRad0H_size, intRadSH, intRadSH_size, SubmitReduction::item(), j, lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, master, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), npatches, num_atoms, num_local_atoms, num_remote_atoms, num_virials, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patch_record::p, pairlistsValid, pairlistTolerance, Node::parameters, patch_pair_num_ptr, patch_pairs, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, ComputeNonbondedCUDA::compute_record::pid, CompAtom::position, PROXY_DATA_PRIORITY, PROXY_RESULTS_PRIORITY, psiSumH, psiSumH_size, reduction, REDUCTION_PAIRLIST_WARNINGS, remoteActivePatches, remoteComputeRecords, ResizeArray< Elem >::resize(), Flags::savePairlists, savePairlists, ComputeNonbondedUtil::scaling, Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, slow_forces_size, slow_virials, usePairlists, vdw_types, vdw_types_size, CompAtom::vdwType, virials, virials_size, WARPSIZE, workStarted, Vector::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

01497                                   {
01498 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
01499 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
01500 
01501   // Set GPU device ID
01502   cudaCheck(cudaSetDevice(deviceID));
01503 
01504   ResizeArray<patch_pair> &patch_pairs(*patch_pairs_ptr);
01505   ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
01506 
01507   if ( workStarted ) { //if work already started, check if finished
01508     if ( finishWork() ) {  // finished
01509       workStarted = 0;
01510       basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
01511     } else {  // need to call again
01512       workStarted = 2;
01513       basePriority = PROXY_RESULTS_PRIORITY;  // lower for local
01514       if ( master == this && kernel_launch_state > 2 ) {
01515         cuda_check_local_progress(this,0.);  // launches polling
01516       }
01517     }
01518     return;
01519   }
01520 
01521   workStarted = 1;
01522   basePriority = COMPUTE_PROXY_PRIORITY;
01523 
01524   Molecule *mol = Node::Object()->molecule;
01525   Parameters *params = Node::Object()->parameters;
01526   SimParameters *simParams = Node::Object()->simParameters;
01527 
01528   //execute only during GBIS phase 1, or if not using GBIS
01529   if (!simParams->GBISOn || gbisPhase == 1) {
01530 
01531     //bind new patches to GPU
01532     if ( atomsChanged || computesChanged ) {
01533       int npatches = activePatches.size();
01534 
01535       pairlistsValid = 0;
01536       pairlistTolerance = 0.;
01537 
01538       if ( computesChanged ) {
01539         computesChanged = 0;
01540 
01541         if ( ! dummy_size ) {
01542           dummy_size = 1024*1024;
01543           cudaMalloc((void**)&dummy_dev,dummy_size);
01544           cuda_errcheck("in cudaMalloc dummy_dev");
01545         }
01546 
01547         bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
01548           1.2f, cudaHostAllocMapped);
01549         if (did_realloc) {
01550           for (int k=0; k < force_ready_queue_size; ++k)
01551             force_ready_queue[k] = -1;
01552         }
01553         force_ready_queue_len = npatches;
01554         reallocate_host<int>(&block_order, &block_order_size,
01555           2*(localComputeRecords.size()+remoteComputeRecords.size()),
01556           1.2f, cudaHostAllocMapped);
01557     
01558         num_virials = npatches;
01559         reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
01560           1.2f, cudaHostAllocMapped);
01561         slow_virials = virials + 16*num_virials;
01562 
01563         reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
01564           1.2f, cudaHostAllocMapped);
01565         for (int i  = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
01566 
01567         int *ap = activePatches.begin();
01568         for ( int i=0; i<localActivePatches.size(); ++i ) {
01569           *(ap++) = localActivePatches[i];
01570         }
01571         for ( int i=0; i<remoteActivePatches.size(); ++i ) {
01572           *(ap++) = remoteActivePatches[i];
01573         }
01574 
01575         // sort computes by distance between patches
01576         cr_sortop_distance so(lattice);
01577         std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
01578         std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
01579 
01580         const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01581 
01582         if ( streaming ) {
01583           // iout << "Reverse-priority sorting...\n" << endi;
01584           cr_sortop_reverse_priority sorp(so,patchRecords.begin());
01585           std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
01586           std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
01587           patchPairsReordered = 0;
01588           //patchPairsReordered = 1;
01589       // int len = remoteComputeRecords.size();
01590       // for ( int i=0; i<len; ++i ) {
01591       //   iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
01592       // }
01593       // int len2 = localComputeRecords.size();
01594       // for ( int i=0; i<len2; ++i ) {
01595       //   iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
01596       // }
01597       // iout << endi;
01598         } else {
01599           patchPairsReordered = 1;
01600         }
01601  
01602         int nlc = localComputeRecords.size();
01603         int nrc = remoteComputeRecords.size();
01604         computeRecords.resize(nlc+nrc);
01605         compute_record *cr = computeRecords.begin();
01606         for ( int i=0; i<nrc; ++i ) {
01607           *(cr++) = remoteComputeRecords[i];
01608         }
01609         for ( int i=0; i<nlc; ++i ) {
01610           *(cr++) = localComputeRecords[i];
01611         }
01612 
01613         // patch_pair_num[i] = number of patch pairs that involve patch i
01614         patch_pair_num.resize(npatches);
01615         for ( int i=0; i<npatches; ++i ) {
01616           patchRecords[activePatches[i]].localIndex = i;
01617           patch_pair_num[i] = 0;
01618         }
01619 
01620         int ncomputes = computeRecords.size();
01621         patch_pairs.resize(ncomputes);
01622         for ( int i=0; i<ncomputes; ++i ) {
01623           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01624           int lp1 = patchRecords[cr.pid[0]].localIndex;
01625           int lp2 = patchRecords[cr.pid[1]].localIndex;
01626           patch_pair_num[lp1]++;
01627           if (lp1 != lp2) patch_pair_num[lp2]++;
01628           patch_pair &pp = patch_pairs[i];
01629           pp.offset.x = cr.offset.x;
01630           pp.offset.y = cr.offset.y;
01631           pp.offset.z = cr.offset.z;
01632         }
01633 
01634         for ( int i=0; i<ncomputes; ++i ) {
01635           ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01636           int lp1 = patchRecords[cr.pid[0]].localIndex;
01637           int lp2 = patchRecords[cr.pid[1]].localIndex;
01638           patch_pair &pp = patch_pairs[i];
01639           pp.patch1_ind = lp1;
01640           pp.patch2_ind = lp2;
01641           pp.patch1_num_pairs = patch_pair_num[lp1];
01642           pp.patch2_num_pairs = patch_pair_num[lp2];
01643         }
01644 
01645       if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
01646         CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
01647                  CkMyPe(), localActivePatches.size(), remoteActivePatches.size(),
01648                  localComputeRecords.size(), remoteComputeRecords.size());
01649       }
01650     }  // computesChanged
01651     else if ( ! patchPairsReordered ) {
01652       patchPairsReordered = 1;
01653       int len = patch_pairs.size();
01654       int nlc = localComputeRecords.size();
01655       int nrc = remoteComputeRecords.size();
01656       if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
01657       ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
01658       ResizeArray<patch_pair> new_patch_pairs(len);
01659       int irc=nrc;
01660       int ilc=len;
01661       for ( int i=0; i<len; ++i ) {
01662         int boi = block_order[i];
01663         int dest;
01664         if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
01665         new_computeRecords[dest] = computeRecords[boi];
01666         new_patch_pairs[dest] = patch_pairs[boi];
01667       }
01668       if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
01669       computeRecords.swap(new_computeRecords);
01670       patch_pairs.swap(new_patch_pairs);
01671     }
01672 
01673     int istart = 0;
01674     int i;
01675     for ( i=0; i<npatches; ++i ) {
01676       if ( i == localActivePatches.size() ) {
01677         num_local_atoms = istart;
01678       }
01679       patch_record &pr = patchRecords[activePatches[i]];
01680       pr.localStart = istart;
01681       int natoms = pr.p->getNumAtoms();
01682       int nfreeatoms = natoms;
01683       if ( fixedAtomsOn ) {
01684         const CompAtomExt *aExt = pr.xExt;
01685         for ( int j=0; j<natoms; ++j ) {
01686           if ( aExt[j].atomFixed ) --nfreeatoms;
01687         }
01688       }
01689       pr.numAtoms = natoms;
01690       pr.numFreeAtoms = nfreeatoms;
01691       istart += natoms;
01692       istart += 16 - (natoms & 15);
01693     }
01694     if ( i == localActivePatches.size() ) {
01695       num_local_atoms = istart;
01696     }
01697     num_atoms = istart;
01698     num_remote_atoms = num_atoms - num_local_atoms;
01699     reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
01700     reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
01701     reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
01702     reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01703     reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
01704     if (simParams->GBISOn) {
01705       reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
01706       reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
01707       reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01708       reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
01709       reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
01710       reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
01711     }
01712 
01713     int bfstart = 0;
01714     int exclmask_start = 0;
01715     int ncomputes = computeRecords.size();
01716     for ( int i=0; i<ncomputes; ++i ) {
01717       ComputeNonbondedCUDA::compute_record &cr = computeRecords[i];
01718       int p1 = cr.pid[0];
01719       int p2 = cr.pid[1];
01720       patch_pair &pp = patch_pairs[i];
01721       pp.patch1_start = patchRecords[p1].localStart;
01722       pp.patch1_size  = patchRecords[p1].numAtoms;
01723       pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
01724       pp.patch2_start = patchRecords[p2].localStart;
01725       pp.patch2_size  = patchRecords[p2].numAtoms;
01726       pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
01727       pp.plist_start = bfstart;
01728       // size1*size2 = number of patch pairs
01729       int size1 = (pp.patch1_size-1)/WARPSIZE+1;
01730       int size2 = (pp.patch2_size-1)/WARPSIZE+1;
01731       pp.plist_size = (size1*size2-1)/32+1;
01732       bfstart += pp.plist_size;
01733       pp.exclmask_start = exclmask_start;
01734       exclmask_start += size1*size2;
01735     } //for ncomputes
01736 
01737     cuda_bind_patch_pairs(patch_pairs.begin(), patch_pairs.size(),
01738       activePatches.size(), num_atoms, bfstart, 
01739       exclmask_start);
01740 
01741   }  // atomsChanged || computesChanged
01742 
01743   double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
01744 
01745   Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01746   float maxAtomMovement = 0.;
01747   float maxPatchTolerance = 0.;
01748 
01749   for ( int i=0; i<activePatches.size(); ++i ) {
01750     patch_record &pr = patchRecords[activePatches[i]];
01751 
01752     float maxMove = pr.p->flags.maxAtomMovement;
01753     if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
01754 
01755     float maxTol = pr.p->flags.pairlistTolerance;
01756     if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
01757 
01758     int start = pr.localStart;
01759     int n = pr.numAtoms;
01760     const CompAtom *a = pr.x;
01761     const CompAtomExt *aExt = pr.xExt;
01762     if ( atomsChanged ) {
01763 
01764       atom_param *ap = atom_params + start;
01765       for ( int k=0; k<n; ++k ) {
01766         int j = aExt[k].sortOrder;
01767         ap[k].vdw_type = a[j].vdwType;
01768         vdw_types[start + k] = a[j].vdwType;
01769         ap[k].index = aExt[j].id;
01770 #ifdef MEM_OPT_VERSION
01771         ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
01772         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
01773 #else // ! MEM_OPT_VERSION
01774         ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
01775         ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
01776 #endif // MEM_OPT_VERSION
01777       }
01778     }
01779     {
01780 #if 1
01781       const CudaAtom *ac = pr.p->getCudaAtomList();
01782       CudaAtom *ap = atoms + start;
01783       memcpy(ap, ac, sizeof(atom)*n);
01784 #else
01785       Vector center =
01786         pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
01787       atom *ap = atoms + start;
01788       for ( int k=0; k<n; ++k ) {
01789         int j = aExt[k].sortOrder;
01790         ap[k].position.x = a[j].position.x - center.x;
01791         ap[k].position.y = a[j].position.y - center.y;
01792         ap[k].position.z = a[j].position.z - center.z;
01793         ap[k].charge = charge_scaling * a[j].charge;
01794       }
01795 #endif
01796     }
01797   }
01798 
01799   savePairlists = 0;
01800   usePairlists = 0;
01801   if ( flags.savePairlists ) {
01802     savePairlists = 1;
01803     usePairlists = 1;
01804   } else if ( flags.usePairlists ) {
01805     if ( ! pairlistsValid ||
01806          ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01807       reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01808     } else {
01809       usePairlists = 1;
01810     }
01811   }
01812   if ( ! usePairlists ) {
01813     pairlistsValid = 0;
01814   }
01815   float plcutoff = cutoff;
01816   if ( savePairlists ) {
01817     pairlistsValid = 1;
01818     pairlistTolerance = 2. * maxPatchTolerance;
01819     plcutoff += pairlistTolerance;
01820   }
01821   plcutoff2 = plcutoff * plcutoff;
01822 
01823   //CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01824   //  plcutoff, pairlistTolerance, savePairlists, usePairlists);
01825 
01826 #if 0
01827   // calculate warp divergence
01828   if ( 1 ) {
01829     Flags &flags = patchRecords[hostedPatches[0]].p->flags;
01830     Lattice &lattice = flags.lattice;
01831     float3 lata, latb, latc;
01832     lata.x = lattice.a().x;
01833     lata.y = lattice.a().y;
01834     lata.z = lattice.a().z;
01835     latb.x = lattice.b().x;
01836     latb.y = lattice.b().y;
01837     latb.z = lattice.b().z;
01838     latc.x = lattice.c().x;
01839     latc.y = lattice.c().y;
01840     latc.z = lattice.c().z;
01841 
01842     int ncomputes = computeRecords.size();
01843     for ( int ic=0; ic<ncomputes; ++ic ) {
01844       patch_pair &pp = patch_pairs[ic];
01845       atom *a1 = atoms + pp.patch1_atom_start;
01846       int n1 = pp.patch1_size;
01847       atom *a2 = atoms + pp.patch2_atom_start;
01848       int n2 = pp.patch2_size;
01849       float offx = pp.offset.x * lata.x
01850                + pp.offset.y * latb.x
01851                + pp.offset.z * latc.x;
01852       float offy = pp.offset.x * lata.y
01853                + pp.offset.y * latb.y
01854                + pp.offset.z * latc.y;
01855       float offz = pp.offset.x * lata.z
01856                + pp.offset.y * latb.z
01857                + pp.offset.z * latc.z;
01858       // CkPrintf("%f %f %f\n", offx, offy, offz);
01859       int atoms_tried = 0;
01860       int blocks_tried = 0;
01861       int atoms_used = 0;
01862       int blocks_used = 0;
01863       for ( int ii=0; ii<n1; ii+=32 ) {  // warps
01864         for ( int jj=0; jj<n2; jj+=16 ) {  // shared atom loads
01865           int block_used = 0;
01866           for ( int j=jj; j<jj+16 && j<n2; ++j ) {  // shared atoms
01867             int atom_used = 0;
01868             for ( int i=ii; i<ii+32 && i<n1; ++i ) {  // threads
01869               float dx = offx + a1[i].position.x - a2[j].position.x;
01870               float dy = offy + a1[i].position.y - a2[j].position.y;
01871               float dz = offz + a1[i].position.z - a2[j].position.z;
01872               float r2 = dx*dx + dy*dy + dz*dz;
01873               if ( r2 < cutoff2 ) atom_used = 1;
01874             }
01875             ++atoms_tried;
01876             if ( atom_used ) { block_used = 1; ++atoms_used; }
01877           }
01878           ++blocks_tried;
01879           if ( block_used ) { ++blocks_used; }
01880         }
01881       }
01882       CkPrintf("blocks = %d/%d (%f)  atoms = %d/%d (%f)\n",
01883                 blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
01884                 atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
01885     }
01886   }
01887 #endif
01888 
01889   } // !GBISOn || gbisPhase == 1
01890 
01891   //Do GBIS
01892   if (simParams->GBISOn) {
01893     //open GBIS boxes depending on phase
01894     for ( int i=0; i<activePatches.size(); ++i ) {
01895       patch_record &pr = master->patchRecords[activePatches[i]];
01896       GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
01897       if (gbisPhase == 1) {
01898         //Copy GBIS intRadius to Host
01899         if (atomsChanged) {
01900           float *intRad0 = intRad0H + pr.localStart;
01901           float *intRadS = intRadSH + pr.localStart;
01902           for ( int k=0; k<pr.numAtoms; ++k ) {
01903             int j = pr.xExt[k].sortOrder;
01904             intRad0[k] = pr.intRad[2*j+0];
01905             intRadS[k] = pr.intRad[2*j+1];
01906           }
01907         }
01908       } else if (gbisPhase == 2) {
01909         float *bornRad = bornRadH + pr.localStart;
01910         for ( int k=0; k<pr.numAtoms; ++k ) {
01911           int j = pr.xExt[k].sortOrder;
01912           bornRad[k] = pr.bornRad[j];
01913         }
01914       } else if (gbisPhase == 3) {
01915         float *dHdrPrefix = dHdrPrefixH + pr.localStart;
01916         for ( int k=0; k<pr.numAtoms; ++k ) {
01917           int j = pr.xExt[k].sortOrder;
01918           dHdrPrefix[k] = pr.dHdrPrefix[j];
01919         }
01920       } // end phases
01921     } // end for patches
01922   } // if GBISOn
01923 
01924   kernel_time = CkWallTimer();
01925   kernel_launch_state = 1;
01926   //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
01927   if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
01928 }
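
Near the end of doWork, the pairlist flags decide whether this step rebuilds, reuses, or invalidates the saved pairlists: reuse is refused (and a warning counted) when twice the maximum atom movement exceeds the stored tolerance. A compact sketch of that decision with the inputs passed as plain parameters follows; the names are illustrative.

// Decide whether to save, reuse, or rebuild pairlists for this step,
// following the logic at the end of doWork above.
struct PairlistDecision { bool save, use, warn; };

PairlistDecision decide(bool flagSave, bool flagUse, bool listsValid,
                        float maxAtomMovement, float storedTolerance) {
  PairlistDecision d{false, false, false};
  if (flagSave) {                       // this step rebuilds and stores pairlists
    d.save = d.use = true;
  } else if (flagUse) {                 // this step would like to reuse them
    if (!listsValid || 2.f * maxAtomMovement > storedTolerance)
      d.warn = true;                    // too much motion: count a pairlist warning
    else
      d.use = true;
  }
  return d;
}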

void ComputeNonbondedCUDA::finishPatch ( int   )  [virtual]

Reimplemented from Compute.

Definition at line 2189 of file ComputeNonbondedCUDA.C.

References activePatches, Box< Owner, Data >::close(), ComputeNonbondedCUDA::patch_record::forceBox, master, Box< Owner, Data >::open(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, and ComputeNonbondedCUDA::patch_record::x.

Referenced by finishWork().

02189                                                   {
02190   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
02191   patch_record &pr = master->patchRecords[master->activePatches[flindex]];
02192   pr.r = pr.forceBox->open();
02193   finishPatch(pr);
02194   pr.positionBox->close(&(pr.x));
02195   pr.forceBox->close(&(pr.r));
02196 }

void ComputeNonbondedCUDA::finishReductions (  ) 

Definition at line 2353 of file ComputeNonbondedCUDA.C.

References ADD_TENSOR_OBJECT, block_order, endi(), energy_gbis, if(), iout, num_virials, Node::Object(), PROXY_DATA_PRIORITY, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_VIRIAL_NBOND, Node::simParameters, simParams, ResizeArray< Elem >::size(), virials, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

02353                                             {
02354   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
02355 
02356   basePriority = PROXY_DATA_PRIORITY;  // higher to aid overlap
02357 
02358   SimParameters *simParams = Node::Object()->simParameters;
02359 
02360   if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
02361     if ( patchPairsReordered++ == 2) {
02362       int patch_len = patchRecords.size();
02363       ResizeArray<int> plast(patch_len);
02364       for ( int i=0; i<patch_len; ++i ) {
02365         plast[i] = -1;
02366       }
02367       int order_len = localComputeRecords.size()+remoteComputeRecords.size();
02368       for ( int i=0; i<order_len; ++i ) {
02369         plast[computeRecords[block_order[i]].pid[0]] = i;
02370         iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
02371       }
02372       iout << endi;
02373       for ( int i=0; i<patch_len; ++i ) {
02374         iout << "patch_last " << i << " " << plast[i] << "\n";
02375       }
02376       iout << endi;
02377     }
02378   }
02379 
02380   {
02381     Tensor virial_tensor;
02382     BigReal energyv = 0.;
02383     BigReal energye = 0.;
02384     BigReal energys = 0.;
02385     int nexcluded = 0;
02386     for ( int i = 0; i < num_virials; ++i ) {
02387       virial_tensor.xx += virials[16*i];
02388       virial_tensor.xy += virials[16*i+1];
02389       virial_tensor.xz += virials[16*i+2];
02390       virial_tensor.yx += virials[16*i+3];
02391       virial_tensor.yy += virials[16*i+4];
02392       virial_tensor.yz += virials[16*i+5];
02393       virial_tensor.zx += virials[16*i+6];
02394       virial_tensor.zy += virials[16*i+7];
02395       virial_tensor.zz += virials[16*i+8];
02396       energyv += virials[16*i+9];
02397       energye += virials[16*i+10];
02398       energys += virials[16*i+11];
02399       nexcluded += ((int *)virials)[16*i+12];
02400       if (simParams->GBISOn) {
02401         energye += energy_gbis[i];
02402       }
02403     }
02404     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += nexcluded;
02405     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
02406     if ( doEnergy ) {
02407       reduction->item(REDUCTION_LJ_ENERGY) += energyv;
02408       reduction->item(REDUCTION_ELECT_ENERGY) += energye;
02409       reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += energys;
02410     }
02411   }
02412   if ( doSlow ) {
02413     Tensor virial_slow_tensor;
02414     for ( int i = 0; i < num_virials; ++i ) {
02415       virial_slow_tensor.xx += slow_virials[16*i];
02416       virial_slow_tensor.xy += slow_virials[16*i+1];
02417       virial_slow_tensor.xz += slow_virials[16*i+2];
02418       virial_slow_tensor.yx += slow_virials[16*i+3];
02419       virial_slow_tensor.yy += slow_virials[16*i+4];
02420       virial_slow_tensor.yz += slow_virials[16*i+5];
02421       virial_slow_tensor.zx += slow_virials[16*i+6];
02422       virial_slow_tensor.zy += slow_virials[16*i+7];
02423       virial_slow_tensor.zz += slow_virials[16*i+8];
02424     }
02425     ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
02426   }
02427 
02428   reduction->submit();
02429 
02430   cuda_timer_count++;
02431   if ( simParams->outputCudaTiming &&
02432         step % simParams->outputCudaTiming == 0 ) {
02433 
02434     // int natoms = mol->numAtoms; 
02435     // double wpa = wcount;  wpa /= natoms;
02436 
02437     // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
02438     //  kernel_time * 1.e3, time * 1.e3, wpa);
02439 
02440 #if 0
02441     float upload_ms, remote_calc_ms;
02442     float local_calc_ms, total_ms;
02443     cuda_errcheck("before event timers");
02444     cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
02445     cuda_errcheck("in event timer 1");
02446     cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
02447     cuda_errcheck("in event timer 2");
02448     cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
02449     cuda_errcheck("in event timer 3");
02450     cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
02451     cuda_errcheck("in event timer 4");
02452     cuda_errcheck("in event timers");
02453 
02454     CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
02455              CkMyPe(), upload_ms, remote_calc_ms,
02456              local_calc_ms, total_ms);
02457 #endif
02458 
02459     if ( cuda_timer_count >= simParams->outputCudaTiming ) {
02460       cuda_timer_total /= cuda_timer_count;
02461       CkPrintf("CUDA TIMING: %d  %f ms/step on node %d\n",
02462                step, cuda_timer_total * 1.e3, CkMyPe());
02463     }
02464     cuda_timer_count = 0;
02465     cuda_timer_total = 0;
02466   }
02467 
02468 }
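
finishReductions reads one 16-float record per patch from the virials array: slots 0-8 hold the virial tensor, slots 9-11 the vdW, electrostatic, and slow electrostatic energies, and slot 12 holds the exclusion count stored as an int. The overlay below only illustrates that layout; NAMD indexes the raw float array directly as in the listing.

#include <cstring>

// Illustrative view of one 16-float virial record as accumulated above.
struct VirialRecord {
  float xx, xy, xz, yx, yy, yz, zx, zy, zz;     // slots 0-8: virial tensor
  float energy_vdw, energy_elect, energy_slow;  // slots 9-11: energies
  float excluded_bits;                          // slot 12: actually an int count
  float pad[3];                                 // slots 13-15: unused here
};
static_assert(sizeof(VirialRecord) == 16 * sizeof(float), "one record is 16 floats");

int read_excluded(const float *virials, int patch) {
  int n;                                        // reinterpret slot 12 as int,
  std::memcpy(&n, &virials[16 * patch + 12], sizeof(int));  // like ((int *)virials)[...]
  return n;
}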

int ComputeNonbondedCUDA::finishWork (  ) 

Definition at line 2227 of file ComputeNonbondedCUDA.C.

References computeMgr, dEdaSumH, finishPatch(), ComputeNonbondedCUDA::patch_record::forceBox, GBISP, Compute::gbisPhase, if(), j, localHostedPatches, master, ComputeNonbondedUtil::mol, Node::molecule, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::patchID, patchRecords, Compute::priority(), psiSumH, ComputeNonbondedCUDA::patch_record::r, remoteHostedPatches, ComputeMgr::sendNonbondedCUDASlaveEnqueue(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slavePes, slaves, and workStarted.

Referenced by doWork().

02227                                      {
02228   //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
02229 
02230   if ( master == this ) {
02231     for ( int i = 0; i < numSlaves; ++i ) {
02232       computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
02233     }
02234   }
02235 
02236 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
02237 CkMyPe(), workStarted, gbisPhase)
02238 
02239   Molecule *mol = Node::Object()->molecule;
02240   SimParameters *simParams = Node::Object()->simParameters;
02241 
02242   ResizeArray<int> &patches( workStarted == 1 ?
02243                                 remoteHostedPatches : localHostedPatches );
02244 
02245   // long long int wcount = 0;
02246   // double virial = 0.;
02247   // double virial_slow = 0.;
02248   for ( int i=0; i<patches.size(); ++i ) {
02249     // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
02250     patch_record &pr = master->patchRecords[patches[i]];
02251 
02252     if ( !simParams->GBISOn || gbisPhase == 1 ) {
02253       patch_record &pr = master->patchRecords[patches[i]];
02254 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
02255       pr.r = pr.forceBox->open();
02256     } // !GBISOn || gbisPhase==1
02257 
02258     int start = pr.localStart;
02259     const CompAtomExt *aExt = pr.xExt;
02260     if ( !simParams->GBISOn || gbisPhase == 3 ) {
02261       finishPatch(pr);
02262     } // !GBISOn || gbisPhase == 3
02263 
02264 #if 0
02265     if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
02266       CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
02267         j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
02268         af[j].w);
02269     }
02270 #endif
02271 
02272     //Close Boxes depending on Phase
02273     if (simParams->GBISOn) {
02274       if (gbisPhase == 1) {
02275         //Copy psiSum from Host to Patch Box
02276         GBReal *psiSumMaster = master->psiSumH + start;
02277         for ( int k=0; k<pr.numAtoms; ++k ) {
02278           int j = aExt[k].sortOrder;
02279           pr.psiSum[j] += psiSumMaster[k];
02280         }
02281 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
02282         pr.psiSumBox->close(&(pr.psiSum));
02283 
02284       } else if (gbisPhase == 2) {
02285         //Copy dEdaSum from Host to Patch Box
02286         GBReal *dEdaSumMaster = master->dEdaSumH + start;
02287         for ( int k=0; k<pr.numAtoms; ++k ) {
02288           int j = aExt[k].sortOrder;
02289           pr.dEdaSum[j] += dEdaSumMaster[k];
02290         }
02291 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
02292         pr.dEdaSumBox->close(&(pr.dEdaSum));
02293 
02294       } else if (gbisPhase == 3) {
02295 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
02296         pr.intRadBox->close(&(pr.intRad)); //box 6
02297         pr.bornRadBox->close(&(pr.bornRad)); //box 7
02298         pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
02299         pr.positionBox->close(&(pr.x)); //box 0
02300         pr.forceBox->close(&(pr.r));
02301       } //end phases
02302     } else { //not GBIS
02303 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
02304       pr.positionBox->close(&(pr.x));
02305       pr.forceBox->close(&(pr.r));
02306     }
02307   }// end for
02308 
02309 #if 0
02310   virial *= (-1./6.);
02311   reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
02312   reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
02313   reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
02314   if ( doSlow ) {
02315     virial_slow *= (-1./6.);
02316     reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
02317     reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
02318     reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
02319   }
02320 #endif
02321 
02322   if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
02323        ( localHostedPatches.size() || master == this ) ) {
02324     GBISP("not finished, call again\n");
02325     return 0;  // not finished, call again
02326   }
02327 
02328   if ( master != this ) {  // finished
02329     GBISP("finished\n");
02330     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02331     atomsChanged = 0;
02332     return 1;
02333   }
02334 
02335   cuda_timer_total += kernel_time;
02336 
02337   if ( !simParams->GBISOn || gbisPhase == 3 ) {
02338 
02339     atomsChanged = 0;
02340     finishReductions();
02341 
02342   } // !GBISOn || gbisPhase==3  
02343 
02344   // Next GBIS Phase
02345 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
02346     if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
02347 
02348   GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
02349   return 1;  // finished and ready for next step
02350 }
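
In GBIS phases 1 and 2, finishWork() scatters the host-side result arrays (master->psiSumH, master->dEdaSumH) back into each patch's own arrays using the sortOrder field of CompAtomExt before closing the corresponding boxes. A minimal sketch of that sortOrder scatter-add; the types below are simplified stand-ins (float in place of GBReal), not the NAMD classes.

// Sketch of the per-patch sortOrder scatter-add done in finishWork().
#include <vector>

struct AtomExt { int sortOrder; };   // cf. CompAtomExt::sortOrder

// hostSums holds GPU results in the device (sorted) atom order;
// patchSums is indexed in the patch's own atom order.
void scatterAdd(const std::vector<float>& hostSums,
                const std::vector<AtomExt>& ext,
                std::vector<float>& patchSums) {
  for (size_t k = 0; k < ext.size(); ++k) {
    patchSums[ext[k].sortOrder] += hostSums[k];
  }
}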

void ComputeNonbondedCUDA::messageFinishPatch ( int  flindex  ) 

Definition at line 2182 of file ComputeNonbondedCUDA.C.

References activePatches, computeMgr, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::msg, patchRecords, PROXY_DATA_PRIORITY, ComputeMgr::sendNonbondedCUDASlaveEnqueuePatch(), Compute::sequence(), and ComputeNonbondedCUDA::patch_record::slave.

02182                                                          {
02183   int pid = activePatches[flindex];
02184   patch_record &pr = patchRecords[pid];
02185   //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
02186   computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
02187 }

int ComputeNonbondedCUDA::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 1433 of file ComputeNonbondedCUDA.C.

References atomsChanged, computeMgr, Flags::doEnergy, doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, doSlow, GBISP, Compute::gbisPhase, hostedPatches, Flags::lattice, lattice, master, masterPe, numSlaves, Node::Object(), patchRecords, reduction, ComputeMgr::sendNonbondedCUDASlaveReady(), ComputeMgr::sendNonbondedCUDASlaveSkip(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< Elem >::size(), skip(), slavePes, slaves, Flags::step, step, SubmitReduction::submit(), and workStarted.

01433                                  {
01434 
01435   SimParameters *simParams = Node::Object()->simParameters;
01436   Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
01437   lattice = flags.lattice;
01438   doSlow = flags.doFullElectrostatics;
01439   doEnergy = flags.doEnergy;
01440   step = flags.step;
01441 
01442   if ( ! flags.doNonbonded ) {
01443     GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
01444     if ( master != this ) {
01445       computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01446                         hostedPatches.size(),atomsChanged,sequence());
01447     } else {
01448       for ( int i = 0; i < numSlaves; ++i ) {
01449         computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
01450       }
01451       skip();
01452     }
01453     if ( reduction ) {
01454        reduction->submit();
01455     }
01456 
01457     return 1;
01458   }
01459 
01460   for ( int i=0; i<hostedPatches.size(); ++i ) {
01461     patch_record &pr = master->patchRecords[hostedPatches[i]];
01462     if (!simParams->GBISOn || gbisPhase == 1) {
01463       GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
01464       pr.x = pr.positionBox->open();
01465       pr.xExt = pr.p->getCompAtomExtInfo();
01466     }
01467 
01468     if (simParams->GBISOn) {
01469       if (gbisPhase == 1) {
01470         GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
01471         pr.intRad     = pr.intRadBox->open();
01472         pr.psiSum     = pr.psiSumBox->open();
01473       } else if (gbisPhase == 2) {
01474         GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
01475         pr.bornRad    = pr.bornRadBox->open();
01476         pr.dEdaSum    = pr.dEdaSumBox->open();
01477       } else if (gbisPhase == 3) {
01478         GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
01479         pr.dHdrPrefix = pr.dHdrPrefixBox->open();
01480       }
01481       GBISP("opened GBIS boxes");
01482     }
01483   }
01484 
01485   if ( master == this ) return 0; //work to do, enqueue as usual
01486 
01487   // message masterPe
01488   computeMgr->sendNonbondedCUDASlaveReady(masterPe,
01489                         hostedPatches.size(),atomsChanged,sequence());
01490   atomsChanged = 0;
01491 
01492   workStarted = 1;
01493 
01494   return 1;
01495 }
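
noWork() follows the Compute::noWork() convention visible above: returning 0 lets the work be enqueued as usual (the master's case), while returning 1 means nothing further is enqueued on this PE (the slave case, after it has opened its boxes and notified masterPe via sendNonbondedCUDASlaveReady()). A condensed control-flow sketch, with messaging and box handling reduced to comments; the function and parameter names are illustrative.

// Condensed sketch of the master/slave decision in noWork().
// Only the control flow is shown; messaging and boxes are elided.
int noWorkSketch(bool isMaster, bool doNonbonded) {
  if (!doNonbonded) {
    // master tells slaves to skip and skips its own boxes,
    // reductions are submitted; nothing to enqueue this step
    return 1;
  }
  // ... open position/GBIS boxes for hosted patches ...
  if (isMaster) return 0;   // work to do, enqueue doWork() as usual
  // slave: notify the master PE that its patch data is ready, then stand down
  return 1;
}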

void ComputeNonbondedCUDA::recvYieldDevice ( int  pe  ) 

Definition at line 1958 of file ComputeNonbondedCUDA.C.

References Lattice::a(), atom_params, atoms, atomsChanged, Lattice::b(), block_order, bornRadH, Lattice::c(), CcdCallBacksReset(), cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_vdw_types(), cuda_bind_virials(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), CUDA_POLL, ComputeNonbondedUtil::cutoff2, dEdaSumH, deviceCUDA, deviceID, dHdrPrefixH, doEnergy, doSlow, dummy_dev, dummy_size, end_local_download, end_remote_download, energy_gbis, force_ready_queue, force_ready_queue_next, forces, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), intRad0H, intRadSH, lattice, localActivePatches, localComputeRecords, Node::Object(), patchPairsReordered, plcutoff2, psiSumH, remote_submit_time, remoteActivePatches, remoteComputeRecords, savePairlists, Compute::sequence(), DeviceCUDA::setGpuIsMine(), Node::simParameters, simParams, ResizeArray< Elem >::size(), slow_forces, start_calc, stream, stream2, usePairlists, vdw_types, virials, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by ComputeMgr::recvYieldDevice().

01958                                                  {
01959 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
01960 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
01961 workStarted, gbisPhase, kernel_launch_state, pe)
01962 
01963   float3 lata, latb, latc;
01964   lata.x = lattice.a().x;
01965   lata.y = lattice.a().y;
01966   lata.z = lattice.a().z;
01967   latb.x = lattice.b().x;
01968   latb.y = lattice.b().y;
01969   latb.z = lattice.b().z;
01970   latc.x = lattice.c().x;
01971   latc.y = lattice.c().y;
01972   latc.z = lattice.c().z;
01973   SimParameters *simParams = Node::Object()->simParameters;
01974 
01975   // Set GPU device ID
01976   cudaSetDevice(deviceID);
01977 
01978   const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
01979 
01980   double walltime;
01981   if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
01982     walltime = CkWallTimer();
01983     CcdCallBacksReset(0,walltime);  // fix Charm++
01984   }
01985 
01986   switch ( kernel_launch_state ) {
01988 // Remote
01989   case 1:
01990 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
01991     ++kernel_launch_state;
01992     //gpu_is_mine = 0;
01993     deviceCUDA->setGpuIsMine(0);
01994     remote_submit_time = walltime;
01995 
01996     if (!simParams->GBISOn || gbisPhase == 1) {
01997       // cudaEventRecord(start_upload, stream);
01998       if ( atomsChanged ) {
01999         cuda_bind_atom_params(atom_params);
02000         cuda_bind_vdw_types(vdw_types);
02001       }
02002       if ( simParams->GBISOn) {
02003         cuda_bind_GBIS_psiSum(psiSumH);
02004         if ( atomsChanged ) {
02005           cuda_bind_GBIS_intRad(intRad0H, intRadSH);
02006         }
02007       }
02008       atomsChanged = 0;
02009       cuda_bind_atoms((const atom *)atoms);
02010       cuda_bind_forces(forces, slow_forces);
02011       cuda_bind_virials(virials, force_ready_queue, block_order);
02012       if ( simParams->GBISOn) {
02013         cuda_bind_GBIS_energy(energy_gbis);
02014       }
02015       if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02016       //call CUDA Kernels
02017 
02018       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
02019                             0,remoteComputeRecords.size(),
02020                             remoteComputeRecords.size()+localComputeRecords.size(),
02021                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream);
02022       if (simParams->GBISOn) {
02023         cuda_GBIS_P1(
02024           0,remoteComputeRecords.size(),
02025                       localActivePatches.size(),remoteActivePatches.size(),
02026                       simParams->alpha_cutoff-simParams->fsMax,
02027                       simParams->coulomb_radius_offset,
02028                       lata, latb, latc, stream);
02029       }
02030       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
02031       //    num_local_atom_records,num_remote_atom_records);
02032       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
02033         cudaEventRecord(end_remote_download, stream);
02034       }
02035       if ( streaming ) {
02036         force_ready_queue_next = 0;
02037         CUDA_POLL(cuda_check_progress,this);
02038       } else {
02039         CUDA_POLL(cuda_check_remote_progress,this);
02040       }
02041       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
02042         CUDA_POLL(cuda_check_remote_calc,this);
02043         break;
02044       }
02045     } // !GBIS or gbisPhase==1
02046     if (simParams->GBISOn) {
02047       if (gbisPhase == 1) {
02048         //GBIS P1 Kernel launched in previous code block
02049       } else if (gbisPhase == 2) {
02050 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
02051         // cudaEventRecord(start_upload, stream);
02052         cuda_bind_GBIS_bornRad(bornRadH);
02053         cuda_bind_GBIS_dEdaSum(dEdaSumH);
02054         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02055         cuda_GBIS_P2(
02056           0,remoteComputeRecords.size(),
02057           localActivePatches.size(),remoteActivePatches.size(),
02058           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
02059           simParams->nonbondedScaling, simParams->kappa,
02060           (simParams->switchingActive ? simParams->switchingDist : -1.0),
02061           simParams->dielectric, simParams->solvent_dielectric,
02062           lata, latb, latc,
02063           doEnergy, doSlow, stream
02064           );
02065         cudaEventRecord(end_remote_download, stream);
02066         CUDA_POLL(cuda_check_remote_progress,this);
02067       } else if (gbisPhase == 3) {
02068 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
02069         // cudaEventRecord(start_upload, stream);
02070         cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
02071         if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
02072         if (doSlow)
02073         cuda_GBIS_P3(
02074           0,remoteComputeRecords.size(),
02075           localActivePatches.size(),remoteActivePatches.size(),
02076           (simParams->alpha_cutoff-simParams->fsMax),
02077           simParams->coulomb_radius_offset,
02078           simParams->nonbondedScaling,
02079           lata, latb, latc, stream
02080           );
02081         cudaEventRecord(end_remote_download, stream);
02082         CUDA_POLL(cuda_check_remote_progress,this);
02083       }
02084     }
02085 
02087 // Local
02088   case 2:
02089 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
02090     ++kernel_launch_state;
02091     //gpu_is_mine = 0;
02092     deviceCUDA->setGpuIsMine(0);
02093 
02094     if ( stream2 != stream ) {
02095       // needed to ensure that upload is finished
02096       cudaStreamWaitEvent(stream2, start_calc, 0);
02097       // priorities do not prevent local from launching ahead
02098       // of remote, so delay local stream with a small memset
02099       cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
02100     }
02101 
02102     if (!simParams->GBISOn || gbisPhase == 1) {
02103 
02104       cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
02105                             remoteComputeRecords.size(),localComputeRecords.size(),
02106                             remoteComputeRecords.size()+localComputeRecords.size(),
02107                             doSlow, doEnergy, usePairlists, savePairlists, streaming, ! patchPairsReordered, stream2);
02108       if (simParams->GBISOn) {
02109         cuda_GBIS_P1(
02110                      remoteComputeRecords.size(),localComputeRecords.size(),
02111                      0,localActivePatches.size(),
02112                      simParams->alpha_cutoff-simParams->fsMax,
02113                      simParams->coulomb_radius_offset,
02114                      lata, latb, latc, stream2 );
02115       }
02116       //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
02117       //    0,num_local_atom_records);
02118       //cuda_load_virials(virials, doSlow);  // slow_virials follows virials
02119       if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
02120         cudaEventRecord(end_local_download, stream2);
02121       }
02122       if ( ! streaming && workStarted == 2 ) {
02123         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
02124 cuda_check_local_progress\n", CkMyPe())
02125         CUDA_POLL(cuda_check_local_progress,this);
02126       }
02127       if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
02128         GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
02129 cuda_check_local_calc\n", CkMyPe())
02130         CUDA_POLL(cuda_check_local_calc,this);
02131         break;
02132       }
02133 
02134     } // !GBIS or gbisPhase==1
02135     if (simParams->GBISOn) {
02136       if (gbisPhase == 1) {
02137         //GBIS P1 Kernel launched in previous code block
02138       } else if (gbisPhase == 2) {
02139 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
02140         cuda_GBIS_P2(
02141           remoteComputeRecords.size(),localComputeRecords.size(),
02142           0,localActivePatches.size(),
02143           (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
02144           simParams->nonbondedScaling, simParams->kappa,
02145           (simParams->switchingActive ? simParams->switchingDist : -1.0),
02146           simParams->dielectric, simParams->solvent_dielectric,
02147           lata, latb, latc,
02148           doEnergy, doSlow, stream2
02149           );
02150         cudaEventRecord(end_local_download, stream2);
02151         if ( workStarted == 2 ) {
02152           CUDA_POLL(cuda_check_local_progress,this);
02153         }
02154       } else if (gbisPhase == 3) {
02155 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
02156         if (doSlow)
02157         cuda_GBIS_P3(
02158           remoteComputeRecords.size(),localComputeRecords.size(),
02159           0,localActivePatches.size(),
02160           (simParams->alpha_cutoff-simParams->fsMax),
02161           simParams->coulomb_radius_offset,
02162           simParams->nonbondedScaling,
02163           lata, latb, latc, stream2
02164           );
02165         cudaEventRecord(end_local_download, stream2);
02166         if ( workStarted == 2 ) {
02167           CUDA_POLL(cuda_check_local_progress,this);
02168         }
02169       } // phases
02170     } // GBISOn
02171     if ( simParams->PMEOn && simParams->PMEOffload  && !simParams->usePMECUDA) break;
02172 
02173   default:
02174 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
02175     //gpu_is_mine = 1;
02176     deviceCUDA->setGpuIsMine(1);
02177     break;
02178   } // switch
02179 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
02180 }
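
recvYieldDevice() is a fall-through state machine on kernel_launch_state: case 1 issues the remote-patch kernels on stream, case 2 issues the local-patch kernels on stream2, and the default case hands the GPU back via setGpuIsMine(1). When two streams are in use, ordering comes from recording start_calc on stream and making stream2 wait on that event, plus a small dummy memset so the local work cannot launch ahead of the remote work. A self-contained sketch of that two-stream ordering using the same CUDA runtime calls; the buffers and the async memsets that stand in for kernel launches are illustrative.

// Two-stream ordering sketch in the spirit of the case 1 / case 2 launches
// above: uploads and remote work go to 'stream', local work to 'stream2',
// and stream2 is held back until the upload on 'stream' has been issued.
#include <cuda_runtime.h>

void launchRemoteThenLocal(cudaStream_t stream, cudaStream_t stream2,
                           char* atoms_dev, char* remote_buf, char* local_buf,
                           size_t n, char* dummy_dev, size_t dummy_size,
                           cudaEvent_t start_calc) {
  cudaMemsetAsync(atoms_dev, 0, n, stream);    // stand-in for the atom upload
  cudaEventRecord(start_calc, stream);         // completes once upload is done
  cudaMemsetAsync(remote_buf, 0, n, stream);   // stand-in for remote kernels

  // the local stream must not run before the upload; wait on the event and
  // add a small memset so local work cannot jump ahead of the remote launch
  cudaStreamWaitEvent(stream2, start_calc, 0);
  cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
  cudaMemsetAsync(local_buf, 0, n, stream2);   // stand-in for local kernels
}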

void ComputeNonbondedCUDA::registerPatches (  ) 

Definition at line 939 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ResizeArray< Elem >::begin(), ComputeNonbondedCUDA::patch_record::bornRadBox, ProxyMgr::createProxy(), ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, hostedPatches, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::intRadBox, ComputeNonbondedCUDA::patch_record::isLocal, localHostedPatches, master, masterPe, ComputeNonbondedCUDA::patch_record::msg, npatches, ProxyMgr::Object(), Node::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, PRIORITY_SIZE, ComputeNonbondedCUDA::patch_record::psiSumBox, Patch::registerBornRadPickup(), Patch::registerDEdaSumDeposit(), Patch::registerDHdrPrefixPickup(), Patch::registerForceDeposit(), Patch::registerIntRadPickup(), Patch::registerPositionPickup(), Patch::registerPsiSumDeposit(), remoteHostedPatches, Compute::setNumPatches(), Node::simParameters, simParams, ResizeArray< Elem >::size(), and ComputeNonbondedCUDA::patch_record::slave.

Referenced by assignPatches(), and ComputeNonbondedCUDA().

00939                                            {
00940 
00941   SimParameters *simParams = Node::Object()->simParameters;
00942   int npatches = master->activePatches.size();
00943   int *pids = master->activePatches.begin();
00944   patch_record *recs = master->patchRecords.begin();
00945   for ( int i=0; i<npatches; ++i ) {
00946     int pid = pids[i];
00947     patch_record &pr = recs[pid];
00948     if ( pr.hostPe == CkMyPe() ) {
00949       pr.slave = this;
00950       pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
00951       hostedPatches.add(pid);
00952       if ( pr.isLocal ) {
00953         localHostedPatches.add(pid);
00954       } else {
00955         remoteHostedPatches.add(pid);
00956       }
00957       ProxyMgr::Object()->createProxy(pid);
00958       pr.p = patchMap->patch(pid);
00959       pr.positionBox = pr.p->registerPositionPickup(this);
00960       pr.forceBox = pr.p->registerForceDeposit(this);
00961       if (simParams->GBISOn) {
00962         pr.intRadBox      = pr.p->registerIntRadPickup(this);
00963         pr.psiSumBox      = pr.p->registerPsiSumDeposit(this);
00964         pr.bornRadBox     = pr.p->registerBornRadPickup(this);
00965         pr.dEdaSumBox     = pr.p->registerDEdaSumDeposit(this);
00966         pr.dHdrPrefixBox  = pr.p->registerDHdrPrefixPickup(this);
00967       }
00968     }
00969   }
00970   if ( master == this ) setNumPatches(activePatches.size());
00971   else setNumPatches(hostedPatches.size());
00972   if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
00973   CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
00974 }

void ComputeNonbondedCUDA::requirePatch ( int  pid  ) 

Definition at line 903 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< Elem >::add(), ComputeNonbondedCUDA::patch_record::bornRad, computesChanged, ComputeNonbondedCUDA::patch_record::dEdaSum, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::f, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::index_a(), PatchMap::index_b(), PatchMap::index_c(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::isLocal, ComputeNonbondedCUDA::patch_record::isSameNode, ComputeNonbondedCUDA::patch_record::isSamePhysicalNode, localActivePatches, PatchMap::node(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::r, ComputeNonbondedCUDA::patch_record::refCount, remoteActivePatches, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

00903                                                {
00904 
00905   computesChanged = 1;
00906   patch_record &pr = patchRecords[pid];
00907   if ( pr.refCount == 0 ) {
00908     pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
00909     pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
00910     if ( deviceCUDA->getMergeGrids() ) {
00911       pr.isLocal = 0;
00912     } else if ( CkNumNodes() < 2 ) {
00913       pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
00914          patchMap->index_b(pid) ^ patchMap->index_c(pid) );
00915     } else {
00916       pr.isLocal = pr.isSameNode;
00917     }
00918     if ( pr.isLocal ) {
00919       localActivePatches.add(pid);
00920     } else {
00921       remoteActivePatches.add(pid);
00922     }
00923     activePatches.add(pid);
00924     pr.patchID = pid;
00925     pr.hostPe = -1;
00926     pr.x = NULL;
00927     pr.xExt = NULL;
00928     pr.r = NULL;
00929     pr.f = NULL;
00930     pr.intRad      = NULL;
00931     pr.psiSum      = NULL;
00932     pr.bornRad     = NULL;
00933     pr.dEdaSum     = NULL;
00934     pr.dHdrPrefix  = NULL;
00935   }
00936   pr.refCount += 1;
00937 }
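
For single-node runs (CkNumNodes() < 2, merged grids disabled), requirePatch() splits patches between the local and remote lists with a parity test on the patch's grid indices, 1 & (1 ^ index_a ^ index_b ^ index_c), giving a checkerboard assignment in which roughly half of the patches feed each kernel stream. A small sketch of that split; the 2x2 grid in main() is hypothetical.

// Checkerboard split used for single-node runs in requirePatch(): a patch
// is "local" when the XOR of its grid indices has an even low bit.
#include <cstdio>

static int isLocalPatch(int ia, int ib, int ic) {
  return 1 & (1 ^ ia ^ ib ^ ic);
}

int main() {
  // hypothetical 2x2x1 patch grid, just to show the alternation
  for (int ia = 0; ia < 2; ++ia)
    for (int ib = 0; ib < 2; ++ib)
      std::printf("patch (%d,%d,0) -> %s\n", ia, ib,
                  isLocalPatch(ia, ib, 0) ? "local" : "remote");
  return 0;
}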

void ComputeNonbondedCUDA::skip (  ) 

Definition at line 1416 of file ComputeNonbondedCUDA.C.

References hostedPatches, master, Node::Object(), patchRecords, Node::simParameters, simParams, and ResizeArray< Elem >::size().

Referenced by noWork(), and ComputeMgr::recvNonbondedCUDASlaveSkip().

01416                                 {
01417   //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
01418   SimParameters *simParams = Node::Object()->simParameters;
01419   for ( int i=0; i<hostedPatches.size(); ++i ) {
01420     patch_record &pr = master->patchRecords[hostedPatches[i]];
01421     pr.positionBox->skip();
01422     pr.forceBox->skip();
01423     if (simParams->GBISOn) {
01424       pr.intRadBox->skip();
01425       pr.psiSumBox->skip();
01426       pr.bornRadBox->skip();
01427       pr.dEdaSumBox->skip();
01428       pr.dHdrPrefixBox->skip();
01429     }
01430   }
01431 }
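
skip() upholds the dependency-box contract implied by registerPatches(): every box registered for a hosted patch must either be opened and closed (as in noWork()/finishWork()) or explicitly skipped on steps with no nonbonded work. A toy sketch of that lifecycle with a minimal stand-in box type; this is not NAMD's Box< Owner, Data > template, only an illustration of the open/close-or-skip protocol.

// Toy stand-in for the dependency-box lifecycle followed by
// registerPatches()/noWork()/finishWork()/skip().
#include <cassert>

template <class Data>
class ToyBox {
  Data* data_ = nullptr;
  bool  open_ = false;
public:
  explicit ToyBox(Data* d) : data_(d) {}
  Data* open()          { assert(!open_); open_ = true;  return data_; }
  void  close(Data** p) { assert(open_);  *p = nullptr;  open_ = false; }
  void  skip()          { assert(!open_); /* step consumed, data untouched */ }
};

int main() {
  double forces[3] = {};
  ToyBox<double> forceBox(forces);

  // normal step: open, use, close (cf. noWork()/finishWork())
  double* f = forceBox.open();
  f[0] += 1.0;
  forceBox.close(&f);

  // skipped step: no nonbonded work, but the box must still be consumed
  forceBox.skip();
  return 0;
}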


Member Data Documentation

ResizeArray<int> ComputeNonbondedCUDA::activePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), doWork(), finishPatch(), messageFinishPatch(), registerPatches(), and requirePatch().

AtomMap* ComputeNonbondedCUDA::atomMap

Definition at line 119 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

CudaAtom* ComputeNonbondedCUDA::atoms

Definition at line 142 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::atoms_size

Definition at line 141 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::atomsChanged

Definition at line 131 of file ComputeNonbondedCUDA.h.

Referenced by atomUpdate(), ComputeNonbondedCUDA(), doWork(), noWork(), and recvYieldDevice().

ResizeArray<compute_record> ComputeNonbondedCUDA::computeRecords

Definition at line 98 of file ComputeNonbondedCUDA.h.

Referenced by doWork().

int ComputeNonbondedCUDA::computesChanged

Definition at line 132 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and requirePatch().

GBReal* ComputeNonbondedCUDA::dEdaSumH

Definition at line 111 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::dEdaSumH_size

Definition at line 110 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::deviceID

Definition at line 116 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doEnergy

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by noWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doSlow

Definition at line 80 of file ComputeNonbondedCUDA.h.

Referenced by noWork(), and recvYieldDevice().

float4* ComputeNonbondedCUDA::forces

Definition at line 102 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::forces_size

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

ResizeArray<int> ComputeNonbondedCUDA::hostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), registerPatches(), and skip().

Lattice ComputeNonbondedCUDA::lattice

Definition at line 79 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), and recvYieldDevice().

ResizeArray<int> ComputeNonbondedCUDA::localActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::localComputeRecords

Definition at line 99 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

ResizeArray<int> ComputeNonbondedCUDA::localHostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), registerPatches(), and ComputeMgr::sendNonbondedCUDASlaveEnqueue().

LocalWorkMsg* ComputeNonbondedCUDA::localWorkMsg2

Definition at line 76 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and ComputeMgr::sendNonbondedCUDASlaveEnqueue().

ComputeNonbondedCUDA* ComputeNonbondedCUDA::master

Definition at line 124 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishPatch(), finishWork(), noWork(), registerPatches(), and skip().

int ComputeNonbondedCUDA::masterPe

Definition at line 125 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), noWork(), and registerPatches().

int ComputeNonbondedCUDA::numSlaves

Definition at line 129 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), finishWork(), and noWork().

int ComputeNonbondedCUDA::pairlistsValid

Definition at line 135 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

float ComputeNonbondedCUDA::pairlistTolerance

Definition at line 136 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

PatchMap* ComputeNonbondedCUDA::patchMap

Definition at line 118 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), register_cuda_compute_pair(), registerPatches(), and requirePatch().

int ComputeNonbondedCUDA::patchPairsReordered

Definition at line 133 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

ResizeArray<patch_record> ComputeNonbondedCUDA::patchRecords

Definition at line 97 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), finishPatch(), finishWork(), messageFinishPatch(), noWork(), register_cuda_compute_pair(), register_cuda_compute_self(), registerPatches(), requirePatch(), and skip().

float ComputeNonbondedCUDA::plcutoff2

Definition at line 139 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and recvYieldDevice().

GBReal* ComputeNonbondedCUDA::psiSumH

Definition at line 108 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::psiSumH_size

Definition at line 107 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

SubmitReduction* ComputeNonbondedCUDA::reduction

Definition at line 120 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), doWork(), and noWork().

ResizeArray<int> ComputeNonbondedCUDA::remoteActivePatches

Definition at line 95 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::remoteComputeRecords

Definition at line 99 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), register_cuda_compute_pair(), and register_cuda_compute_self().

ResizeArray<int> ComputeNonbondedCUDA::remoteHostedPatches

Definition at line 96 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), and registerPatches().

int ComputeNonbondedCUDA::savePairlists

Definition at line 138 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slaveIndex

Definition at line 126 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

int* ComputeNonbondedCUDA::slavePes

Definition at line 128 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

ComputeNonbondedCUDA** ComputeNonbondedCUDA::slaves

Definition at line 127 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

float4* ComputeNonbondedCUDA::slow_forces

Definition at line 105 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slow_forces_size

Definition at line 104 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::step

Definition at line 81 of file ComputeNonbondedCUDA.h.

Referenced by noWork().

int ComputeNonbondedCUDA::usePairlists

Definition at line 137 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::workStarted

Definition at line 78 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), noWork(), and recvYieldDevice().


The documentation for this class was generated from the following files:
ComputeNonbondedCUDA.h
ComputeNonbondedCUDA.C