NAMD
ComputeNonbondedCUDA Class Reference

#include <ComputeNonbondedCUDA.h>

Inheritance diagram for ComputeNonbondedCUDA: inherits Compute and ComputeNonbondedUtil.

Classes

struct  compute_record
 
struct  patch_record
 

Public Member Functions

 ComputeNonbondedCUDA (ComputeID c, ComputeMgr *mgr, ComputeNonbondedCUDA *m=0, int idx=-1)
 
 ~ComputeNonbondedCUDA ()
 
void atomUpdate ()
 
void doWork ()
 
int noWork ()
 
void skip ()
 
void recvYieldDevice (int pe)
 
int finishWork ()
 
void finishReductions ()
 
void finishPatch (int)
 
void messageFinishPatch (int)
 
void requirePatch (int pid)
 
void assignPatches ()
 
void registerPatches ()
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
virtual void initialize ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 

Static Public Member Functions

static void build_lj_table ()
 
static void build_force_table ()
 
static void build_exclusions ()
 

Public Attributes

LocalWorkMsg * localWorkMsg2
 
int workStarted
 
Lattice lattice
 
int doSlow
 
int doEnergy
 
int step
 
ResizeArray< int > activePatches
 
ResizeArray< int > localActivePatches
 
ResizeArray< int > remoteActivePatches
 
ResizeArray< int > hostedPatches
 
ResizeArray< int > localHostedPatches
 
ResizeArray< int > remoteHostedPatches
 
ResizeArray< patch_record > patchRecords
 
ResizeArray< compute_record > computeRecords
 
ResizeArray< compute_record > localComputeRecords
 
ResizeArray< compute_record > remoteComputeRecords
 
int forces_size
 
float4 * forces
 
int slow_forces_size
 
float4 * slow_forces
 
int psiSumH_size
 
GBReal * psiSumH
 
int dEdaSumH_size
 
GBReal * dEdaSumH
 
int deviceID
 
PatchMap * patchMap
 
AtomMap * atomMap
 
SubmitReduction * reduction
 
ComputeNonbondedCUDA * master
 
int masterPe
 
int slaveIndex
 
ComputeNonbondedCUDA ** slaves
 
int * slavePes
 
int numSlaves
 
int atomsChanged
 
int computesChanged
 
int patchPairsReordered
 
int pairlistsValid
 
float pairlistTolerance
 
int usePairlists
 
int savePairlists
 
float plcutoff2
 
int atoms_size
 
CudaAtom * atoms
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 

Additional Inherited Members

- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 25 of file ComputeNonbondedCUDA.h.

Constructor & Destructor Documentation

ComputeNonbondedCUDA::ComputeNonbondedCUDA ( ComputeID  c,
ComputeMgr *  mgr,
ComputeNonbondedCUDA *  m = 0,
int  idx = -1 
)

Definition at line 437 of file ComputeNonbondedCUDA.C.

References atomMap, atoms, atoms_size, atomsChanged, Compute::basePriority, build_exclusions(), computeMgr, computesChanged, cuda_errcheck(), cuda_init(), cudaCompute, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, end_local_download, end_remote_download, forces, forces_size, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), init_arrays(), localWorkMsg2, master, masterPe, max_grid_size, NAMD_bug(), NAMD_die(), PatchMap::numPatches(), PatchMap::Object(), AtomMap::Object(), Node::Object(), pairlistsValid, pairlistTolerance, patch_pair_num_ptr, patch_pairs_ptr, patchMap, patchPairsReordered, patchRecords, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, SimParameters::pressureProfileOn, PRIORITY_SIZE, PROXY_DATA_PRIORITY, psiSumH, psiSumH_size, reduction, registerPatches(), savePairlists, DeviceCUDA::setGpuIsMine(), DeviceCUDA::setMergeGrids(), Node::simParameters, slaveIndex, slavePes, slaves, slow_forces, slow_forces_size, start_calc, stream, stream2, usePairlists, SimParameters::usePMECUDA, and workStarted.

438  : Compute(c), slaveIndex(idx) {
439 #ifdef PRINT_GBIS
440  CkPrintf("C.N.CUDA[%d]::constructor cid=%d\n", CkMyPe(), c);
441 #endif
442 
443  if ( sizeof(patch_pair) & 15 ) NAMD_bug("sizeof(patch_pair) % 16 != 0");
444  if ( sizeof(atom) & 15 ) NAMD_bug("sizeof(atom) % 16 != 0");
445  if ( sizeof(atom_param) & 15 ) NAMD_bug("sizeof(atom_param) % 16 != 0");
446 
447  // CkPrintf("create ComputeNonbondedCUDA\n");
448  master = m ? m : this;
449  cudaCompute = this;
450  computeMgr = mgr;
453  reduction = 0;
454 
456  if (params->pressureProfileOn) {
457  NAMD_die("pressure profile not supported in CUDA");
458  }
459 
460  atomsChanged = 1;
461  computesChanged = 1;
463 
464  pairlistsValid = 0;
465  pairlistTolerance = 0.;
466  usePairlists = 0;
467  savePairlists = 0;
468  plcutoff2 = 0.;
469 
470  workStarted = 0;
473 
474  // Zero array sizes and pointers
475  init_arrays();
476 
477  atoms_size = 0;
478  atoms = NULL;
479 
480  forces_size = 0;
481  forces = NULL;
482 
483  slow_forces_size = 0;
484  slow_forces = NULL;
485 
486  psiSumH_size = 0;
487  psiSumH = NULL;
488 
489  dEdaSumH_size = 0;
490  dEdaSumH = NULL;
491 
492  if ( master != this ) { // I am slave
494  master->slaves[slaveIndex] = this;
495  if ( master->slavePes[slaveIndex] != CkMyPe() ) {
496  NAMD_bug("ComputeNonbondedCUDA slavePes[slaveIndex] != CkMyPe");
497  }
499  registerPatches();
500  return;
501  }
502  masterPe = CkMyPe();
503 
504  const bool streaming = ! (deviceCUDA->getNoStreaming() || params->GBISOn);
505  if ( streaming && ! deviceCUDA->getSharedGpu() && ! deviceCUDA->getNoMergeGrids() )
507 
508  // Sanity checks for New CUDA kernel
509  if (params->GBISOn) {
510  // GBIS
511  if (deviceCUDA->getNoMergeGrids()) {
512  NAMD_die("CUDA kernel cannot use +nomergegrids with GBIS simulations");
513  }
514  // Set mergegrids ON as long as user hasn't defined +nomergegrids
516  // Final sanity check
517  if (!deviceCUDA->getMergeGrids()) NAMD_die("CUDA GBIS kernel final sanity check failed");
518  } else {
519  // non-GBIS
521  NAMD_die("CUDA kernel requires +mergegrids if +nostreaming is used");
522  }
523  }
524 
525 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
526  int leastPriority, greatestPriority;
527  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
528  cuda_errcheck("in cudaDeviceGetStreamPriorityRange");
529  if ( leastPriority != greatestPriority ) {
530  if ( CkMyNode() == 0 ) {
531  int dev = deviceCUDA->getDeviceID();
532  CkPrintf("CUDA device %d stream priority range %d %d\n", dev, leastPriority, greatestPriority);
533  }
534  if ( deviceCUDA->getMergeGrids() && params->PMEOn && params->PMEOffload && !params->usePMECUDA) {
535  greatestPriority = leastPriority;
536  }
537  if (params->usePMECUDA) greatestPriority = leastPriority;
538  cudaStreamCreateWithPriority(&stream,cudaStreamDefault,greatestPriority);
539  cudaStreamCreateWithPriority(&stream2,cudaStreamDefault,leastPriority);
540  } else
541 #endif
542  {
543  cudaStreamCreate(&stream);
544  cuda_errcheck("cudaStreamCreate");
545  int dev = deviceCUDA->getDeviceID();
546  cudaDeviceProp deviceProp;
547  cudaGetDeviceProperties(&deviceProp, dev);
548  cuda_errcheck("cudaGetDeviceProperties");
549  if ( deviceProp.concurrentKernels && deviceProp.major > 2 ) {
550  if ( CkMyNode() == 0 ) CkPrintf("CUDA device %d supports concurrent kernels.\n", dev);
551  cudaStreamCreate(&stream2);
552  } else {
553  stream2 = stream;
554  }
555  }
556  cuda_errcheck("cudaStreamCreate");
557 
558  // Get GPU device ID
560 
561  cuda_init();
562  if ( max_grid_size < 65535 ) NAMD_bug("bad CUDA max_grid_size");
564  // cudaEventCreate(&start_upload);
565  cudaEventCreateWithFlags(&start_calc,cudaEventDisableTiming);
566  cudaEventCreateWithFlags(&end_remote_download,cudaEventDisableTiming);
567  cudaEventCreateWithFlags(&end_local_download,cudaEventDisableTiming);
568 
569  patchRecords.resize(patchMap->numPatches());
572 
573  if ( params->PMEOn && params->PMEOffload && !params->usePMECUDA) deviceCUDA->setGpuIsMine(0);
574 }
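
The stream setup above follows a standard CUDA pattern. As a hedged, standalone sketch (plain CUDA runtime API, not NAMD source): query the device's stream priority range, create one high-priority and one low-priority stream, and fall back to a single stream when priorities are unavailable. Note that numerically smaller values mean higher priority, so greatestPriority is the smaller of the two numbers returned.

  // Standalone sketch of the prioritized-stream pattern used above.
  #include <cstdio>
  #include <cuda_runtime.h>

  int main() {
    int leastPriority, greatestPriority;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
    cudaStream_t stream, stream2;
    if ( leastPriority != greatestPriority ) {
      // prioritized path, as in the constructor above
      cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatestPriority);
      cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, leastPriority);
    } else {
      // no priority support: share a single default-priority stream
      cudaStreamCreate(&stream);
      stream2 = stream;
    }
    printf("stream priority range [%d, %d]\n", leastPriority, greatestPriority);
    if ( stream2 != stream ) cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream);
    return 0;
  }
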
ComputeNonbondedCUDA::~ComputeNonbondedCUDA ( )

Definition at line 577 of file ComputeNonbondedCUDA.C.

577 { ; }

Member Function Documentation

void ComputeNonbondedCUDA::assignPatches ( )

Definition at line 661 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), computeMgr, deviceCUDA, DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::node(), numSlaves, ReductionMgr::Object(), PatchMap::ObjectOnPe(), patchMap, patchRecords, reduction, REDUCTIONS_BASIC, registerPatches(), ComputeMgr::sendCreateNonbondedCUDASlave(), ResizeArray< T >::size(), slavePes, slaves, sort, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

661  {
662 
663  int *pesOnNodeSharingDevice = new int[CkMyNodeSize()];
664  int numPesOnNodeSharingDevice = 0;
665  int masterIndex = -1;
666  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
667  int pe = deviceCUDA->getPesSharingDevice(i);
668  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
669  if ( CkNodeOf(pe) == CkMyNode() ) {
670  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
671  }
672  }
673 
674  int npatches = activePatches.size();
675 
676  if ( npatches ) {
678  }
679 
680  // calculate priority rank of local home patch within pe
681  {
682  ResizeArray< ResizeArray<int> > homePatchByRank(CkMyNodeSize());
683  for ( int i=0; i<npatches; ++i ) {
684  int pid = activePatches[i];
685  int homePe = patchMap->node(pid);
686  if ( CkNodeOf(homePe) == CkMyNode() ) {
687  homePatchByRank[CkRankOf(homePe)].add(pid);
688  }
689  }
690  for ( int i=0; i<CkMyNodeSize(); ++i ) {
692  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
693  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
694  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
695  int pid = homePatchByRank[i][j];
696  patchRecords[pid].reversePriorityRankInPe = j + masterBoost;
697  }
698  }
699  }
700 
701  int *count = new int[npatches];
702  memset(count, 0, sizeof(int)*npatches);
703  int *pcount = new int[numPesOnNodeSharingDevice];
704  memset(pcount, 0, sizeof(int)*numPesOnNodeSharingDevice);
705  int *rankpcount = new int[CkMyNodeSize()];
706  memset(rankpcount, 0, sizeof(int)*CkMyNodeSize());
707  char *table = new char[npatches*numPesOnNodeSharingDevice];
708  memset(table, 0, npatches*numPesOnNodeSharingDevice);
709 
710  int unassignedpatches = npatches;
711 
712  if ( 0 ) { // assign all to device pe
713  for ( int i=0; i<npatches; ++i ) {
714  int pid = activePatches[i];
715  patch_record &pr = patchRecords[pid];
716  pr.hostPe = CkMyPe();
717  }
718  unassignedpatches = 0;
719  pcount[masterIndex] = npatches;
720  } else
721 
722  // assign if home pe and build table of natural proxies
723  for ( int i=0; i<npatches; ++i ) {
724  int pid = activePatches[i];
725  patch_record &pr = patchRecords[pid];
726  int homePe = patchMap->node(pid);
727  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
728  int pe = pesOnNodeSharingDevice[j];
729  if ( pe == homePe ) {
730  pr.hostPe = pe; --unassignedpatches;
731  pcount[j] += 1;
732  }
733  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
734  table[i*numPesOnNodeSharingDevice+j] = 1;
735  }
736  }
737  if ( pr.hostPe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
738  pr.hostPe = homePe; --unassignedpatches;
739  rankpcount[CkRankOf(homePe)] += 1;
740  }
741  }
742  // assign if only one pe has a required proxy
743  int assignj = 0;
744  for ( int i=0; i<npatches; ++i ) {
745  int pid = activePatches[i];
746  patch_record &pr = patchRecords[pid];
747  if ( pr.hostPe != -1 ) continue;
748  int c = 0;
749  int lastj;
750  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
751  if ( table[i*numPesOnNodeSharingDevice+j] ) { ++c; lastj=j; }
752  }
753  count[i] = c;
754  if ( c == 1 ) {
755  pr.hostPe = pesOnNodeSharingDevice[lastj];
756  --unassignedpatches;
757  pcount[lastj] += 1;
758  }
759  }
760  while ( unassignedpatches ) {
761  int i;
762  for ( i=0; i<npatches; ++i ) {
763  if ( ! table[i*numPesOnNodeSharingDevice+assignj] ) continue;
764  int pid = activePatches[i];
765  patch_record &pr = patchRecords[pid];
766  if ( pr.hostPe != -1 ) continue;
767  pr.hostPe = pesOnNodeSharingDevice[assignj];
768  --unassignedpatches;
769  pcount[assignj] += 1;
770  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
771  break;
772  }
773  if ( i<npatches ) continue; // start search again
774  for ( i=0; i<npatches; ++i ) {
775  int pid = activePatches[i];
776  patch_record &pr = patchRecords[pid];
777  if ( pr.hostPe != -1 ) continue;
778  if ( count[i] ) continue;
779  pr.hostPe = pesOnNodeSharingDevice[assignj];
780  --unassignedpatches;
781  pcount[assignj] += 1;
782  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
783  break;
784  }
785  if ( i<npatches ) continue; // start search again
786  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
787  }
788 
789  for ( int i=0; i<npatches; ++i ) {
790  int pid = activePatches[i];
791  patch_record &pr = patchRecords[pid];
792  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), pid, pr.hostPe);
793  }
794 
795  slavePes = new int[CkMyNodeSize()];
796  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
797  numSlaves = 0;
798  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
799  int pe = pesOnNodeSharingDevice[j];
800  int rank = pe - CkNodeFirst(CkMyNode());
801  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
802  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
803  if ( pe == CkMyPe() ) continue;
804  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
805  rankpcount[rank] = 0; // skip in rank loop below
806  slavePes[numSlaves] = pe;
808  ++numSlaves;
809  }
810  for ( int j=0; j<CkMyNodeSize(); ++j ) {
811  int pe = CkNodeFirst(CkMyNode()) + j;
812  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
813  // CkMyPe(),j,pe,rankpcount[j]);
814  if ( ! rankpcount[j] ) continue;
815  if ( pe == CkMyPe() ) continue;
816  slavePes[numSlaves] = pe;
818  ++numSlaves;
819  }
820  registerPatches();
821 
822  delete [] pesOnNodeSharingDevice;
823  delete [] count;
824  delete [] pcount;
825  delete [] rankpcount;
826  delete [] table;
827 }
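
The assignment above proceeds in stages: patches hosted by their home pe first, then patches with a single natural proxy, then a round-robin sweep over the pes sharing the device. A condensed sketch of that final sweep, with hypothetical names (the real code above additionally tracks per-pe counts and a separate per-rank fallback):

  // Condensed sketch of the round-robin fallback: deal remaining patches
  // out to the pes sharing the device, preferring pes that already hold
  // a proxy for the patch (table[i*npes+j] != 0).
  void assignRemaining(int npatches, int npes, const char *table, int *hostPe) {
    int unassigned = 0;
    for ( int i=0; i<npatches; ++i )
      if ( hostPe[i] == -1 ) ++unassigned;
    int j = 0;  // next pe in round-robin order
    while ( unassigned ) {
      bool assigned = false;
      for ( int i=0; i<npatches && !assigned; ++i ) {
        if ( hostPe[i] != -1 ) continue;
        if ( ! table[i*npes + j] ) continue;  // pe j holds no proxy for patch i
        hostPe[i] = j; --unassigned; assigned = true;
      }
      if ( ! assigned ) {  // no proxy match anywhere: assign regardless
        for ( int i=0; i<npatches && !assigned; ++i ) {
          if ( hostPe[i] != -1 ) continue;
          hostPe[i] = j; --unassigned; assigned = true;
        }
      }
      if ( ++j == npes ) j = 0;
    }
  }
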
void ComputeNonbondedCUDA::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 1042 of file ComputeNonbondedCUDA.C.

References atomsChanged.

1042  {
1043  //fprintf(stderr, "%d ComputeNonbondedCUDA::atomUpdate\n",CkMyPe());
1044  atomsChanged = 1;
1045 }
void ComputeNonbondedCUDA::build_exclusions ( )
static

Definition at line 259 of file ComputeNonbondedCUDA.C.

References ResizeArray< Elem >::add(), ResizeArray< T >::add(), ResizeArray< T >::begin(), cuda_bind_exclusions(), ResizeArray< T >::end(), exclusionsByAtom, ExclusionSignature::fullExclCnt, ExclusionSignature::fullOffset, Molecule::get_full_exclusions_for_atom(), ObjectArena< Type >::getNewArray(), MAX_EXCLUSIONS, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), NAMD_die(), Molecule::numAtoms, Node::Object(), ResizeArray< Elem >::resize(), SET_EXCL, ResizeArray< T >::size(), and SortableResizeArray< Type >::sort().

Referenced by build_cuda_exclusions(), and ComputeNonbondedCUDA().

259  {
261 
262 #ifdef MEM_OPT_VERSION
263  int natoms = mol->exclSigPoolSize;
264 #else
265  int natoms = mol->numAtoms;
266 #endif
267 
268  delete [] exclusionsByAtom;
269  exclusionsByAtom = new int2[natoms];
270 
271  // create unique sorted lists
272 
273  ObjectArena<int32> listArena;
274  ResizeArray<int32*> unique_lists;
275  int32 **listsByAtom = new int32*[natoms];
277  for ( int i=0; i<natoms; ++i ) {
278  curList.resize(0);
279  curList.add(0); // always excluded from self
280 #ifdef MEM_OPT_VERSION
281  const ExclusionSignature *sig = mol->exclSigPool + i;
282  int n = sig->fullExclCnt;
283  for ( int j=0; j<n; ++j ) { curList.add(sig->fullOffset[j]); }
284  n += 1;
285 #else
286  const int32 *mol_list = mol->get_full_exclusions_for_atom(i);
287  int n = mol_list[0] + 1;
288  for ( int j=1; j<n; ++j ) {
289  curList.add(mol_list[j] - i);
290  }
291 #endif
292  curList.sort();
293 
294  int j;
295  for ( j=0; j<unique_lists.size(); ++j ) {
296  if ( n != unique_lists[j][0] ) continue; // no match
297  int k;
298  for ( k=0; k<n; ++k ) {
299  if ( unique_lists[j][k+3] != curList[k] ) break;
300  }
301  if ( k == n ) break; // found match
302  }
303  if ( j == unique_lists.size() ) { // no match
304  int32 *list = listArena.getNewArray(n+3);
305  list[0] = n;
306  int maxdiff = 0;
307  maxdiff = -1 * curList[0];
308  if ( curList[n-1] > maxdiff ) maxdiff = curList[n-1];
309  list[1] = maxdiff;
310  for ( int k=0; k<n; ++k ) {
311  list[k+3] = curList[k];
312  }
313  unique_lists.add(list);
314  }
315  listsByAtom[i] = unique_lists[j];
316  }
317  // sort lists by maxdiff
318  std::stable_sort(unique_lists.begin(), unique_lists.end(), exlist_sortop());
319  long int totalbits = 0;
320  int nlists = unique_lists.size();
321  for ( int j=0; j<nlists; ++j ) {
322  int32 *list = unique_lists[j];
323  int maxdiff = list[1];
324  list[2] = totalbits + maxdiff;
325  totalbits += 2*maxdiff + 1;
326  }
327  for ( int i=0; i<natoms; ++i ) {
328  exclusionsByAtom[i].x = listsByAtom[i][1]; // maxdiff
329  exclusionsByAtom[i].y = listsByAtom[i][2]; // start
330  }
331  delete [] listsByAtom;
332 
333  if ( totalbits & 31 ) totalbits += ( 32 - ( totalbits & 31 ) );
334 
335  {
336  long int bytesneeded = totalbits / 8;
337  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
338  CkPrintf("Info: Found %d unique exclusion lists needing %ld bytes\n",
339  unique_lists.size(), bytesneeded);
340  }
341 
342  long int bytesavail = MAX_EXCLUSIONS * sizeof(unsigned int);
343  if ( bytesneeded > bytesavail ) {
344  char errmsg[512];
345  sprintf(errmsg,"Found %d unique exclusion lists needing %ld bytes "
346  "but only %ld bytes can be addressed with 32-bit int.",
347  unique_lists.size(), bytesneeded, bytesavail);
348  NAMD_die(errmsg);
349  }
350  }
351 
352 #define SET_EXCL(EXCL,BASE,DIFF) \
353  (EXCL)[((BASE)+(DIFF))>>5] |= (1<<(((BASE)+(DIFF))&31))
354 
355  unsigned int *exclusion_bits = new unsigned int[totalbits/32];
356  memset(exclusion_bits, 0, totalbits/8);
357 
358  long int base = 0;
359  for ( int i=0; i<unique_lists.size(); ++i ) {
360  base += unique_lists[i][1];
361  if ( unique_lists[i][2] != (int32)base ) {
362  NAMD_bug("ComputeNonbondedCUDA::build_exclusions base != stored");
363  }
364  int n = unique_lists[i][0];
365  for ( int j=0; j<n; ++j ) {
366  SET_EXCL(exclusion_bits,base,unique_lists[i][j+3]);
367  }
368  base += unique_lists[i][1] + 1;
369  }
370 
371  cuda_bind_exclusions(exclusion_bits, totalbits/32);
372 
373  delete [] exclusion_bits;
374 }
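
For reference, a sketch of the lookup that matches the SET_EXCL packing above (illustrative host-side code; the kernel performs the analogous bit test on the bound array): atom i carries a (start, maxdiff) pair from exclusionsByAtom, and the exclusion bit for the pair (i, j) lives at bit index start + (j - i).

  // Illustrative lookup matching the SET_EXCL layout built above.
  inline bool is_excluded(const unsigned int *excl_bits,
                          int start, int maxdiff, int i, int j) {
    int diff = j - i;
    if ( diff < -maxdiff || diff > maxdiff ) return false;  // outside stored span
    unsigned int pos = (unsigned int)(start + diff);
    return (excl_bits[pos >> 5] >> (pos & 31)) & 1;
  }
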
void ComputeNonbondedCUDA::build_force_table ( )
static

Definition at line 114 of file ComputeNonbondedCUDA.C.

References cuda_bind_force_table(), ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::fast_table, FORCE_TABLE_SIZE, ComputeNonbondedUtil::r2_delta, ComputeNonbondedUtil::r2_delta_exp, ComputeNonbondedUtil::r2_table, ComputeNonbondedUtil::scor_table, ComputeNonbondedUtil::vdwa_table, and ComputeNonbondedUtil::vdwb_table.

Referenced by build_cuda_force_table().

114  { // static
115 
116  float4 t[FORCE_TABLE_SIZE];
117  float4 et[FORCE_TABLE_SIZE]; // energy table
118 
121  // const int r2_delta_expc = 64 * (r2_delta_exp - 127);
122  const int r2_delta_expc = 64 * (r2_delta_exp - 1023);
123 
124  double r2list[FORCE_TABLE_SIZE]; // double to match cpu code
125  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
126  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
127  r2list[i] = r*r + r2_delta;
128  }
129 
130  union { double f; int32 i[2]; } byte_order_test;
131  byte_order_test.f = 1.0; // should occupy high-order bits only
132  int32 *r2iilist = (int32*)r2list + ( byte_order_test.i[0] ? 0 : 1 );
133 
134  for ( int i=1; i<FORCE_TABLE_SIZE; ++i ) {
135  double r = ((double) FORCE_TABLE_SIZE) / ( (double) i + 0.5 );
136  int table_i = (r2iilist[2*i] >> 14) + r2_delta_expc; // table_i >= 0
137 
138  if ( r > cutoff ) {
139  t[i].x = 0.;
140  t[i].y = 0.;
141  t[i].z = 0.;
142  t[i].w = 0.;
143  et[i].x = 0.;
144  et[i].y = 0.;
145  et[i].z = 0.;
146  et[i].w = 0.;
147  continue;
148  }
149 
150  BigReal diffa = r2list[i] - r2_table[table_i];
151 
152  // coulomb 1/r or fast force
153  // t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
154  {
155  BigReal table_a = fast_table[4*table_i];
156  BigReal table_b = fast_table[4*table_i+1];
157  BigReal table_c = fast_table[4*table_i+2];
158  BigReal table_d = fast_table[4*table_i+3];
159  BigReal grad =
160  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
161  t[i].x = 2. * grad;
162  BigReal ener = table_a + diffa *
163  ( ( table_d * diffa + table_c ) * diffa + table_b);
164  et[i].x = ener;
165  }
166 
167 
168  // pme correction for slow force
169  // t[i].w = 0.;
170  {
171  BigReal table_a = scor_table[4*table_i];
172  BigReal table_b = scor_table[4*table_i+1];
173  BigReal table_c = scor_table[4*table_i+2];
174  BigReal table_d = scor_table[4*table_i+3];
175  BigReal grad =
176  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
177  t[i].w = 2. * grad;
178  BigReal ener = table_a + diffa *
179  ( ( table_d * diffa + table_c ) * diffa + table_b);
180  et[i].w = ener;
181  }
182 
183 
184  // vdw 1/r^6
185  // t[i].y = 6. / (r8); // -1/r * d/dr r^-6
186  {
187  BigReal table_a = vdwb_table[4*table_i];
188  BigReal table_b = vdwb_table[4*table_i+1];
189  BigReal table_c = vdwb_table[4*table_i+2];
190  BigReal table_d = vdwb_table[4*table_i+3];
191  BigReal grad =
192  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
193  t[i].y = 2. * -1. * grad;
194  BigReal ener = table_a + diffa *
195  ( ( table_d * diffa + table_c ) * diffa + table_b);
196  et[i].y = -1. * ener;
197  }
198 
199 
200  // vdw 1/r^12
201  // t[i].z = 12e / (r8 * r4 * r2); // -1/r * d/dr r^-12
202  {
203  BigReal table_a = vdwa_table[4*table_i];
204  BigReal table_b = vdwa_table[4*table_i+1];
205  BigReal table_c = vdwa_table[4*table_i+2];
206  BigReal table_d = vdwa_table[4*table_i+3];
207  BigReal grad =
208  ( 3. * table_d * diffa + 2. * table_c ) * diffa + table_b;
209  t[i].z = 2. * grad;
210  BigReal ener = table_a + diffa *
211  ( ( table_d * diffa + table_c ) * diffa + table_b);
212  et[i].z = ener;
213  }
214 
215  // CkPrintf("%d %g %g %g %g %g %g\n", i, r, diffa,
216  // t[i].x, t[i].y, t[i].z, t[i].w);
217 
218 /*
219  double r2 = r * r;
220  double r4 = r2 * r2;
221  double r8 = r4 * r4;
222 
223  t[i].x = 1. / (r2 * r); // -1/r * d/dr r^-1
224  t[i].y = 6. / (r8); // -1/r * d/dr r^-6
225  t[i].z = 12. / (r8 * r4 * r2); // -1/r * d/dr r^-12
226  t[i].w = 0.;
227 */
228  }
229 
230  t[0].x = 0.f;
231  t[0].y = 0.f;
232  t[0].z = 0.f;
233  t[0].w = 0.f;
234  et[0].x = et[1].x;
235  et[0].y = et[1].y;
236  et[0].z = et[1].z;
237  et[0].w = et[1].w;
238 
239  cuda_bind_force_table(t,et);
240 
241  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
242  CkPrintf("Info: Updated CUDA force table with %d elements.\n", FORCE_TABLE_SIZE);
243  }
244 }
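
All four table blocks above evaluate the same cubic spline on (a, b, c, d) coefficient quadruples at x = diffa = r2list[i] - r2_table[table_i]; the force entries store twice the gradient. A hypothetical helper factoring out that repeated evaluation:

  // energy(x) = a + b x + c x^2 + d x^3, gradient = b + 2 c x + 3 d x^2,
  // evaluated at x = diffa, exactly as in the loops above.
  struct SplineVal { double energy, grad; };

  inline SplineVal eval_cubic(const double *tbl, int table_i, double diffa) {
    double a = tbl[4*table_i],   b = tbl[4*table_i+1];
    double c = tbl[4*table_i+2], d = tbl[4*table_i+3];
    SplineVal v;
    v.grad   = ( 3. * d * diffa + 2. * c ) * diffa + b;
    v.energy = a + diffa * ( ( d * diffa + c ) * diffa + b );
    return v;
  }
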
void ComputeNonbondedCUDA::build_lj_table ( )
static

Definition at line 87 of file ComputeNonbondedCUDA.C.

References LJTable::TableEntry::A, LJTable::TableEntry::B, cuda_bind_lj_table(), LJTable::get_table_dim(), ComputeNonbondedUtil::ljTable, NAMD_bug(), ComputeNonbondedUtil::scaling, and LJTable::table_val().

Referenced by build_cuda_force_table().

87  { // static
88 
90  const int dim = ljTable->get_table_dim();
91 
92  // round dim up to odd multiple of 16
93  int tsize = (((dim+16+31)/32)*32)-16;
94  if ( tsize < dim ) NAMD_bug("ComputeNonbondedCUDA::build_lj_table bad tsize");
95 
96  float2 *t = new float2[tsize*tsize];
97  float2 *row = t;
98  for ( int i=0; i<dim; ++i, row += tsize ) {
99  for ( int j=0; j<dim; ++j ) {
100  const LJTable::TableEntry *e = ljTable->table_val(i,j);
101  row[j].x = e->A * scaling;
102  row[j].y = e->B * scaling;
103  }
104  }
105 
106  cuda_bind_lj_table(t,tsize);
107  delete [] t;
108 
109  if ( ! CmiPhysicalNodeID(CkMyPe()) ) {
110  CkPrintf("Info: Updated CUDA LJ table with %d x %d elements.\n", dim, dim);
111  }
112 }
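
The padding formula above yields the smallest odd multiple of 16 that is at least dim (16, 48, 80, ...), presumably to avoid power-of-two row strides in the float2 table. An illustrative self-check of that claim:

  // Verify: tsize is the smallest odd multiple of 16 >= dim.
  #include <cassert>

  void check_tsize_formula() {
    for ( int dim=1; dim<=1024; ++dim ) {
      int tsize = (((dim+16+31)/32)*32)-16;   // formula from build_lj_table
      assert( tsize >= dim );
      assert( tsize % 16 == 0 && (tsize/16) % 2 == 1 );  // odd multiple of 16
      assert( tsize - 32 < dim );                        // smallest such value
    }
  }
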
void ComputeNonbondedCUDA::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 1176 of file ComputeNonbondedCUDA.C.

References Lattice::a(), activePatches, atom_params, atom_params_size, atoms, atoms_size, atomsChanged, Lattice::b(), Compute::basePriority, ResizeArray< T >::begin(), block_order, block_order_size, ComputeNonbondedCUDA::patch_record::bornRad, bornRadH, bornRadH_size, Lattice::c(), PatchMap::center(), CompAtom::charge, COMPUTE_PROXY_PRIORITY, computeRecords, computesChanged, COULOMB, cuda_bind_patch_pairs(), cuda_check_local_progress(), cuda_errcheck(), cudaCheck, cudaCompute, ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, dEdaSumH_size, deviceCUDA, deviceID, ComputeNonbondedCUDA::patch_record::dHdrPrefix, dHdrPrefixH, dHdrPrefixH_size, ComputeNonbondedUtil::dielectric_1, doSlow, dummy_dev, dummy_size, energy_gbis, energy_gbis_size, exclusionsByAtom, finishWork(), ComputeNonbondedUtil::fixedAtomsOn, Patch::flags, force_ready_queue, force_ready_queue_len, force_ready_queue_size, forces, forces_size, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCudaAtomList(), DeviceCUDA::getGpuIsMine(), DeviceCUDA::getNoStreaming(), Patch::getNumAtoms(), hostedPatches, CompAtomExt::id, ComputeNonbondedCUDA::patch_record::intRad, intRad0H, intRad0H_size, intRadSH, intRadSH_size, SubmitReduction::item(), kernel_time, lata, latb, latc, Flags::lattice, lattice, localActivePatches, localComputeRecords, ComputeNonbondedCUDA::patch_record::localStart, master, Flags::maxAtomMovement, ComputeNonbondedUtil::mol, Node::molecule, NAMD_bug(), num_atoms, num_local_atoms, num_remote_atoms, num_virials, ComputeNonbondedCUDA::patch_record::numAtoms, ComputeNonbondedCUDA::patch_record::numFreeAtoms, Node::Object(), ComputeNonbondedCUDA::compute_record::offset, ComputeNonbondedCUDA::patch_record::p, pairlistsValid, Flags::pairlistTolerance, pairlistTolerance, Node::parameters, patch_pair_num_ptr, patch_pairs, patch_pairs_ptr, ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchPairsReordered, patchRecords, ComputeNonbondedCUDA::compute_record::pid, plcutoff2, CompAtom::position, PROXY_DATA_PRIORITY, PROXY_RESULTS_PRIORITY, psiSumH, psiSumH_size, recvYieldDevice(), reduction, REDUCTION_PAIRLIST_WARNINGS, remoteActivePatches, remoteComputeRecords, ResizeArray< T >::resize(), Flags::savePairlists, savePairlists, ComputeNonbondedUtil::scaling, Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), slow_forces, slow_forces_size, slow_virials, CompAtomExt::sortOrder, ResizeArray< T >::swap(), Lattice::unscale(), Flags::usePairlists, usePairlists, vdw_types, vdw_types_size, CompAtom::vdwType, virials, virials_size, WARPSIZE, workStarted, ComputeNonbondedCUDA::patch_record::x, Vector::x, CudaAtom::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

1176  {
1177 GBISP("C.N.CUDA[%d]::doWork: seq %d, phase %d, workStarted %d, atomsChanged %d\n", \
1178 CkMyPe(), sequence(), gbisPhase, workStarted, atomsChanged);
1179 
1180  // Set GPU device ID
1181  cudaCheck(cudaSetDevice(deviceID));
1182 
1184  ResizeArray<int> &patch_pair_num(*patch_pair_num_ptr);
1185 
1186  if ( workStarted ) { //if work already started, check if finished
1187  if ( finishWork() ) { // finished
1188  workStarted = 0;
1189  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
1190  } else { // need to call again
1191  workStarted = 2;
1192  basePriority = PROXY_RESULTS_PRIORITY; // lower for local
1193  if ( master == this && kernel_launch_state > 2 ) {
1194  cuda_check_local_progress(this,0.); // launches polling
1195  }
1196  }
1197  return;
1198  }
1199 
1200  workStarted = 1;
1202 
1204  Parameters *params = Node::Object()->parameters;
1206 
1207  //execute only during GBIS phase 1, or if not using GBIS
1208  if (!simParams->GBISOn || gbisPhase == 1) {
1209 
1210  //bind new patches to GPU
1211  if ( atomsChanged || computesChanged ) {
1212  int npatches = activePatches.size();
1213 
1214  pairlistsValid = 0;
1215  pairlistTolerance = 0.;
1216 
1217  if ( computesChanged ) {
1218  computesChanged = 0;
1219 
1220  if ( ! dummy_size ) {
1221  dummy_size = 1024*1024;
1222  cudaMalloc((void**)&dummy_dev,dummy_size);
1223  cuda_errcheck("in cudaMalloc dummy_dev");
1224  }
1225 
1226  bool did_realloc = reallocate_host<int>(&force_ready_queue, &force_ready_queue_size, npatches,
1227  1.2f, cudaHostAllocMapped);
1228  if (did_realloc) {
1229  for (int k=0; k < force_ready_queue_size; ++k)
1230  force_ready_queue[k] = -1;
1231  }
1232  force_ready_queue_len = npatches;
1233  reallocate_host<int>(&block_order, &block_order_size,
1234  2*(localComputeRecords.size()+remoteComputeRecords.size()),
1235  1.2f, cudaHostAllocMapped);
1236 
1237  num_virials = npatches;
1238  reallocate_host<float>(&virials, &virials_size, 2*16*num_virials,
1239  1.2f, cudaHostAllocMapped);
1240  slow_virials = virials + 16*num_virials;
1241 
1242  reallocate_host<float>(&energy_gbis, &energy_gbis_size, npatches,
1243  1.2f, cudaHostAllocMapped);
1244  for (int i = 0; i < energy_gbis_size; i++) energy_gbis[i] = 0.f;
1245 
1246  int *ap = activePatches.begin();
1247  for ( int i=0; i<localActivePatches.size(); ++i ) {
1248  *(ap++) = localActivePatches[i];
1249  }
1250  for ( int i=0; i<remoteActivePatches.size(); ++i ) {
1251  *(ap++) = remoteActivePatches[i];
1252  }
1253 
1254  // sort computes by distance between patches
1256  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),so);
1257  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),so);
1258 
1259  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1260 
1261  if ( streaming ) {
1262  // iout << "Reverse-priority sorting...\n" << endi;
1263  cr_sortop_reverse_priority sorp(so,patchRecords.begin());
1264  std::stable_sort(localComputeRecords.begin(),localComputeRecords.end(),sorp);
1265  std::stable_sort(remoteComputeRecords.begin(),remoteComputeRecords.end(),sorp);
1266  patchPairsReordered = 0;
1267  //patchPairsReordered = 1;
1268  // int len = remoteComputeRecords.size();
1269  // for ( int i=0; i<len; ++i ) {
1270  // iout << "reverse_order " << i << " " << remoteComputeRecords[i].pid[0] << "\n";
1271  // }
1272  // int len2 = localComputeRecords.size();
1273  // for ( int i=0; i<len2; ++i ) {
1274  // iout << "reverse_order " << (i+len) << " " << localComputeRecords[i].pid[0] << "\n";
1275  // }
1276  // iout << endi;
1277  } else {
1278  patchPairsReordered = 1;
1279  }
1280 
1281  int nlc = localComputeRecords.size();
1282  int nrc = remoteComputeRecords.size();
1283  computeRecords.resize(nlc+nrc);
1284  compute_record *cr = computeRecords.begin();
1285  for ( int i=0; i<nrc; ++i ) {
1286  *(cr++) = remoteComputeRecords[i];
1287  }
1288  for ( int i=0; i<nlc; ++i ) {
1289  *(cr++) = localComputeRecords[i];
1290  }
1291 
1292  // patch_pair_num[i] = number of patch pairs that involve patch i
1293  patch_pair_num.resize(npatches);
1294  for ( int i=0; i<npatches; ++i ) {
1295  patchRecords[activePatches[i]].localIndex = i;
1296  patch_pair_num[i] = 0;
1297  }
1298 
1299  int ncomputes = computeRecords.size();
1300  patch_pairs.resize(ncomputes);
1301  for ( int i=0; i<ncomputes; ++i ) {
1303  int lp1 = patchRecords[cr.pid[0]].localIndex;
1304  int lp2 = patchRecords[cr.pid[1]].localIndex;
1305  patch_pair_num[lp1]++;
1306  if (lp1 != lp2) patch_pair_num[lp2]++;
1307  patch_pair &pp = patch_pairs[i];
1308  pp.offset.x = cr.offset.x;
1309  pp.offset.y = cr.offset.y;
1310  pp.offset.z = cr.offset.z;
1311  }
1312 
1313  for ( int i=0; i<ncomputes; ++i ) {
1315  int lp1 = patchRecords[cr.pid[0]].localIndex;
1316  int lp2 = patchRecords[cr.pid[1]].localIndex;
1317  patch_pair &pp = patch_pairs[i];
1318  pp.patch1_ind = lp1;
1319  pp.patch2_ind = lp2;
1320  pp.patch1_num_pairs = patch_pair_num[lp1];
1321  pp.patch2_num_pairs = patch_pair_num[lp2];
1322  }
1323 
1324  if ( CmiPhysicalNodeID(CkMyPe()) < 2 ) {
1325  CkPrintf("Pe %d has %d local and %d remote patches and %d local and %d remote computes.\n",
1327  localComputeRecords.size(), remoteComputeRecords.size());
1328  }
1329  } // computesChanged
1330  else if ( ! patchPairsReordered ) {
1331  patchPairsReordered = 1;
1332  int len = patch_pairs.size();
1333  int nlc = localComputeRecords.size();
1334  int nrc = remoteComputeRecords.size();
1335  if ( len != nlc + nrc ) NAMD_bug("array size mismatch in ComputeNonbondedCUDA reordering");
1336  ResizeArray<ComputeNonbondedCUDA::compute_record> new_computeRecords(len);
1337  ResizeArray<patch_pair> new_patch_pairs(len);
1338  int irc=nrc;
1339  int ilc=len;
1340  for ( int i=0; i<len; ++i ) {
1341  int boi = block_order[i];
1342  int dest;
1343  if ( boi < nrc ) { dest = --irc; } else { dest = --ilc; }
1344  new_computeRecords[dest] = computeRecords[boi];
1345  new_patch_pairs[dest] = patch_pairs[boi];
1346  }
1347  if ( irc != 0 || ilc != nrc ) NAMD_bug("block index mismatch in ComputeNonbondedCUDA reordering");
1348  computeRecords.swap(new_computeRecords);
1349  patch_pairs.swap(new_patch_pairs);
1350  }
1351 
1352  int istart = 0;
1353  int i;
1354  for ( i=0; i<npatches; ++i ) {
1355  if ( i == localActivePatches.size() ) {
1356  num_local_atoms = istart;
1357  }
1358  patch_record &pr = patchRecords[activePatches[i]];
1359  pr.localStart = istart;
1360  int natoms = pr.p->getNumAtoms();
1361  int nfreeatoms = natoms;
1362  if ( fixedAtomsOn ) {
1363  const CompAtomExt *aExt = pr.xExt;
1364  for ( int j=0; j<natoms; ++j ) {
1365  if ( aExt[j].atomFixed ) --nfreeatoms;
1366  }
1367  }
1368  pr.numAtoms = natoms;
1369  pr.numFreeAtoms = nfreeatoms;
1370  istart += natoms;
1371  istart += 16 - (natoms & 15);
1372  }
1373  if ( i == localActivePatches.size() ) {
1374  num_local_atoms = istart;
1375  }
1376  num_atoms = istart;
1378  reallocate_host<atom_param>(&atom_params, &atom_params_size, num_atoms, 1.2f);
1379  reallocate_host<int>(&vdw_types, &vdw_types_size, num_atoms, 1.2f);
1380  reallocate_host<CudaAtom>(&atoms, &atoms_size, num_atoms, 1.2f);
1381  reallocate_host<float4>(&forces, &forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1382  reallocate_host<float4>(&slow_forces, &slow_forces_size, num_atoms, 1.2f, cudaHostAllocMapped);
1383  if (simParams->GBISOn) {
1384  reallocate_host<float>(&intRad0H, &intRad0H_size, num_atoms, 1.2f);
1385  reallocate_host<float>(&intRadSH, &intRadSH_size, num_atoms, 1.2f);
1386  reallocate_host<GBReal>(&psiSumH, &psiSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1387  reallocate_host<float>(&bornRadH, &bornRadH_size, num_atoms, 1.2f);
1388  reallocate_host<GBReal>(&dEdaSumH, &dEdaSumH_size, num_atoms, 1.2f, cudaHostAllocMapped);
1389  reallocate_host<float>(&dHdrPrefixH, &dHdrPrefixH_size, num_atoms, 1.2f);
1390  }
1391 
1392  int bfstart = 0;
1393  int exclmask_start = 0;
1394  int ncomputes = computeRecords.size();
1395  for ( int i=0; i<ncomputes; ++i ) {
1397  int p1 = cr.pid[0];
1398  int p2 = cr.pid[1];
1399  patch_pair &pp = patch_pairs[i];
1400  pp.patch1_start = patchRecords[p1].localStart;
1401  pp.patch1_size = patchRecords[p1].numAtoms;
1402  pp.patch1_free_size = patchRecords[p1].numFreeAtoms;
1403  pp.patch2_start = patchRecords[p2].localStart;
1404  pp.patch2_size = patchRecords[p2].numAtoms;
1405  pp.patch2_free_size = patchRecords[p2].numFreeAtoms;
1406  pp.plist_start = bfstart;
1407  // size1*size2 = number of patch pairs
1408  int size1 = (pp.patch1_size-1)/WARPSIZE+1;
1409  int size2 = (pp.patch2_size-1)/WARPSIZE+1;
1410  pp.plist_size = (size1*size2-1)/32+1;
1411  bfstart += pp.plist_size;
1412  pp.exclmask_start = exclmask_start;
1413  exclmask_start += size1*size2;
1414  } //for ncomputes
1415 
1417  activePatches.size(), num_atoms, bfstart,
1418  exclmask_start);
1419 
1420  } // atomsChanged || computesChanged
1421 
1422  double charge_scaling = sqrt(COULOMB * scaling * dielectric_1);
1423 
1424  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1425  float maxAtomMovement = 0.;
1426  float maxPatchTolerance = 0.;
1427 
1428  for ( int i=0; i<activePatches.size(); ++i ) {
1429  patch_record &pr = patchRecords[activePatches[i]];
1430 
1431  float maxMove = pr.p->flags.maxAtomMovement;
1432  if ( maxMove > maxAtomMovement ) maxAtomMovement = maxMove;
1433 
1434  float maxTol = pr.p->flags.pairlistTolerance;
1435  if ( maxTol > maxPatchTolerance ) maxPatchTolerance = maxTol;
1436 
1437  int start = pr.localStart;
1438  int n = pr.numAtoms;
1439  const CompAtom *a = pr.x;
1440  const CompAtomExt *aExt = pr.xExt;
1441  if ( atomsChanged ) {
1442 
1443  atom_param *ap = atom_params + start;
1444  for ( int k=0; k<n; ++k ) {
1445  int j = aExt[k].sortOrder;
1446  ap[k].vdw_type = a[j].vdwType;
1447  vdw_types[start + k] = a[j].vdwType;
1448  ap[k].index = aExt[j].id;
1449 #ifdef MEM_OPT_VERSION
1450  ap[k].excl_index = exclusionsByAtom[aExt[j].exclId].y;
1451  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].exclId].x;
1452 #else // ! MEM_OPT_VERSION
1453  ap[k].excl_index = exclusionsByAtom[aExt[j].id].y;
1454  ap[k].excl_maxdiff = exclusionsByAtom[aExt[j].id].x;
1455 #endif // MEM_OPT_VERSION
1456  }
1457  }
1458  {
1459 #if 1
1460  const CudaAtom *ac = pr.p->getCudaAtomList();
1461  CudaAtom *ap = atoms + start;
1462  memcpy(ap, ac, sizeof(atom)*n);
1463 #else
1464  Vector center =
1465  pr.p->flags.lattice.unscale(cudaCompute->patchMap->center(pr.patchID));
1466  atom *ap = atoms + start;
1467  for ( int k=0; k<n; ++k ) {
1468  int j = aExt[k].sortOrder;
1469  ap[k].position.x = a[j].position.x - center.x;
1470  ap[k].position.y = a[j].position.y - center.y;
1471  ap[k].position.z = a[j].position.z - center.z;
1472  ap[k].charge = charge_scaling * a[j].charge;
1473  }
1474 #endif
1475  }
1476  }
1477 
1478  savePairlists = 0;
1479  usePairlists = 0;
1480  if ( flags.savePairlists ) {
1481  savePairlists = 1;
1482  usePairlists = 1;
1483  } else if ( flags.usePairlists ) {
1484  if ( ! pairlistsValid ||
1485  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1487  } else {
1488  usePairlists = 1;
1489  }
1490  }
1491  if ( ! usePairlists ) {
1492  pairlistsValid = 0;
1493  }
1494  float plcutoff = cutoff;
1495  if ( savePairlists ) {
1496  pairlistsValid = 1;
1497  pairlistTolerance = 2. * maxPatchTolerance;
1498  plcutoff += pairlistTolerance;
1499  }
1500  plcutoff2 = plcutoff * plcutoff;
1501 
1502  //CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1503  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1504 
1505 #if 0
1506  // calculate warp divergence
1507  if ( 1 ) {
1508  Flags &flags = patchRecords[hostedPatches[0]].p->flags;
1509  Lattice &lattice = flags.lattice;
1510  float3 lata, latb, latc;
1511  lata.x = lattice.a().x;
1512  lata.y = lattice.a().y;
1513  lata.z = lattice.a().z;
1514  latb.x = lattice.b().x;
1515  latb.y = lattice.b().y;
1516  latb.z = lattice.b().z;
1517  latc.x = lattice.c().x;
1518  latc.y = lattice.c().y;
1519  latc.z = lattice.c().z;
1520 
1521  int ncomputes = computeRecords.size();
1522  for ( int ic=0; ic<ncomputes; ++ic ) {
1523  patch_pair &pp = patch_pairs[ic];
1524  atom *a1 = atoms + pp.patch1_atom_start;
1525  int n1 = pp.patch1_size;
1526  atom *a2 = atoms + pp.patch2_atom_start;
1527  int n2 = pp.patch2_size;
1528  float offx = pp.offset.x * lata.x
1529  + pp.offset.y * latb.x
1530  + pp.offset.z * latc.x;
1531  float offy = pp.offset.x * lata.y
1532  + pp.offset.y * latb.y
1533  + pp.offset.z * latc.y;
1534  float offz = pp.offset.x * lata.z
1535  + pp.offset.y * latb.z
1536  + pp.offset.z * latc.z;
1537  // CkPrintf("%f %f %f\n", offx, offy, offz);
1538  int atoms_tried = 0;
1539  int blocks_tried = 0;
1540  int atoms_used = 0;
1541  int blocks_used = 0;
1542  for ( int ii=0; ii<n1; ii+=32 ) { // warps
1543  for ( int jj=0; jj<n2; jj+=16 ) { // shared atom loads
1544  int block_used = 0;
1545  for ( int j=jj; j<jj+16 && j<n2; ++j ) { // shared atoms
1546  int atom_used = 0;
1547  for ( int i=ii; i<ii+32 && i<n1; ++i ) { // threads
1548  float dx = offx + a1[i].position.x - a2[j].position.x;
1549  float dy = offy + a1[i].position.y - a2[j].position.y;
1550  float dz = offz + a1[i].position.z - a2[j].position.z;
1551  float r2 = dx*dx + dy*dy + dz*dz;
1552  if ( r2 < cutoff2 ) atom_used = 1;
1553  }
1554  ++atoms_tried;
1555  if ( atom_used ) { block_used = 1; ++atoms_used; }
1556  }
1557  ++blocks_tried;
1558  if ( block_used ) { ++blocks_used; }
1559  }
1560  }
1561  CkPrintf("blocks = %d/%d (%f) atoms = %d/%d (%f)\n",
1562  blocks_used, blocks_tried, blocks_used/(float)blocks_tried,
1563  atoms_used, atoms_tried, atoms_used/(float)atoms_tried);
1564  }
1565  }
1566 #endif
1567 
1568  } // !GBISOn || gbisPhase == 1
1569 
1570  //Do GBIS
1571  if (simParams->GBISOn) {
1572  //open GBIS boxes depending on phase
1573  for ( int i=0; i<activePatches.size(); ++i ) {
1574  patch_record &pr = master->patchRecords[activePatches[i]];
1575  GBISP("doWork[%d] accessing arrays for P%d\n",CkMyPe(),gbisPhase);
1576  if (gbisPhase == 1) {
1577  //Copy GBIS intRadius to Host
1578  if (atomsChanged) {
1579  float *intRad0 = intRad0H + pr.localStart;
1580  float *intRadS = intRadSH + pr.localStart;
1581  for ( int k=0; k<pr.numAtoms; ++k ) {
1582  int j = pr.xExt[k].sortOrder;
1583  intRad0[k] = pr.intRad[2*j+0];
1584  intRadS[k] = pr.intRad[2*j+1];
1585  }
1586  }
1587  } else if (gbisPhase == 2) {
1588  float *bornRad = bornRadH + pr.localStart;
1589  for ( int k=0; k<pr.numAtoms; ++k ) {
1590  int j = pr.xExt[k].sortOrder;
1591  bornRad[k] = pr.bornRad[j];
1592  }
1593  } else if (gbisPhase == 3) {
1594  float *dHdrPrefix = dHdrPrefixH + pr.localStart;
1595  for ( int k=0; k<pr.numAtoms; ++k ) {
1596  int j = pr.xExt[k].sortOrder;
1597  dHdrPrefix[k] = pr.dHdrPrefix[j];
1598  }
1599  } // end phases
1600  } // end for patches
1601  } // if GBISOn
1602 
1603  kernel_time = CkWallTimer();
1604  kernel_launch_state = 1;
1605  //if ( gpu_is_mine || ! doSlow ) recvYieldDevice(-1);
1606  if ( deviceCUDA->getGpuIsMine() || ! doSlow ) recvYieldDevice(-1);
1607 }
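
The patch-pair sizing near the end of the listing repeatedly uses the (n-1)/d+1 ceiling-division idiom: patches are tiled into WARPSIZE-atom blocks, and each pair of tiles gets one pairlist bit (packed 32 per word) plus one exclusion mask. A sketch with hypothetical helper names making the units explicit:

  inline int ceil_div(int n, int d) { return (n - 1) / d + 1; }

  // Matches pp.plist_size above: one bit per tile pair, 32 bits per word.
  inline int plist_words(int patch1_size, int patch2_size, int warpsize) {
    int tiles1 = ceil_div(patch1_size, warpsize);
    int tiles2 = ceil_div(patch2_size, warpsize);
    return ceil_div(tiles1 * tiles2, 32);
  }
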
void ComputeNonbondedCUDA::finishPatch ( int  flindex)
virtual

Reimplemented from Compute.

Definition at line 1868 of file ComputeNonbondedCUDA.C.

References activePatches, Box< Owner, Data >::close(), ComputeNonbondedCUDA::patch_record::forceBox, master, Box< Owner, Data >::open(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::r, and ComputeNonbondedCUDA::patch_record::x.

Referenced by finishWork().

1868  {
1869  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishPatch \n",CkMyPe());
1870  patch_record &pr = master->patchRecords[master->activePatches[flindex]];
1871  pr.r = pr.forceBox->open();
1872  finishPatch(pr);
1873  pr.positionBox->close(&(pr.x));
1874  pr.forceBox->close(&(pr.r));
1875 }
void ComputeNonbondedCUDA::finishReductions ( )

Definition at line 2032 of file ComputeNonbondedCUDA.C.

References ADD_TENSOR_OBJECT, Compute::basePriority, block_order, computeRecords, cuda_errcheck(), cuda_timer_count, cuda_timer_total, doEnergy, doSlow, end_local_download, end_remote_download, endi(), energy_gbis, SimParameters::GBISOn, iout, SubmitReduction::item(), localComputeRecords, num_virials, Node::Object(), SimParameters::outputCudaTiming, patchPairsReordered, patchRecords, PROXY_DATA_PRIORITY, reduction, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, remoteComputeRecords, Node::simParameters, simParams, slow_virials, start_calc, step, SubmitReduction::submit(), virials, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by finishWork().

2032  {
2033  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishReductions \n",CkMyPe());
2034 
2035  basePriority = PROXY_DATA_PRIORITY; // higher to aid overlap
2036 
2038 
2039  if ( 0 && patchPairsReordered && patchPairsReordered < 3 ) {
2040  if ( patchPairsReordered++ == 2) {
2041  int patch_len = patchRecords.size();
2042  ResizeArray<int> plast(patch_len);
2043  for ( int i=0; i<patch_len; ++i ) {
2044  plast[i] = -1;
2045  }
2046  int order_len = localComputeRecords.size()+remoteComputeRecords.size();
2047  for ( int i=0; i<order_len; ++i ) {
2048  plast[computeRecords[block_order[i]].pid[0]] = i;
2049  iout << "block_order " << i << " " << block_order[i] << " " << computeRecords[block_order[i]].pid[0] << "\n";
2050  }
2051  iout << endi;
2052  for ( int i=0; i<patch_len; ++i ) {
2053  iout << "patch_last " << i << " " << plast[i] << "\n";
2054  }
2055  iout << endi;
2056  }
2057  }
2058 
2059  {
2060  Tensor virial_tensor;
2061  BigReal energyv = 0.;
2062  BigReal energye = 0.;
2063  BigReal energys = 0.;
2064  int nexcluded = 0;
2065  for ( int i = 0; i < num_virials; ++i ) {
2066  virial_tensor.xx += virials[16*i];
2067  virial_tensor.xy += virials[16*i+1];
2068  virial_tensor.xz += virials[16*i+2];
2069  virial_tensor.yx += virials[16*i+3];
2070  virial_tensor.yy += virials[16*i+4];
2071  virial_tensor.yz += virials[16*i+5];
2072  virial_tensor.zx += virials[16*i+6];
2073  virial_tensor.zy += virials[16*i+7];
2074  virial_tensor.zz += virials[16*i+8];
2075  energyv += virials[16*i+9];
2076  energye += virials[16*i+10];
2077  energys += virials[16*i+11];
2078  nexcluded += ((int *)virials)[16*i+12];
2079  if (simParams->GBISOn) {
2080  energye += energy_gbis[i];
2081  }
2082  }
2084  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_NBOND,virial_tensor);
2085  if ( doEnergy ) {
2086  reduction->item(REDUCTION_LJ_ENERGY) += energyv;
2087  reduction->item(REDUCTION_ELECT_ENERGY) += energye;
2089  }
2090  }
2091  if ( doSlow ) {
2092  Tensor virial_slow_tensor;
2093  for ( int i = 0; i < num_virials; ++i ) {
2094  virial_slow_tensor.xx += slow_virials[16*i];
2095  virial_slow_tensor.xy += slow_virials[16*i+1];
2096  virial_slow_tensor.xz += slow_virials[16*i+2];
2097  virial_slow_tensor.yx += slow_virials[16*i+3];
2098  virial_slow_tensor.yy += slow_virials[16*i+4];
2099  virial_slow_tensor.yz += slow_virials[16*i+5];
2100  virial_slow_tensor.zx += slow_virials[16*i+6];
2101  virial_slow_tensor.zy += slow_virials[16*i+7];
2102  virial_slow_tensor.zz += slow_virials[16*i+8];
2103  }
2104  ADD_TENSOR_OBJECT(reduction,REDUCTION_VIRIAL_SLOW,virial_slow_tensor);
2105  }
2106 
2107  reduction->submit();
2108 
2109  cuda_timer_count++;
2110  if ( simParams->outputCudaTiming &&
2111  step % simParams->outputCudaTiming == 0 ) {
2112 
2113  // int natoms = mol->numAtoms;
2114  // double wpa = wcount; wpa /= natoms;
2115 
2116  // CkPrintf("Pe %d CUDA kernel %f ms, total %f ms, wpa %f\n", CkMyPe(),
2117  // kernel_time * 1.e3, time * 1.e3, wpa);
2118 
2119 #if 0
2120  float upload_ms, remote_calc_ms;
2121  float local_calc_ms, total_ms;
2122  cuda_errcheck("before event timers");
2123  cudaEventElapsedTime(&upload_ms, start_upload, start_calc);
2124  cuda_errcheck("in event timer 1");
2125  cudaEventElapsedTime(&remote_calc_ms, start_calc, end_remote_download);
2126  cuda_errcheck("in event timer 2");
2127  cudaEventElapsedTime(&local_calc_ms, end_remote_download, end_local_download);
2128  cuda_errcheck("in event timer 3");
2129  cudaEventElapsedTime(&total_ms, start_upload, end_local_download);
2130  cuda_errcheck("in event timer 4");
2131  cuda_errcheck("in event timers");
2132 
2133  CkPrintf("CUDA EVENT TIMING: %d %f %f %f %f\n",
2134  CkMyPe(), upload_ms, remote_calc_ms,
2135  local_calc_ms, total_ms);
2136 #endif
2137 
2138  if ( cuda_timer_count >= simParams->outputCudaTiming ) {
2140  CkPrintf("CUDA TIMING: %d %f ms/step on node %d\n",
2141  step, cuda_timer_total * 1.e3, CkMyPe());
2142  }
2143  cuda_timer_count = 0;
2144  cuda_timer_total = 0;
2145  }
2146 
2147 }
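
The reduction loops above read the host-mapped virials array in strides of 16 floats per patch. An illustrative struct (field names invented here, not from the source) showing that layout:

  // One 16-float record per patch, as indexed by virials[16*i + k] above.
  struct VirialRecord {
    float vxx, vxy, vxz;   // +0 .. +2   virial tensor rows
    float vyx, vyy, vyz;   // +3 .. +5
    float vzx, vzy, vzz;   // +6 .. +8
    float energy_vdw;      // +9   accumulated into REDUCTION_LJ_ENERGY
    float energy_elect;    // +10  accumulated into REDUCTION_ELECT_ENERGY
    float energy_slow;     // +11
    int   excluded;        // +12  read via the (int *)virials cast above
    float pad[3];          // +13 .. +15 unused
  };
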
int ComputeNonbondedCUDA::finishWork ( )

Definition at line 1906 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, Box< Owner, Data >::close(), computeMgr, cuda_timer_total, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, dEdaSumH, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, doSlow, finishPatch(), finishReductions(), ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, SubmitReduction::item(), kernel_time, localHostedPatches, ComputeNonbondedCUDA::patch_record::localStart, master, ComputeNonbondedUtil::mol, Node::molecule, ComputeNonbondedCUDA::patch_record::numAtoms, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::patchID, patchRecords, CompAtom::position, ComputeNonbondedCUDA::patch_record::positionBox, Compute::priority(), ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, psiSumH, ComputeNonbondedCUDA::patch_record::r, reduction, remoteHostedPatches, ComputeMgr::sendNonbondedCUDASlaveEnqueue(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), slavePes, slaves, CompAtomExt::sortOrder, workStarted, ComputeNonbondedCUDA::patch_record::x, Vector::x, ComputeNonbondedCUDA::patch_record::xExt, Vector::y, and Vector::z.

Referenced by doWork().

1906  {
1907  //fprintf(stderr, "%d ComputeNonbondedCUDA::finishWork() \n",CkMyPe());
1908 
1909  if ( master == this ) {
1910  for ( int i = 0; i < numSlaves; ++i ) {
1911  computeMgr->sendNonbondedCUDASlaveEnqueue(slaves[i],slavePes[i],sequence(),priority(),workStarted);
1912  }
1913  }
1914 
1915 GBISP("C.N.CUDA[%d]::fnWork: workStarted %d, phase %d\n", \
1916 CkMyPe(), workStarted, gbisPhase)
1917 
1918  Molecule *mol = Node::Object()->molecule;
1919  SimParameters *simParams = Node::Object()->simParameters;
1920 
1921  ResizeArray<int> &patches( workStarted == 1 ?
1922  remoteHostedPatches : localHostedPatches );
1923 
1924  // long long int wcount = 0;
1925  // double virial = 0.;
1926  // double virial_slow = 0.;
1927  for ( int i=0; i<patches.size(); ++i ) {
1928  // CkPrintf("Pe %d patch %d of %d pid %d\n",CkMyPe(),i,patches.size(),patches[i]);
1929  patch_record &pr = master->patchRecords[patches[i]];
1930 
1931  if ( !simParams->GBISOn || gbisPhase == 1 ) {
1932  patch_record &pr = master->patchRecords[patches[i]];
1933 GBISP("GBIS[%d] fnWork() P0[%d] force.open()\n",CkMyPe(), pr.patchID);
1934  pr.r = pr.forceBox->open();
1935  } // !GBISOn || gbisPhase==1
1936 
1937  int start = pr.localStart;
1938  const CompAtomExt *aExt = pr.xExt;
1939  if ( !simParams->GBISOn || gbisPhase == 3 ) {
1940  finishPatch(pr);
1941  } // !GBISOn || gbisPhase == 3
1942 
1943 #if 0
1944  if ( i % 31 == 0 ) for ( int j=0; j<3; ++j ) {
1945  CkPrintf("Pe %d patch %d atom %d (%f %f %f) force %f\n", CkMyPe(), i,
1946  j, pr.x[j].position.x, pr.x[j].position.y, pr.x[j].position.z,
1947  af[j].w);
1948  }
1949 #endif
1950 
1951  //Close Boxes depending on Phase
1952  if (simParams->GBISOn) {
1953  if (gbisPhase == 1) {
1954  //Copy psiSum from Host to Patch Box
1955  GBReal *psiSumMaster = master->psiSumH + start;
1956  for ( int k=0; k<pr.numAtoms; ++k ) {
1957  int j = aExt[k].sortOrder;
1958  pr.psiSum[j] += psiSumMaster[k];
1959  }
1960 GBISP("C.N.CUDA[%d]::fnWork: P1 psiSum.close()\n", CkMyPe());
1961  pr.psiSumBox->close(&(pr.psiSum));
1962 
1963  } else if (gbisPhase == 2) {
1964  //Copy dEdaSum from Host to Patch Box
1965  GBReal *dEdaSumMaster = master->dEdaSumH + start;
1966  for ( int k=0; k<pr.numAtoms; ++k ) {
1967  int j = aExt[k].sortOrder;
1968  pr.dEdaSum[j] += dEdaSumMaster[k];
1969  }
1970 GBISP("C.N.CUDA[%d]::fnWork: P2 dEdaSum.close()\n", CkMyPe());
1971  pr.dEdaSumBox->close(&(pr.dEdaSum));
1972 
1973  } else if (gbisPhase == 3) {
1974 GBISP("C.N.CUDA[%d]::fnWork: P3 all.close()\n", CkMyPe());
1975  pr.intRadBox->close(&(pr.intRad)); //box 6
1976  pr.bornRadBox->close(&(pr.bornRad)); //box 7
1977  pr.dHdrPrefixBox->close(&(pr.dHdrPrefix)); //box 9
1978  pr.positionBox->close(&(pr.x)); //box 0
1979  pr.forceBox->close(&(pr.r));
1980  } //end phases
1981  } else { //not GBIS
1982 GBISP("C.N.CUDA[%d]::fnWork: pos/force.close()\n", CkMyPe());
1983  pr.positionBox->close(&(pr.x));
1984  pr.forceBox->close(&(pr.r));
1985  }
1986  }// end for
1987 
1988 #if 0
1989  virial *= (-1./6.);
1990  reduction->item(REDUCTION_VIRIAL_NBOND_XX) += virial;
1991  reduction->item(REDUCTION_VIRIAL_NBOND_YY) += virial;
1992  reduction->item(REDUCTION_VIRIAL_NBOND_ZZ) += virial;
1993  if ( doSlow ) {
1994  virial_slow *= (-1./6.);
1995  reduction->item(REDUCTION_VIRIAL_SLOW_XX) += virial_slow;
1996  reduction->item(REDUCTION_VIRIAL_SLOW_YY) += virial_slow;
1997  reduction->item(REDUCTION_VIRIAL_SLOW_ZZ) += virial_slow;
1998  }
1999 #endif
2000 
2001  if ( workStarted == 1 && ! deviceCUDA->getMergeGrids() &&
2002  ( localHostedPatches.size() || master == this ) ) {
2003  GBISP("not finished, call again\n");
2004  return 0; // not finished, call again
2005  }
2006 
2007  if ( master != this ) { // finished
2008  GBISP("finished\n");
2009  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2010  atomsChanged = 0;
2011  return 1;
2012  }
2013 
2014  cuda_timer_total += kernel_time;
2015 
2016  if ( !simParams->GBISOn || gbisPhase == 3 ) {
2017 
2018  atomsChanged = 0;
2019  finishReductions();
2020 
2021  } // !GBISOn || gbisPhase==3
2022 
2023  // Next GBIS Phase
2024 GBISP("C.N.CUDA[%d]::fnWork: incrementing phase\n", CkMyPe())
2025  if (simParams->GBISOn) gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
2026 
2027  GBISP("C.N.CUDA[%d] finished ready for next step\n",CkMyPe());
2028  return 1; // finished and ready for next step
2029 }
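
The phase update gbisPhase = 1 + (gbisPhase % 3), which appears twice above, cycles 1 -> 2 -> 3 -> 1; each phase closes a different set of boxes (psiSum after phase 1, dEdaSum after phase 2, and the remaining boxes after phase 3). A self-contained sketch of just the cycle, to make the wrap-around explicit:

#include <cstdio>

int main() {
  int gbisPhase = 1;
  for (int call = 0; call < 6; ++call) {
    std::printf("finishWork call %d runs GBIS phase %d\n", call, gbisPhase);
    gbisPhase = 1 + (gbisPhase % 3);  // same update as in finishWork(): 1->2->3->1
  }
  return 0;
}
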
void ComputeNonbondedCUDA::messageFinishPatch ( int  flindex)

Definition at line 1861 of file ComputeNonbondedCUDA.C.

References activePatches, computeMgr, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::msg, patchRecords, PROXY_DATA_PRIORITY, ComputeMgr::sendNonbondedCUDASlaveEnqueuePatch(), Compute::sequence(), and ComputeNonbondedCUDA::patch_record::slave.

1861  {
1862  int pid = activePatches[flindex];
1863  patch_record &pr = patchRecords[pid];
1864  //fprintf(stderr, "%d ComputeNonbondedCUDA::messageFinishPatch %d\n",CkMyPe(),pr.hostPe);
1865  computeMgr->sendNonbondedCUDASlaveEnqueuePatch(pr.slave,pr.hostPe,sequence(),PROXY_DATA_PRIORITY,flindex,pr.msg);
1866 }
int ComputeNonbondedCUDA::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 1112 of file ComputeNonbondedCUDA.C.

References atomsChanged, ComputeNonbondedCUDA::patch_record::bornRad, ComputeNonbondedCUDA::patch_record::bornRadBox, computeMgr, ComputeNonbondedCUDA::patch_record::dEdaSum, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, Flags::doEnergy, doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, doSlow, SimParameters::GBISOn, GBISP, Compute::gbisPhase, Patch::getCompAtomExtInfo(), hostedPatches, ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::intRadBox, Flags::lattice, lattice, master, masterPe, numSlaves, Node::Object(), Box< Owner, Data >::open(), ComputeNonbondedCUDA::patch_record::p, ComputeNonbondedCUDA::patch_record::patchID, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::psiSumBox, reduction, ComputeMgr::sendNonbondedCUDASlaveReady(), ComputeMgr::sendNonbondedCUDASlaveSkip(), Compute::sequence(), Node::simParameters, simParams, ResizeArray< T >::size(), skip(), slavePes, slaves, Flags::step, step, SubmitReduction::submit(), workStarted, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

1112  {
1113 
1114  SimParameters *simParams = Node::Object()->simParameters;
1115  Flags &flags = master->patchRecords[hostedPatches[0]].p->flags;
1116  lattice = flags.lattice;
1117  doSlow = flags.doFullElectrostatics;
1118  doEnergy = flags.doEnergy;
1119  step = flags.step;
1120 
1121  if ( ! flags.doNonbonded ) {
1122  GBISP("GBIS[%d] noWork() don't do nonbonded\n",CkMyPe());
1123  if ( master != this ) {
1124  computeMgr->sendNonbondedCUDASlaveReady(masterPe,
1125  hostedPatches.size(),atomsChanged,sequence());
1126  } else {
1127  for ( int i = 0; i < numSlaves; ++i ) {
1128  computeMgr->sendNonbondedCUDASlaveSkip(slaves[i],slavePes[i]);
1129  }
1130  skip();
1131  }
1132  if ( reduction ) {
1133  reduction->submit();
1134  }
1135 
1136  return 1;
1137  }
1138 
1139  for ( int i=0; i<hostedPatches.size(); ++i ) {
1140  patch_record &pr = master->patchRecords[hostedPatches[i]];
1141  if (!simParams->GBISOn || gbisPhase == 1) {
1142  GBISP("GBIS[%d] noWork() P0[%d] open()\n",CkMyPe(), pr.patchID);
1143  pr.x = pr.positionBox->open();
1144  pr.xExt = pr.p->getCompAtomExtInfo();
1145  }
1146 
1147  if (simParams->GBISOn) {
1148  if (gbisPhase == 1) {
1149  GBISP("GBIS[%d] noWork() P1[%d] open()\n",CkMyPe(),pr.patchID);
1150  pr.intRad = pr.intRadBox->open();
1151  pr.psiSum = pr.psiSumBox->open();
1152  } else if (gbisPhase == 2) {
1153  GBISP("GBIS[%d] noWork() P2[%d] open()\n",CkMyPe(),pr.patchID);
1154  pr.bornRad = pr.bornRadBox->open();
1155  pr.dEdaSum = pr.dEdaSumBox->open();
1156  } else if (gbisPhase == 3) {
1157  GBISP("GBIS[%d] noWork() P3[%d] open()\n",CkMyPe(),pr.patchID);
1158  pr.dHdrPrefix = pr.dHdrPrefixBox->open();
1159  }
1160  GBISP("opened GBIS boxes");
1161  }
1162  }
1163 
1164  if ( master == this ) return 0; //work to do, enqueue as usual
1165 
1166  // message masterPe
1167  computeMgr->sendNonbondedCUDASlaveReady(masterPe,
1168  hostedPatches.size(),atomsChanged,sequence());
1169  atomsChanged = 0;
1170 
1171  workStarted = 1;
1172 
1173  return 1;
1174 }
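
noWork() returns 0 only on the master, which tells the Charm++ scheduler to enqueue doWork() there; slaves open their boxes, notify the master PE, and return 1 so nothing is enqueued locally. A hedged sketch of that return-value contract (isMaster and doNonbonded stand in for the master pointer test and the Flags field above):

// Illustrative only: condenses the control flow of noWork() above.
int exampleNoWork(bool isMaster, bool doNonbonded) {
  if (!doNonbonded) return 1;  // skip() path: boxes skipped, reduction submitted
  if (isMaster)     return 0;  // 0 => scheduler enqueues doWork() on this PE
  // slave: boxes opened, master PE notified via sendNonbondedCUDASlaveReady()
  return 1;                    // 1 => nothing further enqueued on this PE
}
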
void ComputeNonbondedCUDA::recvYieldDevice ( int  pe)

Definition at line 1637 of file ComputeNonbondedCUDA.C.

References Lattice::a(), SimParameters::alpha_cutoff, atom_params, atoms, atomsChanged, Lattice::b(), block_order, bornRadH, Lattice::c(), CcdCallBacksReset(), SimParameters::coulomb_radius_offset, cuda_bind_atom_params(), cuda_bind_atoms(), cuda_bind_forces(), cuda_bind_GBIS_bornRad(), cuda_bind_GBIS_dEdaSum(), cuda_bind_GBIS_dHdrPrefix(), cuda_bind_GBIS_energy(), cuda_bind_GBIS_intRad(), cuda_bind_GBIS_psiSum(), cuda_bind_vdw_types(), cuda_bind_virials(), cuda_check_local_calc(), cuda_check_local_progress(), cuda_check_progress(), cuda_check_remote_calc(), cuda_check_remote_progress(), cuda_GBIS_P1(), cuda_GBIS_P2(), cuda_GBIS_P3(), cuda_nonbonded_forces(), CUDA_POLL, SimParameters::cutoff, ComputeNonbondedUtil::cutoff2, dEdaSumH, deviceCUDA, deviceID, dHdrPrefixH, SimParameters::dielectric, doEnergy, doSlow, dummy_dev, dummy_size, end_local_download, end_remote_download, energy_gbis, force_ready_queue, force_ready_queue_next, forces, SimParameters::fsMax, SimParameters::GBISOn, GBISP, Compute::gbisPhase, DeviceCUDA::getMergeGrids(), DeviceCUDA::getNoStreaming(), DeviceCUDA::getSharedGpu(), intRad0H, intRadSH, SimParameters::kappa, kernel_launch_state, lata, latb, latc, lattice, localActivePatches, localComputeRecords, SimParameters::nonbondedScaling, Node::Object(), patchPairsReordered, plcutoff2, SimParameters::PMEOffload, SimParameters::PMEOn, psiSumH, remote_submit_time, remoteActivePatches, remoteComputeRecords, savePairlists, Compute::sequence(), DeviceCUDA::setGpuIsMine(), Node::simParameters, simParams, ResizeArray< T >::size(), slow_forces, SimParameters::solvent_dielectric, start_calc, stream, stream2, SimParameters::switchingActive, SimParameters::switchingDist, usePairlists, SimParameters::usePMECUDA, vdw_types, virials, workStarted, Vector::x, Vector::y, and Vector::z.

Referenced by doWork(), and ComputeMgr::recvYieldDevice().

1637  {
1638 GBISP("C.N.CUDA[%d]::recvYieldDevice: seq %d, workStarted %d, \
1639 gbisPhase %d, kls %d, from pe %d\n", CkMyPe(), sequence(), \
1640 gbisPhase, kernel_launch_state, pe)
1641 
1642  float3 lata, latb, latc;
1643  lata.x = lattice.a().x;
1644  lata.y = lattice.a().y;
1645  lata.z = lattice.a().z;
1646  latb.x = lattice.b().x;
1647  latb.y = lattice.b().y;
1648  latb.z = lattice.b().z;
1649  latc.x = lattice.c().x;
1650  latc.y = lattice.c().y;
1651  latc.z = lattice.c().z;
1652  SimParameters *simParams = Node::Object()->simParameters;
1653 
1654  // Set GPU device ID
1655  cudaSetDevice(deviceID);
1656 
1657  const bool streaming = ! (deviceCUDA->getNoStreaming() || simParams->GBISOn);
1658 
1659  double walltime;
1660  if ( kernel_launch_state == 1 || kernel_launch_state == 2 ) {
1661  walltime = CkWallTimer();
1662  CcdCallBacksReset(0,walltime); // fix Charm++
1663  }
1664 
1665  switch ( kernel_launch_state ) {
1667 // Remote
1668  case 1:
1669 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: case 1\n", CkMyPe())
1671  //gpu_is_mine = 0;
1672  deviceCUDA->setGpuIsMine(0);
1673  remote_submit_time = walltime;
1674 
1675  if (!simParams->GBISOn || gbisPhase == 1) {
1676  // cudaEventRecord(start_upload, stream);
1677  if ( atomsChanged ) {
1678  cuda_bind_atom_params(atom_params);
1679  cuda_bind_vdw_types(vdw_types);
1680  }
1681  if ( simParams->GBISOn) {
1682  cuda_bind_GBIS_psiSum(psiSumH);
1683  if ( atomsChanged ) {
1684  cuda_bind_GBIS_intRad(intRad0H, intRadSH);
1685  }
1686  }
1687  atomsChanged = 0;
1688  cuda_bind_atoms((const atom *)atoms);
1689  cuda_bind_forces(forces, slow_forces);
1690  cuda_bind_virials(virials, force_ready_queue, block_order);
1691  if ( simParams->GBISOn) {
1692  cuda_bind_GBIS_energy(energy_gbis);
1693  }
1694  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1695  //call CUDA Kernels
1696 
1697  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1698  0,remoteComputeRecords.size(),
1701  if (simParams->GBISOn) {
1702  cuda_GBIS_P1(
1703  0,remoteComputeRecords.size(),
1705  simParams->alpha_cutoff-simParams->fsMax,
1706  simParams->coulomb_radius_offset,
1707  lata, latb, latc, stream);
1708  }
1709  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1710  // num_local_atom_records,num_remote_atom_records);
1711  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1712  cudaEventRecord(end_remote_download, stream);
1713  }
1714  if ( streaming ) {
1715  force_ready_queue_next = 0;
1716  CUDA_POLL(cuda_check_progress,this);
1717  } else {
1718  CUDA_POLL(cuda_check_remote_progress,this);
1719  }
1720  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1721  CUDA_POLL(cuda_check_remote_calc,this);
1722  break;
1723  }
1724  } // !GBIS or gbisPhase==1
1725  if (simParams->GBISOn) {
1726  if (gbisPhase == 1) {
1727  //GBIS P1 Kernel launched in previous code block
1728  } else if (gbisPhase == 2) {
1729 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P2>>>\n", CkMyPe())
1730  // cudaEventRecord(start_upload, stream);
1731  cuda_bind_GBIS_bornRad(bornRadH);
1732  cuda_bind_GBIS_dEdaSum(dEdaSumH);
1733  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1734  cuda_GBIS_P2(
1735  0,remoteComputeRecords.size(),
1737  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1738  simParams->nonbondedScaling, simParams->kappa,
1739  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1740  simParams->dielectric, simParams->solvent_dielectric,
1741  lata, latb, latc,
1742  doEnergy, doSlow, stream
1743  );
1744  cudaEventRecord(end_remote_download, stream);
1745  CUDA_POLL(cuda_check_remote_progress,this);
1746  } else if (gbisPhase == 3) {
1747 GBISP("C.N.CUDA[%d]::recvYieldDeviceR: <<<P3>>>\n", CkMyPe())
1748  // cudaEventRecord(start_upload, stream);
1749  cuda_bind_GBIS_dHdrPrefix(dHdrPrefixH);
1750  if ( stream2 != stream ) cudaEventRecord(start_calc, stream);
1751  if (doSlow)
1752  cuda_GBIS_P3(
1753  0,remoteComputeRecords.size(),
1755  (simParams->alpha_cutoff-simParams->fsMax),
1756  simParams->coulomb_radius_offset,
1757  simParams->nonbondedScaling,
1758  lata, latb, latc, stream
1759  );
1760  cudaEventRecord(end_remote_download, stream);
1761  CUDA_POLL(cuda_check_remote_progress,this);
1762  }
1763  }
1764 
1766 // Local
1767  case 2:
1768 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: case 2\n", CkMyPe())
1770  //gpu_is_mine = 0;
1771  deviceCUDA->setGpuIsMine(0);
1772 
1773  if ( stream2 != stream ) {
1774  // needed to ensure that upload is finished
1775  cudaStreamWaitEvent(stream2, start_calc, 0);
1776  // priorities do not prevent local from launching ahead
1777  // of remote, so delay local stream with a small memset
1778  cudaMemsetAsync(dummy_dev, 0, dummy_size, stream2);
1779  }
1780 
1781  if (!simParams->GBISOn || gbisPhase == 1) {
1782 
1783  cuda_nonbonded_forces(lata, latb, latc, cutoff2, plcutoff2,
1787  if (simParams->GBISOn) {
1788  cuda_GBIS_P1(
1791  simParams->alpha_cutoff-simParams->fsMax,
1792  simParams->coulomb_radius_offset,
1793  lata, latb, latc, stream2 );
1794  }
1795  //cuda_load_forces(forces, (doSlow ? slow_forces : 0 ),
1796  // 0,num_local_atom_records);
1797  //cuda_load_virials(virials, doSlow); // slow_virials follows virials
1798  if ( ( ! streaming ) || ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) ) {
1799  cudaEventRecord(end_local_download, stream2);
1800  }
1801  if ( ! streaming && workStarted == 2 ) {
1802  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1803 cuda_check_local_progress\n", CkMyPe())
1804  CUDA_POLL(cuda_check_local_progress,this);
1805  }
1806  if ( deviceCUDA->getSharedGpu() && ! deviceCUDA->getMergeGrids() ) {
1807  GBISP("C.N.CUDA[%d]::recvYieldDeviceL: adding POLL \
1808 cuda_check_local_calc\n", CkMyPe())
1809  CUDA_POLL(cuda_check_local_calc,this);
1810  break;
1811  }
1812 
1813  } // !GBIS or gbisPhase==1
1814  if (simParams->GBISOn) {
1815  if (gbisPhase == 1) {
1816  //GBIS P1 Kernel launched in previous code block
1817  } else if (gbisPhase == 2) {
1818 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P2>>>\n", CkMyPe())
1819  cuda_GBIS_P2(
1821  0,localActivePatches.size(),
1822  (simParams->alpha_cutoff-simParams->fsMax), simParams->cutoff,
1823  simParams->nonbondedScaling, simParams->kappa,
1824  (simParams->switchingActive ? simParams->switchingDist : -1.0),
1825  simParams->dielectric, simParams->solvent_dielectric,
1826  lata, latb, latc,
1827  doEnergy, doSlow, stream2
1828  );
1829  cudaEventRecord(end_local_download, stream2);
1830  if ( workStarted == 2 ) {
1831  CUDA_POLL(cuda_check_local_progress,this);
1832  }
1833  } else if (gbisPhase == 3) {
1834 GBISP("C.N.CUDA[%d]::recvYieldDeviceL: calling <<<P3>>>\n", CkMyPe())
1835  if (doSlow)
1836  cuda_GBIS_P3(
1838  0,localActivePatches.size(),
1839  (simParams->alpha_cutoff-simParams->fsMax),
1840  simParams->coulomb_radius_offset,
1841  simParams->nonbondedScaling,
1842  lata, latb, latc, stream2
1843  );
1844  cudaEventRecord(end_local_download, stream2);
1845  if ( workStarted == 2 ) {
1846  CUDA_POLL(cuda_check_local_progress,this);
1847  }
1848  } // phases
1849  } // GBISOn
1850  if ( simParams->PMEOn && simParams->PMEOffload && !simParams->usePMECUDA) break;
1851 
1852  default:
1853 GBISP("C.N.CUDA[%d]::recvYieldDevice: case default\n", CkMyPe())
1854  //gpu_is_mine = 1;
1855  deviceCUDA->setGpuIsMine(1);
1856  break;
1857  } // switch
1858 GBISP("C.N.CUDA[%d]::recvYieldDevice: DONE\n", CkMyPe())
1859 }
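
In case 2 above, the local stream is ordered after the remote upload with cudaStreamWaitEvent(), and a small cudaMemsetAsync() keeps it from launching ahead of the remote kernels, since stream priorities alone do not prevent that. A self-contained sketch of the ordering trick (dummy_dev and dummy_size mirror the listing; everything else is illustrative, not NAMD code):

#include <cuda_runtime.h>

int main() {
  cudaStream_t remote, local;   // stand-ins for NAMD's stream / stream2
  cudaEvent_t uploaded;         // stand-in for NAMD's start_calc event
  cudaStreamCreate(&remote);
  cudaStreamCreate(&local);
  cudaEventCreateWithFlags(&uploaded, cudaEventDisableTiming);

  float *dummy_dev;
  const size_t dummy_size = 4096;
  cudaMalloc(&dummy_dev, dummy_size);

  // ... upload atoms and launch remote kernels on `remote`, then:
  cudaEventRecord(uploaded, remote);

  // The local stream must not start before the upload is finished...
  cudaStreamWaitEvent(local, uploaded, 0);
  // ...and a tiny memset delays it so the remote kernels win the race for SMs.
  cudaMemsetAsync(dummy_dev, 0, dummy_size, local);
  // ... launch local kernels on `local` here ...

  cudaDeviceSynchronize();
  cudaFree(dummy_dev);
  cudaStreamDestroy(remote);
  cudaStreamDestroy(local);
  return 0;
}
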
void ComputeNonbondedCUDA::registerPatches ( )

Definition at line 615 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), ResizeArray< T >::begin(), ComputeNonbondedCUDA::patch_record::bornRadBox, ProxyMgr::createProxy(), ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::hostPe, ComputeNonbondedCUDA::patch_record::intRadBox, ComputeNonbondedCUDA::patch_record::isLocal, localHostedPatches, master, masterPe, ComputeNonbondedCUDA::patch_record::msg, Node::Object(), ProxyMgr::Object(), ComputeNonbondedCUDA::patch_record::p, PatchMap::patch(), patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, PRIORITY_SIZE, ComputeNonbondedCUDA::patch_record::psiSumBox, Patch::registerBornRadPickup(), Patch::registerDEdaSumDeposit(), Patch::registerDHdrPrefixPickup(), Patch::registerForceDeposit(), Patch::registerIntRadPickup(), Patch::registerPositionPickup(), Patch::registerPsiSumDeposit(), remoteHostedPatches, Compute::setNumPatches(), Node::simParameters, simParams, ResizeArray< T >::size(), and ComputeNonbondedCUDA::patch_record::slave.

Referenced by assignPatches(), and ComputeNonbondedCUDA().

615  {
616 
617  SimParameters *simParams = Node::Object()->simParameters;
618  int npatches = master->activePatches.size();
619  int *pids = master->activePatches.begin();
620  patch_record *recs = master->patchRecords.begin();
621  for ( int i=0; i<npatches; ++i ) {
622  int pid = pids[i];
623  patch_record &pr = recs[pid];
624  if ( pr.hostPe == CkMyPe() ) {
625  pr.slave = this;
626  pr.msg = new (PRIORITY_SIZE) FinishWorkMsg;
627  hostedPatches.add(pid);
628  if ( pr.isLocal ) {
629  localHostedPatches.add(pid);
630  } else {
631  remoteHostedPatches.add(pid);
632  }
633  ProxyMgr::Object()->createProxy(pid);
634  pr.p = patchMap->patch(pid);
635  pr.positionBox = pr.p->registerPositionPickup(this);
636  pr.forceBox = pr.p->registerForceDeposit(this);
637  if (simParams->GBISOn) {
638  pr.intRadBox = pr.p->registerIntRadPickup(this);
639  pr.psiSumBox = pr.p->registerPsiSumDeposit(this);
640  pr.bornRadBox = pr.p->registerBornRadPickup(this);
641  pr.dEdaSumBox = pr.p->registerDEdaSumDeposit(this);
642  pr.dHdrPrefixBox = pr.p->registerDHdrPrefixPickup(this);
643  }
644  }
645  }
646  if ( master == this ) setNumPatches(activePatches.size());
648  if ( CmiPhysicalNodeID(CkMyPe()) < 2 )
649  CkPrintf("Pe %d hosts %d local and %d remote patches for pe %d\n", CkMyPe(), localHostedPatches.size(), remoteHostedPatches.size(), masterPe);
650 }
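
registerPatches() wires each hosted patch to this compute through pickup/deposit boxes; the data then follow a fixed open/close/skip protocol across noWork(), finishWork(), and skip(). A toy sketch of that handshake (NAMD's Box<Owner,Data> is considerably more elaborate; this only illustrates the protocol):

// Illustrative stand-in for NAMD's Box<Owner,Data> handshake.
template <class Data>
class ToyBox {
  Data *data_ = nullptr;
public:
  void deposit(Data *d) { data_ = d; }      // owner (the Patch) publishes data
  Data *open()          { return data_; }   // consumer borrows it (noWork)
  void close(Data **d)  { *d = nullptr; }   // consumer returns it (finishWork)
  void skip()           {}                  // consumer sits this step out (skip)
};
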
void ComputeNonbondedCUDA::requirePatch ( int  pid)

Definition at line 579 of file ComputeNonbondedCUDA.C.

References activePatches, ResizeArray< T >::add(), ComputeNonbondedCUDA::patch_record::bornRad, computesChanged, ComputeNonbondedCUDA::patch_record::dEdaSum, deviceCUDA, ComputeNonbondedCUDA::patch_record::dHdrPrefix, ComputeNonbondedCUDA::patch_record::f, DeviceCUDA::getMergeGrids(), ComputeNonbondedCUDA::patch_record::hostPe, PatchMap::index_a(), PatchMap::index_b(), PatchMap::index_c(), ComputeNonbondedCUDA::patch_record::intRad, ComputeNonbondedCUDA::patch_record::isLocal, ComputeNonbondedCUDA::patch_record::isSameNode, ComputeNonbondedCUDA::patch_record::isSamePhysicalNode, localActivePatches, PatchMap::node(), ComputeNonbondedCUDA::patch_record::patchID, patchMap, patchRecords, ComputeNonbondedCUDA::patch_record::psiSum, ComputeNonbondedCUDA::patch_record::r, ComputeNonbondedCUDA::patch_record::refCount, remoteActivePatches, ComputeNonbondedCUDA::patch_record::x, and ComputeNonbondedCUDA::patch_record::xExt.

Referenced by register_cuda_compute_pair(), and register_cuda_compute_self().

579  {
580 
581  computesChanged = 1;
582  patch_record &pr = patchRecords[pid];
583  if ( pr.refCount == 0 ) {
584  pr.isSamePhysicalNode = ( CmiPhysicalNodeID(patchMap->node(pid)) == CmiPhysicalNodeID(CkMyPe()) );
585  pr.isSameNode = ( CkNodeOf(patchMap->node(pid)) == CkMyNode() );
586  if ( deviceCUDA->getMergeGrids() ) {
587  pr.isLocal = 0;
588  } else if ( CkNumNodes() < 2 ) {
589  pr.isLocal = 1 & ( 1 ^ patchMap->index_a(pid) ^
590  patchMap->index_b(pid) ^ patchMap->index_c(pid) );
591  } else {
592  pr.isLocal = pr.isSameNode;
593  }
594  if ( pr.isLocal ) {
595  localActivePatches.add(pid);
596  } else {
597  remoteActivePatches.add(pid);
598  }
599  activePatches.add(pid);
600  pr.patchID = pid;
601  pr.hostPe = -1;
602  pr.x = NULL;
603  pr.xExt = NULL;
604  pr.r = NULL;
605  pr.f = NULL;
606  pr.intRad = NULL;
607  pr.psiSum = NULL;
608  pr.bornRad = NULL;
609  pr.dEdaSum = NULL;
610  pr.dHdrPrefix = NULL;
611  }
612  pr.refCount += 1;
613 }
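
On a single node, requirePatch() classifies a patch as local or remote from the parity of its (a, b, c) grid indices, giving a 3-D checkerboard that splits the patches roughly in half between the local and remote kernel passes. The expression is easy to verify in isolation:

#include <cassert>

// Same parity test as lines 589-590 above.
static int isLocalPatch(int ia, int ib, int ic) {
  return 1 & (1 ^ ia ^ ib ^ ic);
}

int main() {
  assert(isLocalPatch(0, 0, 0) == 1);  // even index parity -> local
  assert(isLocalPatch(1, 0, 0) == 0);  // odd index parity  -> remote
  assert(isLocalPatch(1, 1, 0) == 1);  // neighbors alternate like a checkerboard
  return 0;
}
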
void ComputeNonbondedCUDA::skip ( void  )

Definition at line 1095 of file ComputeNonbondedCUDA.C.

References ComputeNonbondedCUDA::patch_record::bornRadBox, ComputeNonbondedCUDA::patch_record::dEdaSumBox, ComputeNonbondedCUDA::patch_record::dHdrPrefixBox, ComputeNonbondedCUDA::patch_record::forceBox, SimParameters::GBISOn, hostedPatches, ComputeNonbondedCUDA::patch_record::intRadBox, master, Node::Object(), patchRecords, ComputeNonbondedCUDA::patch_record::positionBox, ComputeNonbondedCUDA::patch_record::psiSumBox, Node::simParameters, simParams, ResizeArray< T >::size(), and Box< Owner, Data >::skip().

Referenced by noWork(), and ComputeMgr::recvNonbondedCUDASlaveSkip().

1095  {
1096  //fprintf(stderr, "ComputeNonbondedCUDA::skip()\n");
1097  SimParameters *simParams = Node::Object()->simParameters;
1098  for ( int i=0; i<hostedPatches.size(); ++i ) {
1099  patch_record &pr = master->patchRecords[hostedPatches[i]];
1100  pr.positionBox->skip();
1101  pr.forceBox->skip();
1102  if (simParams->GBISOn) {
1103  pr.intRadBox->skip();
1104  pr.psiSumBox->skip();
1105  pr.bornRadBox->skip();
1106  pr.dEdaSumBox->skip();
1107  pr.dHdrPrefixBox->skip();
1108  }
1109  }
1110 }

Member Data Documentation

ResizeArray<int> ComputeNonbondedCUDA::activePatches
AtomMap* ComputeNonbondedCUDA::atomMap

Definition at line 124 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

CudaAtom* ComputeNonbondedCUDA::atoms

Definition at line 147 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::atoms_size

Definition at line 146 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::atomsChanged
ResizeArray<compute_record> ComputeNonbondedCUDA::computeRecords

Definition at line 103 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), and finishReductions().

int ComputeNonbondedCUDA::computesChanged

Definition at line 137 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and requirePatch().

GBReal* ComputeNonbondedCUDA::dEdaSumH

Definition at line 116 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::dEdaSumH_size

Definition at line 115 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::deviceID

Definition at line 121 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doEnergy

Definition at line 85 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), noWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::doSlow

Definition at line 85 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), finishReductions(), finishWork(), noWork(), and recvYieldDevice().

float4* ComputeNonbondedCUDA::forces

Definition at line 107 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::forces_size

Definition at line 106 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

ResizeArray<int> ComputeNonbondedCUDA::hostedPatches

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), registerPatches(), and skip().

Lattice ComputeNonbondedCUDA::lattice

Definition at line 84 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), noWork(), and recvYieldDevice().

ResizeArray<int> ComputeNonbondedCUDA::localActivePatches

Definition at line 100 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::localComputeRecords
ResizeArray<int> ComputeNonbondedCUDA::localHostedPatches
LocalWorkMsg* ComputeNonbondedCUDA::localWorkMsg2
ComputeNonbondedCUDA* ComputeNonbondedCUDA::master
int ComputeNonbondedCUDA::masterPe

Definition at line 130 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), noWork(), and registerPatches().

int ComputeNonbondedCUDA::numSlaves

Definition at line 134 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), finishWork(), and noWork().

int ComputeNonbondedCUDA::pairlistsValid

Definition at line 140 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

float ComputeNonbondedCUDA::pairlistTolerance

Definition at line 141 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

PatchMap* ComputeNonbondedCUDA::patchMap
int ComputeNonbondedCUDA::patchPairsReordered
ResizeArray<patch_record> ComputeNonbondedCUDA::patchRecords
float ComputeNonbondedCUDA::plcutoff2

Definition at line 144 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

GBReal* ComputeNonbondedCUDA::psiSumH

Definition at line 113 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), finishWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::psiSumH_size

Definition at line 112 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

SubmitReduction* ComputeNonbondedCUDA::reduction
ResizeArray<int> ComputeNonbondedCUDA::remoteActivePatches

Definition at line 100 of file ComputeNonbondedCUDA.h.

Referenced by doWork(), recvYieldDevice(), and requirePatch().

ResizeArray<compute_record> ComputeNonbondedCUDA::remoteComputeRecords
ResizeArray<int> ComputeNonbondedCUDA::remoteHostedPatches

Definition at line 101 of file ComputeNonbondedCUDA.h.

Referenced by finishWork(), and registerPatches().

int ComputeNonbondedCUDA::savePairlists

Definition at line 143 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slaveIndex

Definition at line 131 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA().

int* ComputeNonbondedCUDA::slavePes

Definition at line 133 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

ComputeNonbondedCUDA** ComputeNonbondedCUDA::slaves

Definition at line 132 of file ComputeNonbondedCUDA.h.

Referenced by assignPatches(), ComputeNonbondedCUDA(), finishWork(), and noWork().

float4* ComputeNonbondedCUDA::slow_forces

Definition at line 110 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::slow_forces_size

Definition at line 109 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), and doWork().

int ComputeNonbondedCUDA::step

Definition at line 86 of file ComputeNonbondedCUDA.h.

Referenced by finishReductions(), and noWork().

int ComputeNonbondedCUDA::usePairlists

Definition at line 142 of file ComputeNonbondedCUDA.h.

Referenced by ComputeNonbondedCUDA(), doWork(), and recvYieldDevice().

int ComputeNonbondedCUDA::workStarted

The documentation for this class was generated from the following files:

ComputeNonbondedCUDA.h
ComputeNonbondedCUDA.C