NAMD
CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded: inherits from Compute and ComputeNonbondedUtil.

Classes

struct  ComputeRecord
 
struct  PatchRecord
 

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbonded ()
 
void registerComputeSelf (ComputeID cid, PatchID pid)
 
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
 
void assignPatches (ComputeMgr *computeMgrIn)
 
virtual void initialize ()
 
virtual void atomUpdate ()
 
virtual int noWork ()
 
virtual void doWork ()
 
void launchWork ()
 
void finishReductions ()
 
void unregisterBoxesOnPe ()
 
void assignPatchesOnPe ()
 
void openBoxesOnPe ()
 
void skipPatchesOnPe ()
 
void finishPatchesOnPe ()
 
void finishPatchOnPe (int i)
 
void finishPatches ()
 
void messageEnqueueWork ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 
void reSortTileLists ()
 
void updatePatchOrder (const std::vector< CudaLocalRecord > &data)
 
std::vector< PatchRecord > & getPatches ()
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
- Public Member Functions inherited from ComputeNonbondedUtil
 ComputeNonbondedUtil ()
 
 ~ComputeNonbondedUtil ()
 
void calcGBIS (nonbonded *params, GBISParamStruct *gbisParams)
 

Static Public Member Functions

static CudaNBConstants getNonbondedCoef (SimParameters *params)
 
static bool getDoTable (SimParameters *params, const bool doSlow, const bool doVirial)
 
- Static Public Member Functions inherited from ComputeNonbondedUtil
static void select (void)
 
static void submitReductionData (BigReal *, SubmitReduction *)
 
static void submitPressureProfileData (BigReal *, SubmitReduction *)
 
static BigReal square (const BigReal &x, const BigReal &y, const BigReal &z)
 
static void calc_error (nonbonded *)
 
static void calc_pair (nonbonded *)
 
static void calc_pair_energy (nonbonded *)
 
static void calc_pair_fullelect (nonbonded *)
 
static void calc_pair_energy_fullelect (nonbonded *)
 
static void calc_pair_merge_fullelect (nonbonded *)
 
static void calc_pair_energy_merge_fullelect (nonbonded *)
 
static void calc_pair_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_slow_fullelect (nonbonded *)
 
static void calc_self (nonbonded *)
 
static void calc_self_energy (nonbonded *)
 
static void calc_self_fullelect (nonbonded *)
 
static void calc_self_energy_fullelect (nonbonded *)
 
static void calc_self_merge_fullelect (nonbonded *)
 
static void calc_self_energy_merge_fullelect (nonbonded *)
 
static void calc_self_slow_fullelect (nonbonded *)
 
static void calc_self_energy_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_fep (nonbonded *)
 
static void calc_pair_energy_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_self_energy_fep (nonbonded *)
 
static void calc_self_energy_fullelect_fep (nonbonded *)
 
static void calc_self_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_self_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_ti (nonbonded *)
 
static void calc_pair_ti (nonbonded *)
 
static void calc_pair_energy_fullelect_ti (nonbonded *)
 
static void calc_pair_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_slow_fullelect_ti (nonbonded *)
 
static void calc_self_energy_ti (nonbonded *)
 
static void calc_self_ti (nonbonded *)
 
static void calc_self_energy_fullelect_ti (nonbonded *)
 
static void calc_self_fullelect_ti (nonbonded *)
 
static void calc_self_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_self_merge_fullelect_ti (nonbonded *)
 
static void calc_self_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_self_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_les (nonbonded *)
 
static void calc_pair_energy_les (nonbonded *)
 
static void calc_pair_fullelect_les (nonbonded *)
 
static void calc_pair_energy_fullelect_les (nonbonded *)
 
static void calc_pair_merge_fullelect_les (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_les (nonbonded *)
 
static void calc_pair_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_les (nonbonded *)
 
static void calc_self_les (nonbonded *)
 
static void calc_self_energy_les (nonbonded *)
 
static void calc_self_fullelect_les (nonbonded *)
 
static void calc_self_energy_fullelect_les (nonbonded *)
 
static void calc_self_merge_fullelect_les (nonbonded *)
 
static void calc_self_energy_merge_fullelect_les (nonbonded *)
 
static void calc_self_slow_fullelect_les (nonbonded *)
 
static void calc_self_energy_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_int (nonbonded *)
 
static void calc_pair_energy_fullelect_int (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_int (nonbonded *)
 
static void calc_self_energy_int (nonbonded *)
 
static void calc_self_energy_fullelect_int (nonbonded *)
 
static void calc_self_energy_merge_fullelect_int (nonbonded *)
 
static void calc_pair_pprof (nonbonded *)
 
static void calc_pair_energy_pprof (nonbonded *)
 
static void calc_pair_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_fullelect_pprof (nonbonded *)
 
static void calc_pair_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_pprof (nonbonded *)
 
static void calc_self_energy_pprof (nonbonded *)
 
static void calc_self_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_fullelect_pprof (nonbonded *)
 
static void calc_self_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_tabener (nonbonded *)
 
static void calc_pair_energy_tabener (nonbonded *)
 
static void calc_pair_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_fullelect_tabener (nonbonded *)
 
static void calc_pair_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_tabener (nonbonded *)
 
static void calc_self_energy_tabener (nonbonded *)
 
static void calc_self_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_fullelect_tabener (nonbonded *)
 
static void calc_self_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_go (nonbonded *)
 
static void calc_pair_energy_go (nonbonded *)
 
static void calc_pair_fullelect_go (nonbonded *)
 
static void calc_pair_energy_fullelect_go (nonbonded *)
 
static void calc_pair_merge_fullelect_go (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_go (nonbonded *)
 
static void calc_pair_slow_fullelect_go (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_go (nonbonded *)
 
static void calc_self_go (nonbonded *)
 
static void calc_self_energy_go (nonbonded *)
 
static void calc_self_fullelect_go (nonbonded *)
 
static void calc_self_energy_fullelect_go (nonbonded *)
 
static void calc_self_merge_fullelect_go (nonbonded *)
 
static void calc_self_energy_merge_fullelect_go (nonbonded *)
 
static void calc_self_slow_fullelect_go (nonbonded *)
 
static void calc_self_energy_slow_fullelect_go (nonbonded *)
 

Additional Inherited Members

- Public Types inherited from ComputeNonbondedUtil
enum  {
  exclChecksumIndex, pairlistWarningIndex, electEnergyIndex, fullElectEnergyIndex,
  vdwEnergyIndex, goNativeEnergyIndex, goNonnativeEnergyIndex, groLJEnergyIndex,
  groGaussEnergyIndex, electEnergyIndex_s, fullElectEnergyIndex_s, vdwEnergyIndex_s,
  electEnergyIndex_ti_1, fullElectEnergyIndex_ti_1, vdwEnergyIndex_ti_1, electEnergyIndex_ti_2,
  fullElectEnergyIndex_ti_2, vdwEnergyIndex_ti_2, TENSOR(virialIndex), TENSOR(fullElectVirialIndex),
  VECTOR(pairVDWForceIndex), VECTOR(pairElectForceIndex), reductionDataSize
}
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 
- Static Public Attributes inherited from ComputeNonbondedUtil
static void(* calcPair )(nonbonded *)
 
static void(* calcPairEnergy )(nonbonded *)
 
static void(* calcSelf )(nonbonded *)
 
static void(* calcSelfEnergy )(nonbonded *)
 
static void(* calcFullPair )(nonbonded *)
 
static void(* calcFullPairEnergy )(nonbonded *)
 
static void(* calcFullSelf )(nonbonded *)
 
static void(* calcFullSelfEnergy )(nonbonded *)
 
static void(* calcMergePair )(nonbonded *)
 
static void(* calcMergePairEnergy )(nonbonded *)
 
static void(* calcMergeSelf )(nonbonded *)
 
static void(* calcMergeSelfEnergy )(nonbonded *)
 
static void(* calcSlowPair )(nonbonded *)
 
static void(* calcSlowPairEnergy )(nonbonded *)
 
static void(* calcSlowSelf )(nonbonded *)
 
static void(* calcSlowSelfEnergy )(nonbonded *)
 
static Bool commOnly
 
static Bool fixedAtomsOn
 
static Bool qmForcesOn
 
static BigReal cutoff
 
static BigReal cutoff2
 
static float cutoff2_f
 
static BigReal dielectric_1
 
static const LJTable * ljTable = 0
 
static const Molecule * mol
 
static BigReal r2_delta
 
static BigReal r2_delta_1
 
static int rowsize
 
static int columnsize
 
static int r2_delta_exp
 
static BigReal * table_alloc = 0
 
static BigReal * table_ener = 0
 
static BigReal * table_short
 
static BigReal * table_noshort
 
static BigReal * fast_table
 
static BigReal * scor_table
 
static BigReal * slow_table
 
static BigReal * corr_table
 
static BigReal * full_table
 
static BigReal * vdwa_table
 
static BigReal * vdwb_table
 
static BigReal * r2_table
 
static int table_length
 
static BigReal scaling
 
static BigReal scale14
 
static BigReal switchOn
 
static BigReal switchOn_1
 
static BigReal switchOn2
 
static BigReal v_vdwa
 
static BigReal v_vdwb
 
static BigReal k_vdwa
 
static BigReal k_vdwb
 
static BigReal cutoff_3
 
static BigReal cutoff_6
 
static float v_vdwa_f
 
static float v_vdwb_f
 
static float k_vdwa_f
 
static float k_vdwb_f
 
static float cutoff_3_f
 
static float cutoff_6_f
 
static float switchOn_f
 
static float A6_f
 
static float B6_f
 
static float C6_f
 
static float A12_f
 
static float B12_f
 
static float C12_f
 
static BigReal c0
 
static BigReal c1
 
static BigReal c3
 
static BigReal c5
 
static BigReal c6
 
static BigReal c7
 
static BigReal c8
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchWCAOn
 
static BigReal alchVdwShiftCoeff
 
static Bool vdwForceSwitching
 
static Bool alchDecouple
 
static Bool lesOn
 
static int lesFactor
 
static BigReal lesScaling
 
static BigReal * lambda_table = 0
 
static Bool pairInteractionOn
 
static Bool pairInteractionSelf
 
static Bool pressureProfileOn
 
static int pressureProfileSlabs
 
static int pressureProfileAtomTypes
 
static BigReal pressureProfileThickness
 
static BigReal pressureProfileMin
 
static Bool accelMDOn
 
static Bool drudeNbthole
 
static BigReal ewaldcof
 
static BigReal pi_ewaldcof
 
static int vdw_switch_mode
 
static Bool goGroPair
 
static Bool goForcesOn
 
static int goMethod
 
- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 31 of file CudaComputeNonbonded.h.

Constructor & Destructor Documentation

◆ CudaComputeNonbonded()

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 41 of file CudaComputeNonbonded.C.

References cudaCheck, Compute::gbisPhase, NAMD_die(), Node::Object(), Node::simParameters, and simParams.

42  :
43 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
44 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
45 
46  cudaCheck(cudaSetDevice(deviceID));
47 
48  exclusionsByAtom = NULL;
49 
50  vdwTypes = NULL;
51  vdwTypesSize = 0;
52 
53  exclIndexMaxDiff = NULL;
54  exclIndexMaxDiffSize = 0;
55 
56  atomIndex = NULL;
57  atomIndexSize = 0;
58 
59  atomStorageSize = 0;
60 
61  // Atom and charge storage
62  atoms = NULL;
63  atomsSize = 0;
64  part = NULL;
65  partSize = 0;
66  doAlch = false;
67  lambdaWindowUpdated = false;
68 
69  // Force storage
70  h_forces = NULL;
71  h_forcesSize = 0;
72  h_forcesSlow = NULL;
73  h_forcesSlowSize = 0;
74 
75  d_forces = NULL;
76  d_forcesSize = 0;
77  d_forcesSlow = NULL;
78  d_forcesSlowSize = 0;
79 
80  // GBIS
81  intRad0H = NULL;
82  intRad0HSize = 0;
83  intRadSH = NULL;
84  intRadSHSize = 0;
85  psiSumH = NULL;
86  psiSumHSize = 0;
87  bornRadH = NULL;
88  bornRadHSize = 0;
89  dEdaSumH = NULL;
90  dEdaSumHSize = 0;
91  dHdrPrefixH = NULL;
92  dHdrPrefixHSize = 0;
93  maxShmemPerBlock = 0;
94  cudaPatches = NULL;
95 
96  atomsChangedIn = true;
97  atomsChanged = true;
98  computesChanged = true;
99 
100  forceDoneEventRecord = false;
101 
 102  SimParameters *simParams = Node::Object()->simParameters;
 103  if (simParams->pressureProfileOn) {
104  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
105  }
106 
107  if (simParams->GBISOn) gbisPhase = 3;
108 
109  doSkip = false;
110 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
void NAMD_die(const char *err_msg)
Definition: common.C:147
int gbisPhase
Definition: Compute.h:39
#define simParams
Definition: Output.C:129
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
Compute(ComputeID)
Definition: Compute.C:37
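
Every CUDA runtime call in the constructor is wrapped in cudaCheck (CudaUtils.h:233). The macro body is not shown on this page; a minimal sketch of such an error-check wrapper, under that assumption, would look like this:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Sketch only: the real macro lives in CudaUtils.h and may differ in detail.
#define cudaCheck(stmt) do {                                   \
    cudaError_t err = (stmt);                                  \
    if (err != cudaSuccess) {                                  \
      fprintf(stderr, "CUDA error %s at %s:%d\n",              \
              cudaGetErrorString(err), __FILE__, __LINE__);    \
      exit(1);                                                 \
    }                                                          \
  } while (0)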

◆ ~CudaComputeNonbonded()

CudaComputeNonbonded::~CudaComputeNonbonded ( )

Definition at line 115 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

115  {
116  // fprintf(stderr, "Pe %d calling destructor ", CkMyPe());
117  cudaCheck(cudaSetDevice(deviceID));
118  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
119  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
120  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
121  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
122  if (part != NULL) deallocate_host<char>(&part);
123  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
124  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
125  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
126  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
127 
128  // GBIS
129  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
130  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
131  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
132  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
133  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
134  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
135 
136  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
137 
138  if (patches.size() > 0) {
139  deallocate_host<VirialEnergy>(&h_virialEnergy);
140  deallocate_device<VirialEnergy>(&d_virialEnergy);
141  cudaCheck(cudaStreamDestroy(stream));
142  cudaCheck(cudaEventDestroy(forceDoneEvent));
143  CmiDestroyLock(lock);
144  delete reduction;
145  }
146 
147  // NOTE: unregistering happens in [sync] -entry method
148  // fprintf(stderr, "unregistering patches on pe %d\n", CkMyPe());
149  computeMgr->sendUnregisterBoxesOnPe(pes, this);
150 
151 }
void sendUnregisterBoxesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1781
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

Member Function Documentation

◆ assignPatches()

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn)

Definition at line 384 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), NAMD_bug(), PatchMap::Object(), ComputePmeCUDAMgr::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), and Compute::setNumPatches().

Referenced by ComputeMgr::createComputes().

384  {
385  // Remove duplicate patches
386  std::sort(patches.begin(), patches.end());
387  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
388  patches.erase(last, patches.end());
389  // Set number of patches
390  setNumPatches(patches.size());
391  masterPe = CkMyPe();
392  computeMgr = computeMgrIn;
393  // Start patch counter
394  patchesCounter = getNumPatches();
395  // Patch ID map
396  std::map<PatchID, int> pidMap;
397 #if 1
398  //-------------------------------------------------------
399  // Copied in from ComputeNonbondedCUDA::assignPatches()
400  //-------------------------------------------------------
401 
402  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
403  int numPesOnNodeSharingDevice = 0;
404  int masterIndex = -1;
405  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
406  int pe = deviceCUDA->getPesSharingDevice(i);
407  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
408  if ( CkNodeOf(pe) == CkMyNode() ) {
409  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
410  }
411  }
412 
413  std::vector<int> count(patches.size(), 0);
414  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
415  std::vector<int> rankpcount(CkMyNodeSize(), 0);
416  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
417 
418  PatchMap* patchMap = PatchMap::Object();
419 
420  int unassignedpatches = patches.size();
421 
422  for (int i=0;i < patches.size(); ++i) {
423  patches[i].pe = -1;
424  }
425 
426  // assign if home pe and build table of natural proxies
427  for (int i=0;i < patches.size(); ++i) {
428  int pid = patches[i].patchID;
429  // homePe = PE where the patch currently resides
430  int homePe = patchMap->node(pid);
431  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
432  int pe = pesOnNodeSharingDevice[j];
433  // If homePe is sharing this device, assign this patch to homePe
434  if ( pe == homePe ) {
435  patches[i].pe = pe;
436  --unassignedpatches;
437  pcount[j] += 1;
438  }
439  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
440  table[i*numPesOnNodeSharingDevice + j] = 1;
441  }
442  }
443  // Assign this patch to homePe, if it resides on the same node
444  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
445  patches[i].pe = homePe;
446  --unassignedpatches;
447  rankpcount[CkRankOf(homePe)] += 1;
448  }
449  }
450  // assign if only one pe has a required proxy
451  for (int i=0; i < patches.size(); ++i) {
452  int pid = patches[i].patchID;
453  if ( patches[i].pe != -1 ) continue;
454  int c = 0;
455  int lastj;
456  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
457  if ( table[i*numPesOnNodeSharingDevice + j] ) {
458  ++c;
459  lastj = j;
460  }
461  }
462  count[i] = c;
463  if ( c == 1 ) {
464  patches[i].pe = pesOnNodeSharingDevice[lastj];
465  --unassignedpatches;
466  pcount[lastj] += 1;
467  }
468  }
469  int assignj = 0;
470  while ( unassignedpatches ) {
471  int i;
472  for (i=0;i < patches.size(); ++i) {
473  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
474  int pid = patches[i].patchID;
475  // patch_record &pr = patchRecords[pid];
476  if ( patches[i].pe != -1 ) continue;
477  patches[i].pe = pesOnNodeSharingDevice[assignj];
478  --unassignedpatches;
479  pcount[assignj] += 1;
480  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
481  break;
482  }
483  if (i < patches.size() ) continue; // start search again
484  for ( i=0;i < patches.size(); ++i ) {
485  int pid = patches[i].patchID;
486  // patch_record &pr = patchRecords[pid];
487  if ( patches[i].pe != -1 ) continue;
488  if ( count[i] ) continue;
489  patches[i].pe = pesOnNodeSharingDevice[assignj];
490  --unassignedpatches;
491  pcount[assignj] += 1;
492  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
493  break;
494  }
495  if ( i < patches.size() ) continue; // start search again
496  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
497  }
498 
499  // For each rank, list of patches
500  rankPatches.resize(CkMyNodeSize());
501  for (int i=0; i < patches.size(); ++i) {
502  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
503  pidMap[patches[i].patchID] = i;
504  }
505 
506  // for ( int i=0; i < patches.size(); ++i ) {
507  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
508  // }
509 #else
510  // For each rank, list of patches
511  rankPatches.resize(CkMyNodeSize());
512  // For each rank, list of home patch IDs
513  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
514  for (int i=0;i < CkMyNodeSize();i++) {
515  int pe = CkNodeFirst(CkMyNode()) + i;
516  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
517  }
518  std::vector<int> proxyPatchPes;
519  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
520  //--------------------------------------------------------
521  // Build a list of PEs to avoid
522  std::vector<int> pesToAvoid;
523 #if 0
524  // Avoid other GPUs' master PEs
525  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
526  int pe = deviceCUDA->getMasterPeForDeviceID(i);
527  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
528  }
529  // Avoid PEs that are involved in PME
530  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
531  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
532  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
533  }
534  // Set counters of avoidable PEs to high numbers
535  for (int i=0;i < pesToAvoid.size();i++) {
536  int pe = pesToAvoid[i];
537  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
538  }
539 #endif
540  // Avoid master Pe somewhat
541  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
542  //--------------------------------------------------------
543  for (int i=0;i < patches.size();i++) {
544  //if I had this datastructure "patches" on the GPU, I could use it
545  PatchID pid = patches[i].patchID;
546  int pe = findHomePatchPe(rankHomePatchIDs, pid);
547  if (pe == -1) {
548  // Patch not present on this node => try finding a ProxyPatch
549  findProxyPatchPes(proxyPatchPes, pid);
550  if (proxyPatchPes.size() == 0) {
551  // No ProxyPatch => create one on rank that has the least ProxyPatches
552  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
553  pe = CkNodeFirst(CkMyNode()) + rank;
554  peProxyPatchCounter[rank]++;
555  } else {
556  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
557  // this is done by finding the entry with minimum peProxyPatchCounter -value
 558  // Find the minimum among proxyPatchPes, i.e., find the minimum among
559  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
560  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
561  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
562  // - proxyPatchPes.begin();
563  // pe = proxyPatchPes[pppi];
564  int minCounter = (1 << 30);
565  for (int j=0;j < proxyPatchPes.size();j++) {
566  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
567  pe = proxyPatchPes[j];
568  minCounter = peProxyPatchCounter[CkRankOf(pe)];
569  }
570  }
571  if (pe == -1)
572  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
573  peProxyPatchCounter[CkRankOf(pe)]++;
574  }
575  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
576  // Found home patch on this node, but it's on PE that should be avoided => find a new one
577  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
578  pe = CkNodeFirst(CkMyNode()) + rank;
579  peProxyPatchCounter[rank]++;
580  }
581  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
582  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
583  rankPatches[CkRankOf(pe)].push_back(i);
584  pidMap[pid] = i;
585  }
586 
587  delete [] rankHomePatchIDs;
588 #endif
589  // Setup computes using pidMap
590  for (int i=0;i < computes.size();i++) {
591  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
592  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
593  }
594  for (int i=0;i < CkMyNodeSize();i++) {
595  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
596  }
597  computeMgr->sendAssignPatchesOnPe(pes, this);
598 }
void setNumPatches(int n)
Definition: Compute.h:52
int getDeviceCount()
Definition: DeviceCUDA.h:124
static PatchMap * Object()
Definition: PatchMap.h:27
void basePatchIDList(int pe, PatchIDList &)
Definition: PatchMap.C:454
static PatchMap * ObjectOnPe(int pe)
Definition: PatchMap.h:28
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
static ComputePmeCUDAMgr * Object()
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int getMasterPeForDeviceID(int deviceID)
Definition: DeviceCUDA.C:530
int getPesSharingDevice(const int i)
Definition: DeviceCUDA.h:139
int getNumPatches()
Definition: Compute.h:53
int getNumPesSharingDevice()
Definition: DeviceCUDA.h:138
int32 PatchID
Definition: NamdTypes.h:277
void findProxyPatchPes(std::vector< int > &proxyPatchPes, PatchID pid)
void sendAssignPatchesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1681
int findHomePatchPe(PatchIDList *rankPatchIDs, PatchID pid)
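
The fallback loop above (source lines 469-497) hands any still-unassigned patch to the PEs sharing the device in rotation. A minimal standalone sketch of that round-robin step, with illustrative names rather than the NAMD data structures:

#include <vector>

// patchPe[i] == -1 marks an unassigned patch; pes lists PEs sharing the device.
void roundRobinAssign(std::vector<int> &patchPe, const std::vector<int> &pes) {
  int assignj = 0;
  for (size_t i = 0; i < patchPe.size(); ++i) {
    if (patchPe[i] != -1) continue;            // already assigned
    patchPe[i] = pes[assignj];                 // hand to the next PE in rotation
    assignj = (assignj + 1) % (int)pes.size();
  }
}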

◆ assignPatchesOnPe()

void CudaComputeNonbonded::assignPatchesOnPe ( )

Definition at line 322 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), NAMD_bug(), PatchMap::node(), PatchMap::Object(), and ResizeArray< Elem >::size().

Referenced by ComputeMgr::recvAssignPatchesOnPe().

322  {
323  if (rankPatches[CkMyRank()].size() == 0)
324  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
325 
326  // calculate priority rank of local home patch within pe
327  {
328  PatchMap* patchMap = PatchMap::Object();
329  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
330  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
331  int i = rankPatches[CkMyRank()][k];
332  int pid = patches[i].patchID;
333  int homePe = patchMap->node(pid);
334  if ( CkNodeOf(homePe) == CkMyNode() ) {
335  int2 pid_index;
336  pid_index.x = pid;
337  pid_index.y = i;
338  homePatchByRank[CkRankOf(homePe)].add(pid_index);
339  }
340  }
341  for ( int i=0; i<CkMyNodeSize(); ++i ) {
 342  pid_sortop_reverse_priority so;
 343  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
344  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
345  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
346  int index = homePatchByRank[i][j].y;
347  patches[index].reversePriorityRankInPe = j + masterBoost;
348  }
349  }
350  }
351 
352  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
353  assignPatch(rankPatches[CkMyRank()][i]);
354  }
355 }
static PatchMap * Object()
Definition: PatchMap.h:27
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int node(int pid) const
Definition: PatchMap.h:114

◆ atomUpdate()

void CudaComputeNonbonded::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 686 of file CudaComputeNonbonded.C.

686  {
687  atomsChangedIn = true;
688 }

◆ doWork()

void CudaComputeNonbonded::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 1112 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, Flags::doEnergy, Flags::doFullElectrostatics, Flags::doMinimize, Flags::doNonbonded, Flags::doVirial, Compute::gbisPhase, NAMD_bug(), Node::Object(), openBoxesOnPe(), CudaTileListKernel::prepareBuffers(), ComputeMgr::sendOpenBoxesOnPe(), Node::simParameters, and simParams.

1112  {
1113  if (CkMyPe() != masterPe)
1114  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
1115 
1116  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
1117  // atomsChangedIn can be set to true by any Pe
1118  // atomsChanged can only be set by masterPe
1119  // This use of two variables makes sure we don't have a race condition;
1120  // it seems like it's important to have the masterPe call doWork() first
1121  atomsChanged = atomsChangedIn;
1122  atomsChangedIn = false;
1123 
1124  SimParameters *simParams = Node::Object()->simParameters;
1125 
1126  if (patches.size() == 0) return; // No work to do
1127 
1128  // Take the flags from the first patch on this Pe
1129  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
1130  // these flags are probably wrong.
1131  Flags &flags = patches[0].patch->flags;
1132 
1133  doSlow = flags.doFullElectrostatics;
1134  doEnergy = flags.doEnergy;
1135  doVirial = flags.doVirial;
1136  doAlch = simParams->alchOn;
1137  doMinimize = flags.doMinimize;
1138 
1139  if (flags.doNonbonded) {
1140 
1141  if (simParams->GBISOn) {
1142  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
1143  }
1144 
1145  if (!simParams->GBISOn || gbisPhase == 1) {
1146  if ( computesChanged ) {
1147  updateComputes();
1148  }
1149  if (atomsChanged) {
1150  // Re-calculate patch atom numbers and storage
1151  updatePatches();
1152  reSortDone = false;
1153  }
1154  reallocateArrays();
1155 #ifdef NODEGROUP_FORCE_REGISTER
1156  if (simParams->CUDASOAintegrate && simParams->useDeviceMigration && atomsChanged) {
1157  tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
1158  updatePatchRecord();
1159  }
1160 #endif // NODEGROUP_FORCE_REGISTER
1161  }
1162 
1163  // Open boxes on Pes and launch work to masterPe
1164  if(params->CUDASOAintegrate){
1165  if(!atomsChanged) this->openBoxesOnPe();
1166  }
1167  else computeMgr->sendOpenBoxesOnPe(pes, this);
1168 
1169  } else {
1170  // No work to do, skip
1171  skip();
1172  }
1173 
1174 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int doEnergy
Definition: PatchTypes.h:20
int doFullElectrostatics
Definition: PatchTypes.h:23
int doNonbonded
Definition: PatchTypes.h:22
int gbisPhase
Definition: Compute.h:39
#define simParams
Definition: Output.C:129
int doVirial
Definition: PatchTypes.h:21
void sendOpenBoxesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1734
int doMinimize
Definition: PatchTypes.h:25
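
The atomsChangedIn/atomsChanged pair implements a simple flag handoff: any PE may raise atomsChangedIn in atomUpdate(), and only masterPe latches it into atomsChanged at the start of doWork(). A stripped-down illustration of the pattern (not the actual class members):

struct ChangeFlags {
  bool in = false;       // may be set by any PE via atomUpdate()
  bool latched = false;  // read and written only by masterPe

  void update() { in = true; }   // what atomUpdate() does
  void latch() {                 // what doWork() does first on masterPe
    latched = in;
    in = false;
  }
};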

◆ finishPatches()

void CudaComputeNonbonded::finishPatches ( )

Definition at line 1943 of file CudaComputeNonbonded.C.

References cudaCheck, SimParameters::CUDASOAintegrate, finishPatchesOnPe(), and ComputeMgr::sendFinishPatchesOnPe().

1943  {
1944  if(params->CUDASOAintegrate){
1945  if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
1946  this->finishPatchesOnPe();
1947  }
1948  else {
1949  computeMgr->sendFinishPatchesOnPe(pes, this);
1950  }
1951 }
void sendFinishPatchesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1707
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ finishPatchesOnPe()

void CudaComputeNonbonded::finishPatchesOnPe ( )

Definition at line 1931 of file CudaComputeNonbonded.C.

Referenced by finishPatches(), and ComputeMgr::recvFinishPatchesOnPe().

1931  {
1932  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1933 }

◆ finishPatchOnPe()

void CudaComputeNonbonded::finishPatchOnPe ( int  i)

Definition at line 1938 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

1938  {
1939  std::vector<int> v(1, i);
1940  finishSetOfPatchesOnPe(v);
1941 }

◆ finishReductions()

void CudaComputeNonbonded::finishReductions ( )

Definition at line 1629 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, SimParameters::CUDASOAintegrate, VirialEnergy::energyElec, VirialEnergy::energyElec_s, VirialEnergy::energyElec_ti_1, VirialEnergy::energyElec_ti_2, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energySlow_s, VirialEnergy::energySlow_ti_1, VirialEnergy::energySlow_ti_2, VirialEnergy::energyVdw, VirialEnergy::energyVdw_s, VirialEnergy::energyVdw_ti_1, VirialEnergy::energyVdw_ti_2, SimParameters::GBISOn, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NodeReduction::item(), NAMD_bug(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_F, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_ELECT_ENERGY_SLOW_TI_1, REDUCTION_ELECT_ENERGY_SLOW_TI_2, REDUCTION_ELECT_ENERGY_TI_1, REDUCTION_ELECT_ENERGY_TI_2, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_LJ_ENERGY_F, REDUCTION_LJ_ENERGY_TI_1, REDUCTION_LJ_ENERGY_TI_2, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

1629  {
1630  if (CkMyPe() != masterPe)
1631  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1632 
1633  // fprintf(stderr, "PE[%d]: Nbond finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1634  if (!doSkip) {
1635 
1636  if (doStreaming && (doVirial || doEnergy)) {
1637  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1638  if (!forceDoneEventRecord)
1639  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1640  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1641  forceDoneEventRecord = false;
1642  }
1643 
1644  if (doVirial) {
1645  // if(params->CUDASOAintegrate) cudaCheck(cudaStreamSynchronize(stream));
1646  Tensor virialTensor;
1647  virialTensor.xx = h_virialEnergy->virial[0];
1648  virialTensor.xy = h_virialEnergy->virial[1];
1649  virialTensor.xz = h_virialEnergy->virial[2];
1650  virialTensor.yx = h_virialEnergy->virial[3];
1651  virialTensor.yy = h_virialEnergy->virial[4];
1652  virialTensor.yz = h_virialEnergy->virial[5];
1653  virialTensor.zx = h_virialEnergy->virial[6];
1654  virialTensor.zy = h_virialEnergy->virial[7];
1655  virialTensor.zz = h_virialEnergy->virial[8];
1656  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1657  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1658  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1659 #ifdef NODEGROUP_FORCE_REGISTER
1660  if (params->CUDASOAintegrate) {
1661  ADD_TENSOR_OBJECT(nodeReduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1662  } else
1663 #endif
1664  {
1665  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1666  }
1667  if (doSlow) {
1668  Tensor virialTensor;
1669  virialTensor.xx = h_virialEnergy->virialSlow[0];
1670  virialTensor.xy = h_virialEnergy->virialSlow[1];
1671  virialTensor.xz = h_virialEnergy->virialSlow[2];
1672  virialTensor.yx = h_virialEnergy->virialSlow[3];
1673  virialTensor.yy = h_virialEnergy->virialSlow[4];
1674  virialTensor.yz = h_virialEnergy->virialSlow[5];
1675  virialTensor.zx = h_virialEnergy->virialSlow[6];
1676  virialTensor.zy = h_virialEnergy->virialSlow[7];
1677  virialTensor.zz = h_virialEnergy->virialSlow[8];
1678  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1679  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1680  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1681 #ifdef NODEGROUP_FORCE_REGISTER
1682  if (params->CUDASOAintegrate) {
1683  ADD_TENSOR_OBJECT(nodeReduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1684  } else
1685 #endif
1686  {
1687  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1688  }
1689  }
1690  }
1691  if (doEnergy) {
1692  // if (doSlow)
1693 
1694 #ifdef NODEGROUP_FORCE_REGISTER
1695  if (params->CUDASOAintegrate) {
1696  nodeReduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1697  nodeReduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1698  nodeReduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1699  nodeReduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1700 
1701  //Reduce values for TI
1702  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1703  nodeReduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1704  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1705  nodeReduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1706  } else
1707 #endif
1708  {
1709  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1710  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1711  reduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1712  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1713  reduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1714 
1715  //Reduce values for TI
1716  reduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1717  reduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1718  reduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1719  reduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1720  }
1721 
1722  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1723  if (doSlow){
1724 #ifdef NODEGROUP_FORCE_REGISTER
1725  if (params->CUDASOAintegrate) {
1726  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1727  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1728  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1729  nodeReduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1730  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1731  } else
1732 #endif
1733  {
1734  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1735  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1736  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1737  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1738  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1739  }
1740  }
1741  }
1742 
1743 #ifdef NODEGROUP_FORCE_REGISTER
1744  if (params->CUDASOAintegrate) {
1745  nodeReduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1746  } else
1747 #endif
1748  {
1749  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1750  }
1751  }
1752 #ifdef NODEGROUP_FORCE_REGISTER
1753  if (params->CUDASOAintegrate) {
1754  nodeReduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1755  } else
1756 #endif
1757  {
1758  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1759  }
1760 
1761 
1762  // I need to get rid of this for every timestep.
1763  if(!params->CUDASOAintegrate ) reduction->submit();
1764  // Reset flags
1765  doSkip = false;
1766  computesChanged = false;
1767 }
BigReal zy
Definition: Tensor.h:19
BigReal xz
Definition: Tensor.h:17
#define ADD_TENSOR_OBJECT(R, RL, D)
Definition: ReductionMgr.h:44
BigReal & item(int i)
Definition: ReductionMgr.h:313
BigReal yz
Definition: Tensor.h:18
void NAMD_bug(const char *err_msg)
Definition: common.C:195
BigReal yx
Definition: Tensor.h:18
ReductionValue & item(int index)
Definition: ReductionMgr.C:633
BigReal xx
Definition: Tensor.h:17
double virialSlow[9]
BigReal zz
Definition: Tensor.h:19
Definition: Tensor.h:15
BigReal xy
Definition: Tensor.h:17
BigReal yy
Definition: Tensor.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
void submit(void)
Definition: ReductionMgr.h:324
BigReal zx
Definition: Tensor.h:19
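
The nine-element virial arrays above are copied into a Tensor in row-major order. For reference, the same unpacking as a hypothetical helper (not present in the source):

// v[0..8] holds xx, xy, xz, yx, yy, yz, zx, zy, zz.
Tensor unpackVirial(const double v[9]) {
  Tensor t;
  t.xx = v[0]; t.xy = v[1]; t.xz = v[2];
  t.yx = v[3]; t.yy = v[4]; t.yz = v[5];
  t.zx = v[6]; t.zy = v[7]; t.zz = v[8];
  return t;
}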

◆ gbisP2PatchReady()

void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 264 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

264  {
265  CmiLock(lock);
266  Compute::gbisP2PatchReady(pid, seq);
267  CmiUnlock(lock);
268 }
virtual void gbisP2PatchReady(PatchID, int seq)
Definition: Compute.C:96

◆ gbisP3PatchReady()

void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 270 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

270  {
271  CmiLock(lock);
272  Compute::gbisP3PatchReady(pid, seq);
273  CmiUnlock(lock);
274 }
virtual void gbisP3PatchReady(PatchID, int seq)
Definition: Compute.C:106

◆ getDoTable()

bool CudaComputeNonbonded::getDoTable ( SimParameters *  params,
const bool  doSlow,
const bool  doVirial 
)
static

Definition at line 2374 of file CudaComputeNonbonded.C.

References simParams.

2374  {
2375  // There is additional logic in SimParameters.C which guards against unsupported force fields
2376  // This should only be used for performance heuristics
2377  bool doTable = simParams->useCUDANonbondedForceTable;
2378 
2379  // DMC: I found the doSlow case is faster with force tables, so overriding setting
2380  // TODO This should be reevaluated for future architectures
2381  doTable = doTable || doSlow;
2382  // Direct math does not support virial+slow
2383  // Redundant but necessary for correctness so doing it explicitly
2384  doTable = doTable || (doSlow && doVirial);
2385 
2386  return doTable;
2387 }
#define simParams
Definition: Output.C:129
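
In practice the heuristic reduces to "use force tables whenever doSlow is set". A hypothetical call site, assuming the usual NAMD accessors and a step's Flags in scope (compiles only inside the NAMD tree):

SimParameters *sp = Node::Object()->simParameters;
const bool useTables =
    CudaComputeNonbonded::getDoTable(sp, flags.doFullElectrostatics, flags.doVirial);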

◆ getNonbondedCoef()

CudaNBConstants CudaComputeNonbonded::getNonbondedCoef ( SimParameters *  params)
static

Definition at line 2341 of file CudaComputeNonbonded.C.

References ComputeNonbondedUtil::c1, ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::cutoff2, CudaNBConstants::e_0, CudaNBConstants::e_0_slow, CudaNBConstants::e_1, CudaNBConstants::e_2, CudaNBConstants::ewald_0, CudaNBConstants::ewald_1, CudaNBConstants::ewald_2, CudaNBConstants::ewald_3_slow, ComputeNonbondedUtil::ewaldcof, CudaNBConstants::lj_0, CudaNBConstants::lj_1, CudaNBConstants::lj_2, CudaNBConstants::lj_3, CudaNBConstants::lj_4, CudaNBConstants::lj_5, ComputeNonbondedUtil::pi_ewaldcof, simParams, CudaNBConstants::slowScale, ComputeNonbondedUtil::switchOn, and ComputeNonbondedUtil::switchOn2.

2341  {
2342  const float cutoff = ComputeNonbondedUtil::cutoff;
2343  const float cutoff2 = ComputeNonbondedUtil::cutoff2;
2344  const float cutoffInv = 1.0f / cutoff;
2345  const float cutoff2Inv = 1.0f / cutoff2;
2346  const float scutoff = ComputeNonbondedUtil::switchOn;
2347  const float scutoff2 = ComputeNonbondedUtil::switchOn2;
2348  const float scutoff2Inv = 1.0f / scutoff2;
2349  const float scutoff_denom = ComputeNonbondedUtil::c1;
2352  const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;
2353 
2354  CudaNBConstants c;
2355  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
2356  c.lj_1 = scutoff_denom * 2.0f;
2357  c.lj_2 = scutoff_denom * -12.0f;
2358  c.lj_3 = 12.0f * scutoff_denom * scutoff2;
2359  c.lj_4 = cutoff2;
2360  c.lj_5 = scutoff2;
2361  c.e_0 = cutoff2Inv * cutoffInv;
2362  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
2363  c.e_1 = cutoff2Inv;
2364  c.e_2 = cutoffInv;
2365  c.ewald_0 = ewaldcof;
2366  c.ewald_1 = pi_ewaldcof;
2367  c.ewald_2 = ewaldcof * ewaldcof;
2368  c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
2369  c.slowScale = slowScale;
2370 
2371  return c;
2372 }
float ewald_3_slow
Definition: CudaUtils.h:607
#define simParams
Definition: Output.C:129
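
slowScale is the ratio fullElectFrequency / nonbondedFrequency, and e_0_slow folds (1 - slowScale) into the 1/cutoff^3 electrostatic coefficient for multiple timestepping. A standalone arithmetic check with hypothetical parameter values:

#include <cstdio>

int main() {
  const float cutoff = 12.0f;   // hypothetical cutoff in Angstroms
  const float fullElectFrequency = 4.0f, nonbondedFrequency = 2.0f;
  const float slowScale = fullElectFrequency / nonbondedFrequency;  // 2.0
  const float e_0      = 1.0f / (cutoff * cutoff * cutoff);         // cutoff2Inv * cutoffInv
  const float e_0_slow = e_0 * (1.0f - slowScale);                  // negative when slowScale > 1
  printf("slowScale=%g e_0=%g e_0_slow=%g\n", slowScale, e_0, e_0_slow);
  return 0;
}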

◆ getPatches()

std::vector<PatchRecord>& CudaComputeNonbonded::getPatches ( )
inline

Definition at line 304 of file CudaComputeNonbonded.h.

304 { return patches; }

◆ initialize()

void CudaComputeNonbonded::initialize ( void  )
virtual

Reimplemented from Compute.

Definition at line 629 of file CudaComputeNonbonded.C.

References ATOMIC_BINS, cudaCheck, deviceCUDA, DeviceCUDA::getDeviceIndex(), PatchMap::numPatches(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), Compute::priority(), PatchData::reduction, REDUCTIONS_BASIC, Node::simParameters, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

629  {
630  if (patches.size() > 0) {
631  npairlists = 0;
632  // Allocate CUDA version of patches
633  cudaCheck(cudaSetDevice(deviceID));
634  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
635 
636  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
637  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
638 
639  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
640  */
641  cudaDeviceProp props;
642  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); //Gets properties of 'deviceID device'
643  maxShmemPerBlock = props.sharedMemPerBlock;
644 
645 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
646  int leastPriority, greatestPriority;
647  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
648  int priority = (doStreaming) ? leastPriority : greatestPriority;
649  // int priority = greatestPriority;
650  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
651 #else
652  cudaCheck(cudaStreamCreate(&stream));
653 #endif
654  cudaCheck(cudaEventCreate(&forceDoneEvent));
655 
656  buildExclusions();
657 
658  lock = CmiCreateLock();
659  params = Node::Object()->simParameters;
 660  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
 661 
662 #ifdef NODEGROUP_FORCE_REGISTER
663  int devInd = deviceCUDA->getDeviceIndex();
664  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
665  PatchData *patchData = cpdata.ckLocalBranch();
666  nodeReduction = patchData->reduction;
667  patchData->devData[devInd].nbond_stream = stream;
668  // Fill auxiliary arrays for merging forces here
669  PatchMap* map = PatchMap::Object();
670  int nGlobalPatches = map->numPatches();
671  allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
672  memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
673 
674  for(int i = 0; i < patches.size(); i++){
675  patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
676  }
677  allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
678  copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
679 #endif
680  }
681 }
static Node * Object()
Definition: Node.h:86
void buildExclusions()
Definition: CompressPsf.C:1197
static PatchMap * Object()
Definition: PatchMap.h:27
SimParameters * simParameters
Definition: Node.h:181
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:366
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:279
NodeReduction * reduction
Definition: PatchData.h:133
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int numPatches(void) const
Definition: PatchMap.h:59
#define ATOMIC_BINS
Definition: CudaUtils.h:70
int priority(void)
Definition: Compute.h:65
int getDeviceIndex()
Definition: DeviceCUDA.h:166
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
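
The stream-priority choice above gives streaming mode the least-urgent priority so per-patch force copies can overlap other GPU work. The same choice as a standalone helper (illustrative, error checks omitted):

#include <cuda_runtime.h>

cudaStream_t makeNonbondedStream(bool doStreaming) {
  int leastPriority, greatestPriority;
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  cudaStream_t s;
  cudaStreamCreateWithPriority(&s, cudaStreamDefault,
                               doStreaming ? leastPriority : greatestPriority);
  return s;
}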

◆ launchWork()

void CudaComputeNonbonded::launchWork ( )

Definition at line 1176 of file CudaComputeNonbonded.C.

References CudaComputeNonbonded::PatchRecord::atomStart, ResizeArray< Elem >::begin(), cudaCheck, ComputeNonbondedUtil::cutoff, ResizeArray< Elem >::end(), Patch::flags, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), Patch::getPatchID(), CudaComputeNonbondedKernel::getPatchReadyQueue(), PatchMap::homePatchList(), SubmitReduction::item(), NodeReduction::item(), NAMD_bug(), NAMD_EVENT_START, NAMD_EVENT_STOP, CudaComputeNonbonded::PatchRecord::numAtoms, PatchMap::Object(), Node::Object(), HomePatchElem::patch, CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, reSortTileLists(), Flags::savePairlists, Node::simParameters, simParams, Flags::step, and Flags::usePairlists.

Referenced by openBoxesOnPe(), and ComputeMgr::recvLaunchWork().

1176  {
1177  if (CkMyPe() != masterPe)
1178  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
1179 
1180  beforeForceCompute = CkWallTimer();
1181  cudaCheck(cudaSetDevice(deviceID));
1183 
1184  // So, it seems like PE's are invoking the same object, however the patches[i] is borked on the masterPe
1185 
1186  // When I get here, it seems like compAtoms are not set for all Pes? How can that be?
1187 
1188  //execute only during GBIS phase 1, or if not using GBIS
1189  if (!simParams->GBISOn || gbisPhase == 1) {
1190 
1191  if ( atomsChanged || computesChanged ) {
1192  // Invalidate pair lists
1193  pairlistsValid = false;
1194  pairlistTolerance = 0.0f;
1195  }
1196 
1197  // Get maximum atom movement and patch tolerance
1198  float maxAtomMovement = 0.0f;
1199  float maxPatchTolerance = 0.0f;
1200  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
1201  // Update pair-list cutoff
1202  Flags &flags = patches[0].patch->flags;
1203  savePairlists = false;
1204  usePairlists = false;
1205  if ( flags.savePairlists ) {
1206  savePairlists = true;
1207  usePairlists = true;
1208  } else if ( flags.usePairlists ) {
1209  if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1210  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1211 #ifdef NODEGROUP_FORCE_REGISTER
1212  nodeReduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1213 #endif
1214  } else {
1215  usePairlists = true;
1216  }
1217  }
1218  if ( ! usePairlists ) {
1219  pairlistsValid = false;
1220  }
1221  float plcutoff = cutoff;
1222  if ( savePairlists ) {
1223  pairlistsValid = true;
1224  pairlistTolerance = 2. * maxPatchTolerance;
1225  plcutoff += pairlistTolerance;
1226  }
1227  plcutoff2 = plcutoff * plcutoff;
1228 
1229  // fprintf(stderr, "STEP[%d] plcutoff = %f listTolerance = %f save = %d maxPatchTolerance = %f maxAtomMovement = %f plvalid = %d flags.use = %d use = %d\n",
1230  // flags.step, plcutoff, pairlistTolerance, savePairlists, maxPatchTolerance, maxAtomMovement, pairlistsValid, flags.usePairlists, usePairlists);
1231  if(savePairlists || !usePairlists){
1232  reSortDone = false; // Ensures pairlist resorting if doPairlist
1233  }
1234 
1235  // if (atomsChanged)
1236  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1237  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1238 
1239  } // if (!simParams->GBISOn || gbisPhase == 1)
1240 
1241  // Calculate PME & VdW forces
1242  if (!simParams->GBISOn || gbisPhase == 1) {
1243  doForce();
1244  if (doStreaming) {
1245  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1246  patchReadyQueueLen = tileListKernel.getNumPatches();
1247  patchReadyQueueNext = 0;
1248  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1249  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1250  int* emptyPatches = tileListKernel.getEmptyPatches();
1251  for (int i=0;i < numEmptyPatches;i++) {
1252  PatchRecord &pr = patches[emptyPatches[i]];
1253  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1254  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1255  patchReadyQueue[i] = emptyPatches[i];
1256  }
1257  if (patchReadyQueueLen != patches.size())
1258  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1259  }
1260  }
1261 
1262  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1263  // before calling doGBISphase1().
1264  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1265  // In this code path doGBISphase1() is called in forceDone()
1266  forceDoneSetCallback();
1267  return;
1268  }
1269 
1270  // GBIS Phases
1271  if (simParams->GBISOn) {
1272  if (gbisPhase == 1) {
1273  doGBISphase1();
1274  } else if (gbisPhase == 2) {
1275  doGBISphase2();
1276  } else if (gbisPhase == 3) {
1277  doGBISphase3();
1278  }
1279  }
1280 
1281  // Copy forces to host
1282  if (!simParams->GBISOn || gbisPhase == 3) {
1283  if (!doStreaming) {
1284 #ifdef NODEGROUP_FORCE_REGISTER
1285  if(!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)){
1286  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1287  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1288  }
1289 #else
1290  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1291  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1292 #endif
1293 
1294  }
1295  }
1296 
1297  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1298 
1299  NAMD_EVENT_START(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1300  // For GBIS, energies are ready after phase 2
1301  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1302  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1303  d_forces, d_forcesSlow, d_virialEnergy, stream);
1304  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1305 
1306  NAMD_EVENT_STOP(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1307 
1308  }
1309 
1310  if(simParams->CUDASOAintegrate && ((savePairlists || !usePairlists)) && !atomsChanged) reSortTileLists();
1311 
1312  // Setup call back
1313  forceDoneSetCallback();
1314 
1315 #if 0
1316  cudaCheck(cudaStreamSynchronize(stream));
1317  PatchMap *map = PatchMap::Object();
1318  HomePatchElem *elem;
1319  for(elem = map->homePatchList()->begin(); elem != map->homePatchList()->end(); elem++){
1320  if(elem->patch->getPatchID() == 7) break;
1321  }
1322  if(elem->patch->flags.step == 11){
1323  // it would be good to know from which patch these atoms are...
1324  fprintf(stderr, "CudaNonbonded data\n");
1325  for(int i = 0 ; i < atomStorageSize; i++){
1326  fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
1327  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
1328  // for some reason, we needed to set the positions
1329  h_forces[i].x, h_forces[i].y, h_forces[i].z,
1330  h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
1331  }
1332  }
1333 #endif
1334 
1335 }

◆ messageEnqueueWork()

void CudaComputeNonbonded::messageEnqueueWork ( )

Definition at line 1025 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

1025  {
1026  if (masterPe != CkMyPe())
1027  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
1028  WorkDistrib::messageEnqueueWork(this);
1029 }

◆ noWork()

int CudaComputeNonbonded::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 1077 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

1077  {
1078  // Simply enqueue doWork on masterPe and return "no work"
1079  computeMgr->sendMessageEnqueueWork(masterPe, this);
1080  return 1;
1081 }
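
Returning 1 tells the scheduler there is nothing to run locally, while the real work is funneled to masterPe as a message. A rough single-process sketch of that delegate-to-master shape; Task, noWorkLike, and masterQueue are illustrative stand-ins for the Charm++ machinery:

#include <functional>
#include <queue>

// Stand-ins for the compute object and masterPe's message queue.
struct Task {
  int masterRank;                        // the one rank that runs doWork()
  std::function<void()> doWork;
};

// Always answers "no work" locally (return 1) and forwards the actual work
// to the master's queue, just as noWork() forwards doWork() through
// sendMessageEnqueueWork(masterPe, this).
int noWorkLike(const Task &t, std::queue<std::function<void()>> &masterQueue) {
  masterQueue.push(t.doWork);
  return 1;
}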

◆ openBoxesOnPe()

void CudaComputeNonbonded::openBoxesOnPe ( )

Definition at line 1031 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, Compute::getNumPatches(), launchWork(), NAMD_bug(), NAMD_EVENT_START, NAMD_EVENT_STOP, Node::Object(), and ComputeMgr::sendLaunchWork().

Referenced by doWork(), and ComputeMgr::recvOpenBoxesOnPe().

1031  {
1032  if (rankPatches[CkMyRank()].size() == 0)
1033  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
1034 
1035  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1036 
1037 #ifdef NODEGROUP_FORCE_REGISTER
1038  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
1039  // opens boxes to make sure NAMD won't complain
1040  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1041  int j = rankPatches[CkMyRank()][i];
1042  patches[j].positionBox->open();
1043  }
1044  if(masterPe == CkMyPe()) {
1045  // we need to open boxes here...
1046  if(params->CUDASOAintegrate){
1047  if(!atomsChanged) this->launchWork();
1048  }
1049  else computeMgr->sendLaunchWork(masterPe, this);
1050  }
1051  }
1052  else{
1053 #endif
1054  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1055  openBox(rankPatches[CkMyRank()][i]);
1056  }
1057  bool done = false;
1058  CmiLock(lock);
1059  patchesCounter -= rankPatches[CkMyRank()].size();
1060  if (patchesCounter == 0) {
1061  patchesCounter = getNumPatches();
1062  done = true;
1063  }
1064  CmiUnlock(lock);
1065  if (done) {
1066  if(params->CUDASOAintegrate){
1067  if(!atomsChanged) this->launchWork();
1068  }
1069  else computeMgr->sendLaunchWork(masterPe, this);
1070  }
1071 #ifdef NODEGROUP_FORCE_REGISTER
1072  }
1073 #endif
1074  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1075 }
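
The CmiLock / patchesCounter sequence above is a last-rank-triggers barrier: each rank subtracts its share of patches under the lock, and the rank that drives the counter to zero resets it for the next cycle and launches the work. A minimal sketch of the pattern with std::mutex standing in for the Converse lock; PatchBarrier and finishRankShare are illustrative names:

#include <mutex>

// Sketch of the shared countdown used above; counter starts at the total
// patch count and each rank subtracts its share under the lock.
struct PatchBarrier {
  std::mutex lock;          // stands in for the Converse lock (CmiLock)
  int counter;              // patchesCounter in the listing
  int total;                // getNumPatches()
};

// Returns true exactly once per cycle, on the rank that opened the last box.
bool finishRankShare(PatchBarrier &b, int myShare) {
  std::lock_guard<std::mutex> g(b.lock);
  b.counter -= myShare;
  if (b.counter == 0) {     // last rank: reset for the next cycle and fire
    b.counter = b.total;
    return true;
  }
  return false;
}

In the listing, the rank that sees done == true then either calls launchWork() directly (CUDASOAintegrate) or forwards it to masterPe via sendLaunchWork().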

◆ patchReady()

void CudaComputeNonbonded::patchReady ( PatchID  pid,
int  doneMigration,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 247 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, NAMD_bug(), Compute::patchReady(), and SimParameters::useDeviceMigration.

247  {
248  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
249  // which is already called in Sequencer::runComputeObjects_CUDA.
250  // The functionality of updatePatch() was moved into updatePatches().
251  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
252  if (doneMigration) {
253  int i = findPid(pid);
254  if (i == -1)
255  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
256  updatePatch(i);
257  }
258  CmiLock(lock);
259  Compute::patchReady(pid, doneMigration, seq);
260  CmiUnlock(lock);
261  }
262 }
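
On the doneMigration path, the patch's cached record is first located by ID and refreshed before Compute::patchReady() is notified under the lock. A small sketch of that lookup step, assuming an illustrative Rec type and a linear scan (the real findPid() may be implemented differently):

#include <vector>

// Illustrative per-patch record; the real PatchRecord carries more state.
struct Rec { int pid; int atomStart, numAtoms; };

int findIndex(const std::vector<Rec> &recs, int pid) {
  for (int i = 0; i < (int)recs.size(); i++)
    if (recs[i].pid == pid) return i;   // first record with a matching ID
  return -1;                            // caller treats -1 as a fatal bug
}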

◆ registerComputePair()

void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID pid,
int *  trans 
)

Definition at line 185 of file CudaComputeNonbonded.C.

References PatchMap::center(), Compute::cid, PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

185  {
186  computesChanged = true;
187  addPatch(pid[0]);
188  addPatch(pid[1]);
189  PatchMap* patchMap = PatchMap::Object();
190  int t1 = trans[0];
191  int t2 = trans[1];
192  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
193  offset.x += (t1%3-1) - (t2%3-1);
194  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
195  offset.z += (t1/9-1) - (t2/9-1);
196  addCompute(cid, pid[0], pid[1], offset);
197 }
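
The offset arithmetic above decodes each translation code t in [0,26] into one periodic-image shift per axis, each in {-1, 0, 1}, with t == 13 the untranslated image. A standalone sketch of the decoding; Shift and decodeTrans are illustrative names:

struct Shift { int x, y, z; };

Shift decodeTrans(int t) {
  return { t % 3 - 1,        // x shift in {-1, 0, 1}
           (t / 3) % 3 - 1,  // y shift
           t / 9 - 1 };      // z shift
}
// e.g. decodeTrans(13) == {0,0,0} (central image),
//      decodeTrans(14) == {1,0,0} (+x neighbor image).

registerComputePair() applies this per patch and accumulates the difference of the two decoded shifts into the inter-patch center offset.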

◆ registerComputeSelf()

void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 175 of file CudaComputeNonbonded.C.

References Compute::cid.

175  {
176  computesChanged = true;
177  addPatch(pid);
178  addCompute(cid, pid, pid, 0.);
179 }

◆ reSortTileLists()

void CudaComputeNonbonded::reSortTileLists ( )

Definition at line 1998 of file CudaComputeNonbonded.C.

References cudaCheck, Node::Object(), CudaTileListKernel::reSortTileLists(), Node::simParameters, and simParams.

Referenced by launchWork().

1998  {
1999  // Re-sort tile lists
2000  SimParameters *simParams = Node::Object()->simParameters;
2001  cudaCheck(cudaSetDevice(deviceID));
2002 #ifdef NAMD_HIP
2003  tileListKernel.reSortTileLists(simParams->GBISOn, simParams->CUDASOAintegrateMode, stream);
2004 #else
2005  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
2006 #endif
2007 }

◆ skipPatchesOnPe()

void CudaComputeNonbonded::skipPatchesOnPe ( )

Definition at line 798 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

798  {
799  if (rankPatches[CkMyRank()].size() == 0)
800  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
801  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
802  skipPatch(rankPatches[CkMyRank()][i]);
803  }
804  bool done = false;
805  CmiLock(lock);
806  patchesCounter -= rankPatches[CkMyRank()].size();
807  if (patchesCounter == 0) {
808  patchesCounter = getNumPatches();
809  done = true;
810  }
811  CmiUnlock(lock);
812  if (done) {
813  // Reduction must be done on masterPe
814  computeMgr->sendFinishReductions(masterPe, this);
815  }
816 }

◆ unregisterBoxesOnPe()

void CudaComputeNonbonded::unregisterBoxesOnPe ( )

Definition at line 163 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

163  {
164  if (rankPatches[CkMyRank()].size() == 0)
165  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
166  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
167  unregisterBox(rankPatches[CkMyRank()][i]);
168  }
169 }

◆ updatePatchOrder()

void CudaComputeNonbonded::updatePatchOrder ( const std::vector< CudaLocalRecord > &  data)

Definition at line 600 of file CudaComputeNonbonded.C.

600  {
601  // DMC: This vector of CudaLocalRecords doesn't have the correct number of peer records
602  std::map<int, int> pidMap;
603  for (int i=0; i < data.size(); ++i) {
604  pidMap[data[i].patchID] = i;
605  }
606 
607  std::vector<PatchRecord> copy = patches;
608 
609  for (int i=0; i < copy.size(); i++) {
610  const int new_idx = pidMap[copy[i].patchID];
611  patches[new_idx] = copy[i];
612  }
613 
614  for (int i=0; i < rankPatches.size(); i++) {
615  rankPatches[i].clear();
616  }
617  for (int i=0; i < patches.size(); ++i) {
618  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
619  }
620 
621  // Setup computes using pidMap
622  for (int i=0;i < computes.size();i++) {
623  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
624  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
625  }
626  // TODO do we need to call sendAssignPatchesOnPe with the new order?
627 }
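
The body above is a permutation-by-ID: pidMap assigns each patch ID its new slot, records are scattered from a copy of the old ordering into those slots, and rankPatches plus the compute indices are then rebuilt against the new order. A compact sketch of the reorder step; Record and reorderByID are illustrative names:

#include <map>
#include <vector>

// Illustrative stand-in for PatchRecord; only the ID matters here.
struct Record { int patchID; /* payload */ };

void reorderByID(std::vector<Record> &records,
                 const std::map<int, int> &order) {
  std::vector<Record> old = records;      // keep the original ordering
  for (const Record &r : old)
    records[order.at(r.patchID)] = r;     // scatter into the new slot
}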

The documentation for this class was generated from the following files: