NAMD
CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded: inherits Compute and ComputeNonbondedUtil.

Classes

struct  ComputeRecord
 
struct  PatchRecord
 

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbonded ()
 
void registerComputeSelf (ComputeID cid, PatchID pid)
 
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
 
void assignPatches (ComputeMgr *computeMgrIn)
 
virtual void initialize ()
 
virtual void atomUpdate ()
 
virtual int noWork ()
 
virtual void doWork ()
 
void launchWork ()
 
void finishReductions ()
 
void unregisterBoxesOnPe ()
 
void assignPatchesOnPe ()
 
void openBoxesOnPe ()
 
void skipPatchesOnPe ()
 
void finishPatchesOnPe ()
 
void finishPatchOnPe (int i)
 
void finishPatches ()
 
void messageEnqueueWork ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 
void reSortTileLists ()
 
void updatePatchOrder (const std::vector< CudaLocalRecord > &data)
 
std::vector< PatchRecord > & getPatches ()
 
SubmitReduction * getCurrentReduction ()
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
- Public Member Functions inherited from ComputeNonbondedUtil
 ComputeNonbondedUtil ()
 
 ~ComputeNonbondedUtil ()
 
void calcGBIS (nonbonded *params, GBISParamStruct *gbisParams)
 

Static Public Member Functions

static CudaNBConstants getNonbondedCoef (SimParameters *params)
 
static bool getDoTable (SimParameters *params, const bool doSlow, const bool doVirial)
 
- Static Public Member Functions inherited from ComputeNonbondedUtil
static void select (void)
 
static void submitReductionData (BigReal *, SubmitReduction *)
 
static void submitPressureProfileData (BigReal *, SubmitReduction *)
 
static BigReal square (const BigReal &x, const BigReal &y, const BigReal &z)
 
static void calc_error (nonbonded *)
 
static void calc_pair (nonbonded *)
 
static void calc_pair_energy (nonbonded *)
 
static void calc_pair_fullelect (nonbonded *)
 
static void calc_pair_energy_fullelect (nonbonded *)
 
static void calc_pair_merge_fullelect (nonbonded *)
 
static void calc_pair_energy_merge_fullelect (nonbonded *)
 
static void calc_pair_merge_fullelect_fulldisp (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_fulldisp (nonbonded *)
 
static void calc_pair_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_slow_fullelect (nonbonded *)
 
static void calc_self (nonbonded *)
 
static void calc_self_energy (nonbonded *)
 
static void calc_self_fullelect (nonbonded *)
 
static void calc_self_energy_fullelect (nonbonded *)
 
static void calc_self_merge_fullelect (nonbonded *)
 
static void calc_self_energy_merge_fullelect (nonbonded *)
 
static void calc_self_merge_fullelect_fulldisp (nonbonded *)
 
static void calc_self_energy_merge_fullelect_fulldisp (nonbonded *)
 
static void calc_self_slow_fullelect (nonbonded *)
 
static void calc_self_energy_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_fep (nonbonded *)
 
static void calc_pair_energy_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_self_energy_fep (nonbonded *)
 
static void calc_self_energy_fullelect_fep (nonbonded *)
 
static void calc_self_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_self_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_ti (nonbonded *)
 
static void calc_pair_ti (nonbonded *)
 
static void calc_pair_energy_fullelect_ti (nonbonded *)
 
static void calc_pair_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_slow_fullelect_ti (nonbonded *)
 
static void calc_self_energy_ti (nonbonded *)
 
static void calc_self_ti (nonbonded *)
 
static void calc_self_energy_fullelect_ti (nonbonded *)
 
static void calc_self_fullelect_ti (nonbonded *)
 
static void calc_self_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_self_merge_fullelect_ti (nonbonded *)
 
static void calc_self_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_self_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_les (nonbonded *)
 
static void calc_pair_energy_les (nonbonded *)
 
static void calc_pair_fullelect_les (nonbonded *)
 
static void calc_pair_energy_fullelect_les (nonbonded *)
 
static void calc_pair_merge_fullelect_les (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_les (nonbonded *)
 
static void calc_pair_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_les (nonbonded *)
 
static void calc_self_les (nonbonded *)
 
static void calc_self_energy_les (nonbonded *)
 
static void calc_self_fullelect_les (nonbonded *)
 
static void calc_self_energy_fullelect_les (nonbonded *)
 
static void calc_self_merge_fullelect_les (nonbonded *)
 
static void calc_self_energy_merge_fullelect_les (nonbonded *)
 
static void calc_self_slow_fullelect_les (nonbonded *)
 
static void calc_self_energy_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_int (nonbonded *)
 
static void calc_pair_energy_fullelect_int (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_int (nonbonded *)
 
static void calc_self_energy_int (nonbonded *)
 
static void calc_self_energy_fullelect_int (nonbonded *)
 
static void calc_self_energy_merge_fullelect_int (nonbonded *)
 
static void calc_pair_pprof (nonbonded *)
 
static void calc_pair_energy_pprof (nonbonded *)
 
static void calc_pair_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_fullelect_pprof (nonbonded *)
 
static void calc_pair_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_pprof (nonbonded *)
 
static void calc_self_energy_pprof (nonbonded *)
 
static void calc_self_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_fullelect_pprof (nonbonded *)
 
static void calc_self_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_tabener (nonbonded *)
 
static void calc_pair_energy_tabener (nonbonded *)
 
static void calc_pair_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_fullelect_tabener (nonbonded *)
 
static void calc_pair_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_tabener (nonbonded *)
 
static void calc_self_energy_tabener (nonbonded *)
 
static void calc_self_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_fullelect_tabener (nonbonded *)
 
static void calc_self_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_go (nonbonded *)
 
static void calc_pair_energy_go (nonbonded *)
 
static void calc_pair_fullelect_go (nonbonded *)
 
static void calc_pair_energy_fullelect_go (nonbonded *)
 
static void calc_pair_merge_fullelect_go (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_go (nonbonded *)
 
static void calc_pair_slow_fullelect_go (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_go (nonbonded *)
 
static void calc_self_go (nonbonded *)
 
static void calc_self_energy_go (nonbonded *)
 
static void calc_self_fullelect_go (nonbonded *)
 
static void calc_self_energy_fullelect_go (nonbonded *)
 
static void calc_self_merge_fullelect_go (nonbonded *)
 
static void calc_self_energy_merge_fullelect_go (nonbonded *)
 
static void calc_self_slow_fullelect_go (nonbonded *)
 
static void calc_self_energy_slow_fullelect_go (nonbonded *)
 

Additional Inherited Members

- Public Types inherited from ComputeNonbondedUtil
enum  {
  exclChecksumIndex, pairlistWarningIndex, electEnergyIndex, fullElectEnergyIndex,
  vdwEnergyIndex, fullVdwEnergyIndex, goNativeEnergyIndex, goNonnativeEnergyIndex,
  groLJEnergyIndex, groGaussEnergyIndex, electEnergyIndex_s, fullElectEnergyIndex_s,
  vdwEnergyIndex_s, electEnergyIndex_ti_1, fullElectEnergyIndex_ti_1, vdwEnergyIndex_ti_1,
  electEnergyIndex_ti_2, fullElectEnergyIndex_ti_2, vdwEnergyIndex_ti_2, TENSOR(virialIndex),
  TENSOR(fullElectVirialIndex), VECTOR(pairVDWForceIndex), VECTOR(pairElectForceIndex), reductionDataSize
}
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 
- Static Public Attributes inherited from ComputeNonbondedUtil
static void(* calcPair )(nonbonded *)
 
static void(* calcPairEnergy )(nonbonded *)
 
static void(* calcSelf )(nonbonded *)
 
static void(* calcSelfEnergy )(nonbonded *)
 
static void(* calcFullPair )(nonbonded *)
 
static void(* calcFullPairEnergy )(nonbonded *)
 
static void(* calcFullSelf )(nonbonded *)
 
static void(* calcFullSelfEnergy )(nonbonded *)
 
static void(* calcMergePair )(nonbonded *)
 
static void(* calcMergePairEnergy )(nonbonded *)
 
static void(* calcMergeSelf )(nonbonded *)
 
static void(* calcMergeSelfEnergy )(nonbonded *)
 
static void(* calcMergeDispPair )(nonbonded *)
 
static void(* calcMergeDispPairEnergy )(nonbonded *)
 
static void(* calcMergeDispSelf )(nonbonded *)
 
static void(* calcMergeDispSelfEnergy )(nonbonded *)
 
static void(* calcSlowPair )(nonbonded *)
 
static void(* calcSlowPairEnergy )(nonbonded *)
 
static void(* calcSlowSelf )(nonbonded *)
 
static void(* calcSlowSelfEnergy )(nonbonded *)
 
static Bool commOnly
 
static Bool fixedAtomsOn
 
static Bool qmForcesOn
 
static BigReal cutoff
 
static BigReal cutoff2
 
static float cutoff2_f
 
static BigReal dielectric_1
 
static const LJTable * ljTable = 0
 
static const Molecule * mol
 
static BigReal r2_delta
 
static BigReal r2_delta_1
 
static int rowsize
 
static int columnsize
 
static int r2_delta_exp
 
static BigReal * table_alloc = 0
 
static BigReal * table_ener = 0
 
static BigReal * table_short
 
static BigReal * table_noshort
 
static BigReal * fast_table
 
static BigReal * scor_table
 
static BigReal * slow_table
 
static BigReal * corr_table
 
static BigReal * full_table
 
static BigReal * vdwa_table
 
static BigReal * vdwb_table
 
static BigReal * r2_table
 
static int table_length
 
static BigReal scaling
 
static BigReal scale14
 
static BigReal switchOn
 
static BigReal switchOn_1
 
static BigReal switchOn2
 
static BigReal v_vdwa
 
static BigReal v_vdwb
 
static BigReal k_vdwa
 
static BigReal k_vdwb
 
static BigReal cutoff_3
 
static BigReal cutoff_6
 
static float v_vdwa_f
 
static float v_vdwb_f
 
static float k_vdwa_f
 
static float k_vdwb_f
 
static float cutoff_3_f
 
static float cutoff_6_f
 
static float switchOn_f
 
static float A6_f
 
static float B6_f
 
static float C6_f
 
static float A12_f
 
static float B12_f
 
static float C12_f
 
static BigReal c0
 
static BigReal c1
 
static BigReal c3
 
static BigReal c5
 
static BigReal c6
 
static BigReal c7
 
static BigReal c8
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchWCAOn
 
static BigReal alchVdwShiftCoeff
 
static Bool vdwForceSwitching
 
static Bool alchDecouple
 
static Bool lesOn
 
static int lesFactor
 
static BigReal lesScaling
 
static BigReal * lambda_table = 0
 
static Bool pairInteractionOn
 
static Bool pairInteractionSelf
 
static Bool pressureProfileOn
 
static int pressureProfileSlabs
 
static int pressureProfileAtomTypes
 
static BigReal pressureProfileThickness
 
static BigReal pressureProfileMin
 
static Bool accelMDOn
 
static Bool drudeNbthole
 
static BigReal ewaldcof
 
static BigReal pi_ewaldcof
 
static int vdw_switch_mode
 
static Bool goGroPair
 
static Bool goForcesOn
 
static int goMethod
 
- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 31 of file CudaComputeNonbonded.h.
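
CudaComputeNonbonded is normally created and driven by ComputeMgr rather than instantiated directly. The sketch below strings together the public members documented on this page to illustrate the intended call sequence; it is a minimal illustration rather than NAMD's actual driver code, and cid, selfCid, pairCid, patchID, pidPair, transPair, deviceID, cudaNonbondedTables and computeMgr are placeholder names assumed to exist at the call site.

    #include "CudaComputeNonbonded.h"

    // Construct one compute object per device (doStreaming selects the streaming kernel path).
    CudaComputeNonbonded *ccn =
        new CudaComputeNonbonded(cid, deviceID, cudaNonbondedTables, /*doStreaming=*/false);

    // Register the self- and pair-compute objects this GPU compute replaces.
    ccn->registerComputeSelf(selfCid, patchID);
    ccn->registerComputePair(pairCid, pidPair, transPair);  // pidPair: 2 PatchIDs, transPair: 2 ints

    // Distribute patches over the PEs sharing the device, then allocate GPU state.
    ccn->assignPatches(computeMgr);
    ccn->initialize();

    // Per-step flow (normally triggered through the Charm++ scheduler):
    // noWork() enqueues doWork() on the master PE; doWork() opens the patch
    // boxes and calls launchWork(); finishPatches() and finishReductions()
    // run once forces and virial/energy buffers are back on the host.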

Constructor & Destructor Documentation

◆ CudaComputeNonbonded()

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 39 of file CudaComputeNonbonded.C.

References cudaCheck, Compute::gbisPhase, NAMD_die(), Node::Object(), Node::simParameters, and simParams.

40  :
41 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
42 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
43 
44  cudaCheck(cudaSetDevice(deviceID));
45 
46  exclusionsByAtom = NULL;
47 
48  vdwTypes = NULL;
49  vdwTypesSize = 0;
50 
51  exclIndexMaxDiff = NULL;
52  exclIndexMaxDiffSize = 0;
53 
54  atomIndex = NULL;
55  atomIndexSize = 0;
56 
57  atomStorageSize = 0;
58 
59  // Atom and charge storage
60  atoms = NULL;
61  atomsSize = 0;
62  part = NULL;
63  partSize = 0;
64  doAlch = false;
65  lambdaWindowUpdated = false;
66 
67  // Force storage
68  h_forces = NULL;
69  h_forcesSize = 0;
70  h_forcesSlow = NULL;
71  h_forcesSlowSize = 0;
72 
73  d_forces = NULL;
74  d_forcesSize = 0;
75  d_forcesSlow = NULL;
76  d_forcesSlowSize = 0;
77 
78  // GBIS
79  intRad0H = NULL;
80  intRad0HSize = 0;
81  intRadSH = NULL;
82  intRadSHSize = 0;
83  psiSumH = NULL;
84  psiSumHSize = 0;
85  bornRadH = NULL;
86  bornRadHSize = 0;
87  dEdaSumH = NULL;
88  dEdaSumHSize = 0;
89  dHdrPrefixH = NULL;
90  dHdrPrefixHSize = 0;
91  maxShmemPerBlock = 0;
92  cudaPatches = NULL;
93 
94  atomsChangedIn = true;
95  atomsChanged = true;
96  computesChanged = true;
97 
98  forceDoneEventRecord = false;
99 
100  doNbThole = false;
101  isDrude = nullptr;
102  isDrudeSize = 0;
103  drudeAtomAlpha = nullptr;
104  drudeAtomAlphaSize = 0;
105 
107  if (simParams->pressureProfileOn) {
108  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
109  }
110 
111  if (simParams->GBISOn) gbisPhase = 3;
112 
113  doSkip = false;
114 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
void NAMD_die(const char *err_msg)
Definition: common.C:147
int gbisPhase
Definition: Compute.h:39
#define simParams
Definition: Output.C:131
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
Compute(ComputeID)
Definition: Compute.C:37

◆ ~CudaComputeNonbonded()

CudaComputeNonbonded::~CudaComputeNonbonded ( )

Definition at line 119 of file CudaComputeNonbonded.C.

References cudaCheck, deallocate_host(), and ComputeMgr::sendUnregisterBoxesOnPe().

119  {
120  // fprintf(stderr, "Pe %d calling destructor ", CkMyPe());
121  cudaCheck(cudaSetDevice(deviceID));
122  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
123  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
124  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
125  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
126  if (part != NULL) deallocate_host<char>(&part);
127  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
128  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
129  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
130  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
131 
132  // GBIS
133  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
134  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
135  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
136  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
137  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
138  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
139 
140  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
141 
142  // Drude/NbThole
143  if (isDrude != nullptr) deallocate_host(&isDrude);
144  if (drudeAtomAlpha != nullptr) deallocate_host(&drudeAtomAlpha);
145 
146  if (patches.size() > 0) {
147  deallocate_host<VirialEnergy>(&h_virialEnergy);
148  deallocate_device<VirialEnergy>(&d_virialEnergy);
149  cudaCheck(cudaStreamDestroy(stream));
150  cudaCheck(cudaEventDestroy(forceDoneEvent));
151  CmiDestroyLock(lock);
152  if (reductionGpuOffload) {
153  delete reductionGpuOffload;
154  }
155  if (reductionGpuResident) {
156  delete reductionGpuResident;
157  }
158  }
159 
160  // NOTE: unregistering happens in [sync] -entry method
161  // fprintf(stderr, "unregistering patches on pe %d\n", CkMyPe());
162  computeMgr->sendUnregisterBoxesOnPe(pes, this);
163 
164 }
void deallocate_host(T **pp)
Definition: CudaUtils.h:387
void sendUnregisterBoxesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1885
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

Member Function Documentation

◆ assignPatches()

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn)

Definition at line 397 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), NAMD_bug(), PatchMap::Object(), ComputePmeCUDAMgr::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), and Compute::setNumPatches().

Referenced by ComputeMgr::createComputes().

397  {
398  // Remove duplicate patches
399  std::sort(patches.begin(), patches.end());
400  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
401  patches.erase(last, patches.end());
402  // Set number of patches
403  setNumPatches(patches.size());
404  masterPe = CkMyPe();
405  computeMgr = computeMgrIn;
406  // Start patch counter
407  patchesCounter = getNumPatches();
408  // Patch ID map
409  std::map<PatchID, int> pidMap;
410 #if 1
411  //-------------------------------------------------------
412  // Copied in from ComputeNonbondedCUDA::assignPatches()
413  //-------------------------------------------------------
414 
415  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
416  int numPesOnNodeSharingDevice = 0;
417  int masterIndex = -1;
418  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
419  int pe = deviceCUDA->getPesSharingDevice(i);
420  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
421  if ( CkNodeOf(pe) == CkMyNode() ) {
422  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
423  }
424  }
425 
426  std::vector<int> count(patches.size(), 0);
427  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
428  std::vector<int> rankpcount(CkMyNodeSize(), 0);
429  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
430 
431  PatchMap* patchMap = PatchMap::Object();
432 
433  int unassignedpatches = patches.size();
434 
435  for (int i=0;i < patches.size(); ++i) {
436  patches[i].pe = -1;
437  }
438 
439  // assign if home pe and build table of natural proxies
440  for (int i=0;i < patches.size(); ++i) {
441  int pid = patches[i].patchID;
442  // homePe = PE where the patch currently resides
443  int homePe = patchMap->node(pid);
444  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
445  int pe = pesOnNodeSharingDevice[j];
446  // If homePe is sharing this device, assign this patch to homePe
447  if ( pe == homePe ) {
448  patches[i].pe = pe;
449  --unassignedpatches;
450  pcount[j] += 1;
451  }
452  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
453  table[i*numPesOnNodeSharingDevice + j] = 1;
454  }
455  }
456  // Assign this patch to homePe, if it resides on the same node
457  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
458  patches[i].pe = homePe;
459  --unassignedpatches;
460  rankpcount[CkRankOf(homePe)] += 1;
461  }
462  }
463  // assign if only one pe has a required proxy
464  for (int i=0; i < patches.size(); ++i) {
465  int pid = patches[i].patchID;
466  if ( patches[i].pe != -1 ) continue;
467  int c = 0;
468  int lastj;
469  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
470  if ( table[i*numPesOnNodeSharingDevice + j] ) {
471  ++c;
472  lastj = j;
473  }
474  }
475  count[i] = c;
476  if ( c == 1 ) {
477  patches[i].pe = pesOnNodeSharingDevice[lastj];
478  --unassignedpatches;
479  pcount[lastj] += 1;
480  }
481  }
482  int assignj = 0;
483  while ( unassignedpatches ) {
484  int i;
485  for (i=0;i < patches.size(); ++i) {
486  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
487  int pid = patches[i].patchID;
488  // patch_record &pr = patchRecords[pid];
489  if ( patches[i].pe != -1 ) continue;
490  patches[i].pe = pesOnNodeSharingDevice[assignj];
491  --unassignedpatches;
492  pcount[assignj] += 1;
493  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
494  break;
495  }
496  if (i < patches.size() ) continue; // start search again
497  for ( i=0;i < patches.size(); ++i ) {
498  int pid = patches[i].patchID;
499  // patch_record &pr = patchRecords[pid];
500  if ( patches[i].pe != -1 ) continue;
501  if ( count[i] ) continue;
502  patches[i].pe = pesOnNodeSharingDevice[assignj];
503  --unassignedpatches;
504  pcount[assignj] += 1;
505  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
506  break;
507  }
508  if ( i < patches.size() ) continue; // start search again
509  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
510  }
511 
512  // For each rank, list of patches
513  rankPatches.resize(CkMyNodeSize());
514  for (int i=0; i < patches.size(); ++i) {
515  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
516  pidMap[patches[i].patchID] = i;
517  }
518 
519  // for ( int i=0; i < patches.size(); ++i ) {
520  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
521  // }
522 #else
523  // For each rank, list of patches
524  rankPatches.resize(CkMyNodeSize());
525  // For each rank, list of home patch IDs
526  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
527  for (int i=0;i < CkMyNodeSize();i++) {
528  int pe = CkNodeFirst(CkMyNode()) + i;
529  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
530  }
531  std::vector<int> proxyPatchPes;
532  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
533  //--------------------------------------------------------
534  // Build a list of PEs to avoid
535  std::vector<int> pesToAvoid;
536 #if 0
537  // Avoid other GPUs' master PEs
538  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
539  int pe = deviceCUDA->getMasterPeForDeviceID(i);
540  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
541  }
542  // Avoid PEs that are involved in PME
543  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
544  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
545  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
546  }
547  // Set counters of avoidable PEs to high numbers
548  for (int i=0;i < pesToAvoid.size();i++) {
549  int pe = pesToAvoid[i];
550  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
551  }
552 #endif
553  // Avoid master Pe somewhat
554  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
555  //--------------------------------------------------------
556  for (int i=0;i < patches.size();i++) {
557  //if I had this datastructure "patches" on the GPU, I could use it
558  PatchID pid = patches[i].patchID;
559  int pe = findHomePatchPe(rankHomePatchIDs, pid);
560  if (pe == -1) {
561  // Patch not present on this node => try finding a ProxyPatch
562  findProxyPatchPes(proxyPatchPes, pid);
563  if (proxyPatchPes.size() == 0) {
564  // No ProxyPatch => create one on rank that has the least ProxyPatches
565  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
566  pe = CkNodeFirst(CkMyNode()) + rank;
567  peProxyPatchCounter[rank]++;
568  } else {
569  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
570  // this is done by finding the entry with minimum peProxyPatchCounter -value
571  // Find minimum among proxyPatchPes, i.e., find the minimum among
572  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
573  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
574  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
575  // - proxyPatchPes.begin();
576  // pe = proxyPatchPes[pppi];
577  int minCounter = (1 << 30);
578  for (int j=0;j < proxyPatchPes.size();j++) {
579  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
580  pe = proxyPatchPes[j];
581  minCounter = peProxyPatchCounter[CkRankOf(pe)];
582  }
583  }
584  if (pe == -1)
585  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
586  peProxyPatchCounter[CkRankOf(pe)]++;
587  }
588  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
589  // Found home patch on this node, but it's on PE that should be avoided => find a new one
590  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
591  pe = CkNodeFirst(CkMyNode()) + rank;
592  peProxyPatchCounter[rank]++;
593  }
594  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
595  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
596  rankPatches[CkRankOf(pe)].push_back(i);
597  pidMap[pid] = i;
598  }
599 
600  delete [] rankHomePatchIDs;
601 #endif
602  // Setup computes using pidMap
603  for (int i=0;i < computes.size();i++) {
604  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
605  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
606  }
607  for (int i=0;i < CkMyNodeSize();i++) {
608  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
609  }
610  computeMgr->sendAssignPatchesOnPe(pes, this);
611 }
void setNumPatches(int n)
Definition: Compute.h:52
int getDeviceCount()
Definition: DeviceCUDA.h:124
static PatchMap * Object()
Definition: PatchMap.h:27
void basePatchIDList(int pe, PatchIDList &)
Definition: PatchMap.C:454
static PatchMap * ObjectOnPe(int pe)
Definition: PatchMap.h:28
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
static ComputePmeCUDAMgr * Object()
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int getMasterPeForDeviceID(int deviceID)
Definition: DeviceCUDA.C:532
int getPesSharingDevice(const int i)
Definition: DeviceCUDA.h:139
int getNumPatches()
Definition: Compute.h:53
int getNumPesSharingDevice()
Definition: DeviceCUDA.h:138
int32 PatchID
Definition: NamdTypes.h:287
void findProxyPatchPes(std::vector< int > &proxyPatchPes, PatchID pid)
void sendAssignPatchesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1785
int findHomePatchPe(PatchIDList *rankPatchIDs, PatchID pid)

◆ assignPatchesOnPe()

void CudaComputeNonbonded::assignPatchesOnPe ( )

Definition at line 335 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), NAMD_bug(), PatchMap::node(), PatchMap::Object(), and ResizeArray< Elem >::size().

Referenced by ComputeMgr::recvAssignPatchesOnPe().

335  {
336  if (rankPatches[CkMyRank()].size() == 0)
337  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
338 
339  // calculate priority rank of local home patch within pe
340  {
341  PatchMap* patchMap = PatchMap::Object();
342  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
343  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
344  int i = rankPatches[CkMyRank()][k];
345  int pid = patches[i].patchID;
346  int homePe = patchMap->node(pid);
347  if ( CkNodeOf(homePe) == CkMyNode() ) {
348  int2 pid_index;
349  pid_index.x = pid;
350  pid_index.y = i;
351  homePatchByRank[CkRankOf(homePe)].add(pid_index);
352  }
353  }
354  for ( int i=0; i<CkMyNodeSize(); ++i ) {
356  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
357  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
358  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
359  int index = homePatchByRank[i][j].y;
360  patches[index].reversePriorityRankInPe = j + masterBoost;
361  }
362  }
363  }
364 
365  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
366  assignPatch(rankPatches[CkMyRank()][i]);
367  }
368 }
static PatchMap * Object()
Definition: PatchMap.h:27
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int node(int pid) const
Definition: PatchMap.h:114

◆ atomUpdate()

void CudaComputeNonbonded::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 702 of file CudaComputeNonbonded.C.

702  {
703  atomsChangedIn = true;
704 }

◆ doWork()

void CudaComputeNonbonded::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 1184 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, Flags::doEnergy, Flags::doFullElectrostatics, Flags::doMinimize, Flags::doNonbonded, Flags::doVirial, Compute::gbisPhase, NAMD_bug(), Node::Object(), openBoxesOnPe(), CudaTileListKernel::prepareBuffers(), ComputeMgr::sendOpenBoxesOnPe(), Node::simParameters, and simParams.

1184  {
1185  if (CkMyPe() != masterPe)
1186  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
1187 
1188  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
1189  // atomsChangedIn can be set to true by any Pe
1190  // atomsChanged can only be set by masterPe
1191  // This use of double variables makes sure we don't have a race condition
1192  // it seems like it's important to have the masterPe call doWork() first
1193  atomsChanged = atomsChangedIn;
1194  atomsChangedIn = false;
1195 
1197 
1198  if (patches.size() == 0) return; // No work to do
1199 
1200  // Take the flags from the first patch on this Pe
1201  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
1202  // these flags are probably wrong.
1203  Flags &flags = patches[0].patch->flags;
1204 
1205  doSlow = flags.doFullElectrostatics;
1206  doEnergy = flags.doEnergy;
1207  doVirial = flags.doVirial;
1208  doAlch = simParams->alchOn;
1209  doMinimize = flags.doMinimize;
1210 
1211  if (flags.doNonbonded) {
1212 
1213  if (simParams->GBISOn) {
1214  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
1215  }
1216 
1217  if (!simParams->GBISOn || gbisPhase == 1) {
1218  if ( computesChanged ) {
1219  updateComputes();
1220  }
1221  if (atomsChanged) {
1222  // Re-calculate patch atom numbers and storage
1223  updatePatches();
1224  reSortDone = false;
1225  }
1226  reallocateArrays();
1227 #ifdef NODEGROUP_FORCE_REGISTER
1228  if (simParams->CUDASOAintegrate && simParams->useDeviceMigration && atomsChanged) {
1229  tileListKernel.prepareBuffers(atomStorageSize, patches.size(), cudaPatches, stream);
1230  updatePatchRecord();
1231  }
1232 #endif // NODEGROUP_FORCE_REGISTER
1233  }
1234 
1235  // Open boxes on Pes and launch work to masterPe
1236  if(params->CUDASOAintegrate){
1237  if(!atomsChanged) this->openBoxesOnPe();
1238  }
1239  else computeMgr->sendOpenBoxesOnPe(pes, this);
1240 
1241  } else {
1242  // No work to do, skip
1243  skip();
1244  }
1245 
1246 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
void prepareBuffers(int atomStorageSizeIn, int numPatchesIn, const CudaPatchRecord *h_cudaPatches, cudaStream_t stream)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int doEnergy
Definition: PatchTypes.h:20
int doFullElectrostatics
Definition: PatchTypes.h:23
int doNonbonded
Definition: PatchTypes.h:22
int gbisPhase
Definition: Compute.h:39
#define simParams
Definition: Output.C:131
int doVirial
Definition: PatchTypes.h:21
void sendOpenBoxesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1838
int doMinimize
Definition: PatchTypes.h:26

◆ finishPatches()

void CudaComputeNonbonded::finishPatches ( )

Definition at line 1963 of file CudaComputeNonbonded.C.

References cudaCheck, SimParameters::CUDASOAintegrate, finishPatchesOnPe(), and ComputeMgr::sendFinishPatchesOnPe().

1963  {
1964  if(params->CUDASOAintegrate){
1965  if (atomsChanged || doEnergy || doVirial) cudaCheck(cudaStreamSynchronize(stream));
1966  this->finishPatchesOnPe();
1967  }
1968  else {
1969  computeMgr->sendFinishPatchesOnPe(pes, this);
1970  }
1971 }
void sendFinishPatchesOnPe(std::vector< int > &pes, CudaComputeNonbonded *c)
Definition: ComputeMgr.C:1811
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ finishPatchesOnPe()

void CudaComputeNonbonded::finishPatchesOnPe ( )

Definition at line 1951 of file CudaComputeNonbonded.C.

Referenced by finishPatches(), and ComputeMgr::recvFinishPatchesOnPe().

1951  {
1952  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1953 }

◆ finishPatchOnPe()

void CudaComputeNonbonded::finishPatchOnPe ( int  i)

Definition at line 1958 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

1958  {
1959  std::vector<int> v(1, i);
1960  finishSetOfPatchesOnPe(v);
1961 }

◆ finishReductions()

void CudaComputeNonbonded::finishReductions ( )

Definition at line 1704 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyElec_s, VirialEnergy::energyElec_ti_1, VirialEnergy::energyElec_ti_2, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energySlow_s, VirialEnergy::energySlow_ti_1, VirialEnergy::energySlow_ti_2, VirialEnergy::energyVdw, VirialEnergy::energyVdw_s, VirialEnergy::energyVdw_ti_1, VirialEnergy::energyVdw_ti_2, SimParameters::GBISOn, getCurrentReduction(), CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_F, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_ELECT_ENERGY_SLOW_F, REDUCTION_ELECT_ENERGY_SLOW_TI_1, REDUCTION_ELECT_ENERGY_SLOW_TI_2, REDUCTION_ELECT_ENERGY_TI_1, REDUCTION_ELECT_ENERGY_TI_2, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_LJ_ENERGY_F, REDUCTION_LJ_ENERGY_TI_1, REDUCTION_LJ_ENERGY_TI_2, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

1704  {
1705  if (CkMyPe() != masterPe)
1706  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1707 
1708  SubmitReduction *reduction = getCurrentReduction();
1709 
1710  // fprintf(stderr, "PE[%d]: Nbond finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1711  if (!doSkip) {
1712 
1713  if (doStreaming && (doVirial || doEnergy)) {
1714  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1715  if (!forceDoneEventRecord)
1716  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1717  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1718  forceDoneEventRecord = false;
1719  }
1720 
1721  if (doVirial) {
1722  // if(params->CUDASOAintegrate) cudaCheck(cudaStreamSynchronize(stream));
1723  Tensor virialTensor;
1724  virialTensor.xx = h_virialEnergy->virial[0];
1725  virialTensor.xy = h_virialEnergy->virial[1];
1726  virialTensor.xz = h_virialEnergy->virial[2];
1727  virialTensor.yx = h_virialEnergy->virial[3];
1728  virialTensor.yy = h_virialEnergy->virial[4];
1729  virialTensor.yz = h_virialEnergy->virial[5];
1730  virialTensor.zx = h_virialEnergy->virial[6];
1731  virialTensor.zy = h_virialEnergy->virial[7];
1732  virialTensor.zz = h_virialEnergy->virial[8];
1733  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1734  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1735  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1736  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1737  if (doSlow) {
1738  Tensor virialTensor;
1739  virialTensor.xx = h_virialEnergy->virialSlow[0];
1740  virialTensor.xy = h_virialEnergy->virialSlow[1];
1741  virialTensor.xz = h_virialEnergy->virialSlow[2];
1742  virialTensor.yx = h_virialEnergy->virialSlow[3];
1743  virialTensor.yy = h_virialEnergy->virialSlow[4];
1744  virialTensor.yz = h_virialEnergy->virialSlow[5];
1745  virialTensor.zx = h_virialEnergy->virialSlow[6];
1746  virialTensor.zy = h_virialEnergy->virialSlow[7];
1747  virialTensor.zz = h_virialEnergy->virialSlow[8];
1748  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1749  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1750  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1751  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1752  }
1753  }
1754  if (doEnergy) {
1755  // if (doSlow)
1756 
1757  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1758  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1759  reduction->item(REDUCTION_LJ_ENERGY_F) += h_virialEnergy->energyVdw_s;
1760  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((params->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1761  reduction->item(REDUCTION_ELECT_ENERGY_F) += h_virialEnergy->energyElec_s;
1762 
1763  //Reduce values for TI
1764  reduction->item(REDUCTION_LJ_ENERGY_TI_1) += h_virialEnergy->energyVdw_ti_1;
1765  reduction->item(REDUCTION_LJ_ENERGY_TI_2) += h_virialEnergy->energyVdw_ti_2;
1766  reduction->item(REDUCTION_ELECT_ENERGY_TI_1) += h_virialEnergy->energyElec_ti_1;
1767  reduction->item(REDUCTION_ELECT_ENERGY_TI_2) += h_virialEnergy->energyElec_ti_2;
1768 
1769  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1770  if (doSlow){
1771  reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1772  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_F) += h_virialEnergy->energySlow_s;
1773  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_1) += h_virialEnergy->energySlow_ti_1;
1774  reduction->item(REDUCTION_ELECT_ENERGY_SLOW_TI_2) += h_virialEnergy->energySlow_ti_2;
1775  //fprintf(stderr, "NB h_virialEnergy->energySlow %lf\n", h_virialEnergy->energySlow);
1776  }
1777  }
1778 
1779  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1780  }
1781  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1782 
1783  reduction->submit();
1784  // Reset flags
1785  doSkip = false;
1786  computesChanged = false;
1787 }
BigReal zy
Definition: Tensor.h:19
SubmitReduction * getCurrentReduction()
BigReal xz
Definition: Tensor.h:17
virtual void submit(void)=0
#define ADD_TENSOR_OBJECT(R, RL, D)
Definition: ReductionMgr.h:44
BigReal & item(int i)
Definition: ReductionMgr.h:336
BigReal yz
Definition: Tensor.h:18
void NAMD_bug(const char *err_msg)
Definition: common.C:195
BigReal yx
Definition: Tensor.h:18
BigReal xx
Definition: Tensor.h:17
double virialSlow[9]
BigReal zz
Definition: Tensor.h:19
Definition: Tensor.h:15
BigReal xy
Definition: Tensor.h:17
BigReal yy
Definition: Tensor.h:18
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
BigReal zx
Definition: Tensor.h:19

◆ gbisP2PatchReady()

void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 277 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

277  {
278  CmiLock(lock);
279  Compute::gbisP2PatchReady(pid, seq);
280  CmiUnlock(lock);
281 }
virtual void gbisP2PatchReady(PatchID, int seq)
Definition: Compute.C:96

◆ gbisP3PatchReady()

void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 283 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

283  {
284  CmiLock(lock);
285  Compute::gbisP3PatchReady(pid, seq);
286  CmiUnlock(lock);
287 }
virtual void gbisP3PatchReady(PatchID, int seq)
Definition: Compute.C:106

◆ getCurrentReduction()

SubmitReduction * CudaComputeNonbonded::getCurrentReduction ( )

Definition at line 2425 of file CudaComputeNonbonded.C.

References Node::Object(), Node::simParameters, and simParams.

Referenced by finishReductions(), and launchWork().

2425  {
2427  return (simParams->CUDASOAintegrate) ? reductionGpuResident :
2428  reductionGpuOffload;
2429 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
#define simParams
Definition: Output.C:131

◆ getDoTable()

bool CudaComputeNonbonded::getDoTable ( SimParameters *  params,
const bool  doSlow,
const bool  doVirial 
)
static

Definition at line 2410 of file CudaComputeNonbonded.C.

References simParams.

2410  {
2411  // There is additional logic in SimParameters.C which guards against unsupported force fields
2412  // This should only be used for performance heuristics
2413  bool doTable = simParams->useCUDANonbondedForceTable;
2414 
2415  // DMC: I found the doSlow case is faster with force tables, so overriding setting
2416  // TODO This should be reevaluated for future architectures
2417  doTable = doTable || doSlow;
2418  // Direct math does not support virial+slow
2419  // Redundant but necessary for correctness so doing it explicitly
2420  doTable = doTable || (doSlow && doVirial);
2421 
2422  return doTable;
2423 }
#define simParams
Definition: Output.C:131
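
A hypothetical launch-site sketch (not a member of this class): combining getDoTable() with getNonbondedCoef(), documented next, to pick between the tabulated and direct-math nonbonded paths. simParams, doSlow and doVirial are assumed to already be in scope.

    SimParameters *simParams = Node::Object()->simParameters;
    const bool doTable = CudaComputeNonbonded::getDoTable(simParams, doSlow, doVirial);
    CudaNBConstants nbConstants;
    if (!doTable) {
      // The direct-math path needs the precomputed switching/Ewald coefficients.
      nbConstants = CudaComputeNonbonded::getNonbondedCoef(simParams);
    }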

◆ getNonbondedCoef()

CudaNBConstants CudaComputeNonbonded::getNonbondedCoef ( SimParameters *  params)
static

Definition at line 2377 of file CudaComputeNonbonded.C.

References ComputeNonbondedUtil::c1, ComputeNonbondedUtil::cutoff, ComputeNonbondedUtil::cutoff2, CudaNBConstants::e_0, CudaNBConstants::e_0_slow, CudaNBConstants::e_1, CudaNBConstants::e_2, CudaNBConstants::ewald_0, CudaNBConstants::ewald_1, CudaNBConstants::ewald_2, CudaNBConstants::ewald_3_slow, ComputeNonbondedUtil::ewaldcof, CudaNBConstants::lj_0, CudaNBConstants::lj_1, CudaNBConstants::lj_2, CudaNBConstants::lj_3, CudaNBConstants::lj_4, CudaNBConstants::lj_5, ComputeNonbondedUtil::pi_ewaldcof, simParams, CudaNBConstants::slowScale, ComputeNonbondedUtil::switchOn, and ComputeNonbondedUtil::switchOn2.

2377  {
2378  const float cutoff = ComputeNonbondedUtil::cutoff;
2379  const float cutoff2 = ComputeNonbondedUtil::cutoff2;
2380  const float cutoffInv = 1.0f / cutoff;
2381  const float cutoff2Inv = 1.0f / cutoff2;
2382  const float scutoff = ComputeNonbondedUtil::switchOn;
2383  const float scutoff2 = ComputeNonbondedUtil::switchOn2;
2384  const float scutoff2Inv = 1.0f / scutoff2;
2385  const float scutoff_denom = ComputeNonbondedUtil::c1;
2388  const float slowScale = ((float) simParams->fullElectFrequency) / simParams->nonbondedFrequency;
2389 
2390  CudaNBConstants c;
2391  c.lj_0 = scutoff_denom * cutoff2 - 3.0f * scutoff2 * scutoff_denom;
2392  c.lj_1 = scutoff_denom * 2.0f;
2393  c.lj_2 = scutoff_denom * -12.0f;
2394  c.lj_3 = 12.0f * scutoff_denom * scutoff2;
2395  c.lj_4 = cutoff2;
2396  c.lj_5 = scutoff2;
2397  c.e_0 = cutoff2Inv * cutoffInv;
2398  c.e_0_slow = cutoff2Inv * cutoffInv * (1.0f - slowScale);
2399  c.e_1 = cutoff2Inv;
2400  c.e_2 = cutoffInv;
2401  c.ewald_0 = ewaldcof;
2402  c.ewald_1 = pi_ewaldcof;
2403  c.ewald_2 = ewaldcof * ewaldcof;
2404  c.ewald_3_slow = ewaldcof * ewaldcof * ewaldcof * slowScale;
2405  c.slowScale = slowScale;
2406 
2407  return c;
2408 }
float ewald_3_slow
Definition: CudaUtils.h:607
#define simParams
Definition: Output.C:131
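
The lj_0 ... lj_5 entries appear to encode the CHARMM-style van der Waals switching function. The sketch below, assuming c1 is the usual switching denominator 1/(cutoff2 - switchOn2)^3 (so that lj_0 + lj_1*r2 = c1*(cutoff2 + 2*r2 - 3*switchOn2)), shows how the switching factor could be reconstructed from the returned constants; it is an interpretation aid, not the kernel's actual code path.

    // S(r2) = c1 * (cutoff2 - r2)^2 * (cutoff2 + 2*r2 - 3*switchOn2)
    //       = (cutoff2 - r2)^2 * (lj_0 + lj_1*r2)   for switchOn2 < r2 < cutoff2
    float switchFactor(const CudaNBConstants &c, float r2) {
      if (r2 <= c.lj_5) return 1.0f;   // inside switchOn2: no switching yet
      if (r2 >= c.lj_4) return 0.0f;   // beyond cutoff2: interaction switched off
      const float d = c.lj_4 - r2;     // cutoff2 - r2
      return d * d * (c.lj_0 + c.lj_1 * r2);
    }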

◆ getPatches()

std::vector<PatchRecord>& CudaComputeNonbonded::getPatches ( )
inline

Definition at line 329 of file CudaComputeNonbonded.h.

329 { return patches; }

◆ initialize()

void CudaComputeNonbonded::initialize ( void  )
virtual

Reimplemented from Compute.

Definition at line 642 of file CudaComputeNonbonded.C.

References ATOMIC_BINS, cudaCheck, SimParameters::CUDASOAintegrateMode, deviceCUDA, SimParameters::drudeNbtholeCut, SimParameters::drudeOn, DeviceCUDA::getDeviceIndex(), PatchMap::numPatches(), PatchMap::Object(), Node::Object(), ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, REDUCTIONS_GPURESIDENT, Node::simParameters, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

642  {
643  if (patches.size() > 0) {
644  npairlists = 0;
645  // Allocate CUDA version of patches
646  cudaCheck(cudaSetDevice(deviceID));
647  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
648 
649  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
650  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
651 
652  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
653  */
654  cudaDeviceProp props;
655  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); //Gets properties of 'deviceID device'
656  maxShmemPerBlock = props.sharedMemPerBlock;
657 
658 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
659  int leastPriority, greatestPriority;
660  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
661  int priority = (doStreaming) ? leastPriority : greatestPriority;
662  // int priority = greatestPriority;
663  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
664 #else
665  cudaCheck(cudaStreamCreate(&stream));
666 #endif
667  cudaCheck(cudaEventCreate(&forceDoneEvent));
668 
669  buildExclusions();
670 
671  lock = CmiCreateLock();
672  params = Node::Object()->simParameters;
673  if (params->CUDASOAintegrateMode) {
674  reductionGpuResident = ReductionMgr::Object()->willSubmit(REDUCTIONS_GPURESIDENT);
675  }
676  reductionGpuOffload = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
677  doNbThole = params->drudeOn && (params->drudeNbtholeCut > 0.0);
678 
679 #ifdef NODEGROUP_FORCE_REGISTER
680  int devInd = deviceCUDA->getDeviceIndex();
681  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
682  PatchData *patchData = cpdata.ckLocalBranch();
683  patchData->devData[devInd].nbond_stream = stream;
684  // Fill auxiliary arrays for merging forces here
685  PatchMap* map = PatchMap::Object();
686  int nGlobalPatches = map->numPatches();
687  allocate_host<bool>( &(patchData->devData[devInd].h_hasPatches), nGlobalPatches);
688  memset(patchData->devData[devInd].h_hasPatches, 0, sizeof(bool)*nGlobalPatches);
689 
690  for(int i = 0; i < patches.size(); i++){
691  patchData->devData[devInd].h_hasPatches[patches[i].patchID] = true;
692  }
693  allocate_device<bool>( &(patchData->devData[devInd].d_hasPatches), nGlobalPatches);
694  copy_HtoD_sync<bool>( patchData->devData[devInd].h_hasPatches, patchData->devData[devInd].d_hasPatches, nGlobalPatches);
695 #endif
696  }
697 }
static Node * Object()
Definition: Node.h:86
void buildExclusions()
Definition: CompressPsf.C:1197
static PatchMap * Object()
Definition: PatchMap.h:27
SimParameters * simParameters
Definition: Node.h:181
Bool CUDASOAintegrateMode
SubmitReduction * willSubmit(int setID, int size=-1)
Definition: ReductionMgr.C:368
static ReductionMgr * Object(void)
Definition: ReductionMgr.h:290
BigReal drudeNbtholeCut
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int numPatches(void) const
Definition: PatchMap.h:59
#define ATOMIC_BINS
Definition: CudaUtils.h:70
int priority(void)
Definition: Compute.h:65
int getDeviceIndex()
Definition: DeviceCUDA.h:166
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ launchWork()

void CudaComputeNonbonded::launchWork ( )

Definition at line 1248 of file CudaComputeNonbonded.C.

References CudaComputeNonbonded::PatchRecord::atomStart, ResizeArray< Elem >::begin(), cudaCheck, ComputeNonbondedUtil::cutoff, ResizeArray< Elem >::end(), Patch::flags, Compute::gbisPhase, getCurrentReduction(), CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), Patch::getPatchID(), CudaComputeNonbondedKernel::getPatchReadyQueue(), PatchMap::homePatchList(), SubmitReduction::item(), NAMD_bug(), NAMD_EVENT_START, NAMD_EVENT_STOP, CudaComputeNonbonded::PatchRecord::numAtoms, PatchMap::Object(), Node::Object(), HomePatchElem::patch, CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, reSortTileLists(), Flags::savePairlists, Node::simParameters, simParams, Flags::step, and Flags::usePairlists.

Referenced by openBoxesOnPe(), and ComputeMgr::recvLaunchWork().

1248  {
1249  if (CkMyPe() != masterPe)
1250  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
1251 
1252  beforeForceCompute = CkWallTimer();
1253  cudaCheck(cudaSetDevice(deviceID));
1255 
1256  // So, it seems like PE's are invoking the same object, however the patches[i] is borked on the masterPe
1257 
1258  // When I get here, it seems like compAtoms are not set for all Pes? How can that be?
1259 
1260  //execute only during GBIS phase 1, or if not using GBIS
1261  if (!simParams->GBISOn || gbisPhase == 1) {
1262 
1263  if ( atomsChanged || computesChanged ) {
1264  // Invalidate pair lists
1265  pairlistsValid = false;
1266  pairlistTolerance = 0.0f;
1267  }
1268 
1269  // Get maximum atom movement and patch tolerance
1270  float maxAtomMovement = 0.0f;
1271  float maxPatchTolerance = 0.0f;
1272  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
1273  // Update pair-list cutoff
1274  Flags &flags = patches[0].patch->flags;
1275  savePairlists = false;
1276  usePairlists = false;
1277  if ( flags.savePairlists ) {
1278  savePairlists = true;
1279  usePairlists = true;
1280  } else if ( flags.usePairlists ) {
1281  if ( ! pairlistsValid || ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1282  SubmitReduction *reduction = getCurrentReduction();
1283  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1284  } else {
1285  usePairlists = true;
1286  }
1287  }
1288  if ( ! usePairlists ) {
1289  pairlistsValid = false;
1290  }
1291  float plcutoff = cutoff;
1292  if ( savePairlists ) {
1293  pairlistsValid = true;
1294  pairlistTolerance = 2. * maxPatchTolerance;
1295  plcutoff += pairlistTolerance;
1296  }
1297  plcutoff2 = plcutoff * plcutoff;
1298 
1299  // fprintf(stderr, "STEP[%d] plcutoff = %f listTolerance = %f save = %d maxPatchTolerance = %f maxAtomMovement = %f plvalid = %d flags.use = %d use = %d\n",
1300  // flags.step, plcutoff, pairlistTolerance, savePairlists, maxPatchTolerance, maxAtomMovement, pairlistsValid, flags.usePairlists, usePairlists);
1301  if(savePairlists || !usePairlists){
1302  reSortDone = false; // Ensures pairlist resorting if doPairlist
1303  }
1304 
1305  // if (atomsChanged)
1306  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1307  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1308 
1309  } // if (!simParams->GBISOn || gbisPhase == 1)
1310 
1311  // Calculate PME & VdW forces
1312  if (!simParams->GBISOn || gbisPhase == 1) {
1313  doForce();
1314  if (doStreaming) {
1315  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1316  patchReadyQueueLen = tileListKernel.getNumPatches();
1317  patchReadyQueueNext = 0;
1318  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1319  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1320  int* emptyPatches = tileListKernel.getEmptyPatches();
1321  for (int i=0;i < numEmptyPatches;i++) {
1322  PatchRecord &pr = patches[emptyPatches[i]];
1323  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1324  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1325  patchReadyQueue[i] = emptyPatches[i];
1326  }
1327  if (patchReadyQueueLen != patches.size())
1328  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1329  }
1330  }
1331 
1332  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1333  // before calling doGBISphase1().
1334  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1335  // In this code path doGBISphase1() is called in forceDone()
1336  forceDoneSetCallback();
1337  return;
1338  }
1339 
1340  // GBIS Phases
1341  if (simParams->GBISOn) {
1342  if (gbisPhase == 1) {
1343  doGBISphase1();
1344  } else if (gbisPhase == 2) {
1345  doGBISphase2();
1346  } else if (gbisPhase == 3) {
1347  doGBISphase3();
1348  }
1349  }
1350 
1351  // Copy forces to host
1352  if (!simParams->GBISOn || gbisPhase == 3) {
1353  if (!doStreaming) {
1354 #ifdef NODEGROUP_FORCE_REGISTER
1355  if(!simParams->CUDASOAintegrate || (atomsChanged && !simParams->useDeviceMigration)){
1356  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1357  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1358  }
1359 #else
1360  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1361  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1362 #endif
1363 
1364  }
1365  }
1366 
1367  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1368 
1369  NAMD_EVENT_START(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1370  // For GBIS, energies are ready after phase 2
1371  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1372  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1373  d_forces, d_forcesSlow, d_virialEnergy, stream);
1374  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1375 
1376  NAMD_EVENT_STOP(1, NamdProfileEvent::REDUCE_VIRIAL_ENERGY);
1377 
1378  }
1379 
1380  if(simParams->CUDASOAintegrate && ((savePairlists || !usePairlists)) && !atomsChanged) reSortTileLists();
1381 
1382  // Setup call back
1383  forceDoneSetCallback();
1384 
1385 #if 0
1386  cudaCheck(cudaStreamSynchronize(stream));
1387  PatchMap *map = PatchMap::Object();
1388  HomePatchElem *elem;
1389  for(elem = map->homePatchList()->begin(); elem != map->homePatchList()->end(); elem++){
1390  if(elem->patch->getPatchID() == 7) break;
1391  }
1392  if(elem->patch->flags.step == 11){
1393  // it would be good to know from which patch these atoms are...
1394  fprintf(stderr, "CudaNonbonded data\n");
1395  for(int i = 0 ; i < atomStorageSize; i++){
1396  fprintf(stderr, "pos[%d] = %lf, %lf, %lf, %lf | (%f %f %f) (%f %f %f) \n",
1397  i, atoms[i].x, atoms[i].y, atoms[i].z, atoms[i].q,
1398  // for some reason, we needed to set the positions
1399  h_forces[i].x, h_forces[i].y, h_forces[i].z,
1400  h_forcesSlow[i].x, h_forcesSlow[i].y, h_forcesSlow[i].z);
1401  }
1402  }
1403 #endif
1404 
1405 }
static Node * Object()
Definition: Node.h:86
SubmitReduction * getCurrentReduction()
#define NAMD_EVENT_STOP(eon, id)
HomePatch * patch
Definition: HomePatchList.h:23
int numAtoms
Definition: CudaRecord.h:17
static PatchMap * Object()
Definition: PatchMap.h:27
SimParameters * simParameters
Definition: Node.h:181
int savePairlists
Definition: PatchTypes.h:41
BigReal & item(int i)
Definition: ReductionMgr.h:336
HomePatchList * homePatchList()
Definition: PatchMap.C:438
int usePairlists
Definition: PatchTypes.h:40
Flags flags
Definition: Patch.h:128
#define NAMD_EVENT_START(eon, id)
void NAMD_bug(const char *err_msg)
Definition: common.C:195
void reduceVirialEnergy(CudaTileListKernel &tlKernel, const int atomStorageSize, const bool doEnergy, const bool doVirial, const bool doSlow, const bool doGBIS, float4 *d_forces, float4 *d_forcesSlow, VirialEnergy *d_virialEnergy, cudaStream_t stream)
PatchID getPatchID() const
Definition: Patch.h:114
int gbisPhase
Definition: Compute.h:39
#define simParams
Definition: Output.C:131
iterator begin(void)
Definition: ResizeArray.h:36
iterator end(void)
Definition: ResizeArray.h:37
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
int atomStart
Definition: CudaRecord.h:16
int step
Definition: PatchTypes.h:16

◆ messageEnqueueWork()

void CudaComputeNonbonded::messageEnqueueWork ( )

Definition at line 1097 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

1097  {
1098  if (masterPe != CkMyPe())
1099  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
1101 }

◆ noWork()

int CudaComputeNonbonded::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 1149 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

1149  {
1150  // Simply enqueue doWork on masterPe and return "no work"
1151  computeMgr->sendMessageEnqueueWork(masterPe, this);
1152  return 1;
1153 }
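The idiom here is that noWork() is invoked by the scheduler on whichever PE the compute was enqueued on; the override forwards the actual work to the master PE through ComputeMgr and returns 1 so the calling PE treats the compute as having nothing to do. A minimal sketch of that forward-and-decline pattern follows; MockCompute and the queue standing in for ComputeMgr messaging are illustrative only.

#include <functional>
#include <queue>

struct MockCompute {
  std::queue<std::function<void()>>* masterQueue;  // stands in for ComputeMgr messaging

  // Called by the scheduler on the enqueuing PE.
  int noWork() {
    masterQueue->push([this] { doWork(); });  // re-dispatch the real work to the master PE
    return 1;                                 // report "no work" here; doWork() is skipped locally
  }
  void doWork() { /* launch GPU work from the master PE */ }
};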

◆ openBoxesOnPe()

void CudaComputeNonbonded::openBoxesOnPe ( )

Definition at line 1103 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, Compute::getNumPatches(), launchWork(), NAMD_bug(), NAMD_EVENT_START, NAMD_EVENT_STOP, Node::Object(), and ComputeMgr::sendLaunchWork().

Referenced by doWork(), and ComputeMgr::recvOpenBoxesOnPe().

1103  {
1104  if (rankPatches[CkMyRank()].size() == 0)
1105  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
1106 
1107  NAMD_EVENT_START(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1108 
1109 #ifdef NODEGROUP_FORCE_REGISTER
1110  if( Node::Object()->simParameters->CUDASOAintegrate && !atomsChanged) {
1111  // opens boxes to make sure NAMD won't complain
1112  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1113  int j = rankPatches[CkMyRank()][i];
1114  patches[j].positionBox->open();
1115  }
1116  if(masterPe == CkMyPe()) {
1117  // we need to open boxes here...
1118  if(params->CUDASOAintegrate){
1119  if(!atomsChanged) this->launchWork();
1120  }
1121  else computeMgr->sendLaunchWork(masterPe, this);
1122  }
1123  }
1124  else{
1125 #endif
1126  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
1127  openBox(rankPatches[CkMyRank()][i]);
1128  }
1129  bool done = false;
1130  CmiLock(lock);
1131  patchesCounter -= rankPatches[CkMyRank()].size();
1132  if (patchesCounter == 0) {
1133  patchesCounter = getNumPatches();
1134  done = true;
1135  }
1136  CmiUnlock(lock);
1137  if (done) {
1138  if(params->CUDASOAintegrate){
1139  if(!atomsChanged) this->launchWork();
1140  }
1141  else computeMgr->sendLaunchWork(masterPe, this);
1142  }
1143 #ifdef NODEGROUP_FORCE_REGISTER
1144  }
1145 #endif
1146  NAMD_EVENT_STOP(1, NamdProfileEvent::COMPUTE_NONBONDED_OPEN_BOXES);
1147 }
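The pattern above (also used by skipPatchesOnPe() and finishPatchesOnPe()) is a lock-protected countdown: every rank opens or skips its own share of patch boxes and decrements patchesCounter; the rank that drives the counter to zero resets it and triggers launchWork() on the master PE. A minimal sketch of that countdown, under stated assumptions (std::mutex stands in for the CmiLock, and the callback stands in for sendLaunchWork()):

#include <mutex>
#include <vector>
#include <functional>

struct PatchBarrier {
  int patchesCounter;   // starts at the total number of patches
  int numPatches;
  std::mutex lock;

  // Returns true only on the rank that handled the last outstanding patches.
  bool arrive(int patchesHandledByThisRank) {
    std::lock_guard<std::mutex> guard(lock);
    patchesCounter -= patchesHandledByThisRank;
    if (patchesCounter == 0) {
      patchesCounter = numPatches;   // reset for the next step
      return true;
    }
    return false;
  }
};

void openBoxesOnRank(PatchBarrier& barrier, const std::vector<int>& myPatches,
                     const std::function<void()>& launchOnMaster) {
  // ... open each patch box owned by this rank here (omitted in this sketch) ...
  if (barrier.arrive(static_cast<int>(myPatches.size())))
    launchOnMaster();                // only the last rank to finish launches work
}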

◆ patchReady()

void CudaComputeNonbonded::patchReady ( PatchID  pid,
int  doneMigration,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 260 of file CudaComputeNonbonded.C.

References SimParameters::CUDASOAintegrate, NAMD_bug(), Compute::patchReady(), and SimParameters::useDeviceMigration.

260  {
261  // DMC: This isn't needed in the CUDASOAintegrate scheme. All it does is call atomUpdate(),
262  // which is already called in Sequencer::runComputeObjects_CUDA.
263  // The functionality of updatePatch() was moved into updatePatches().
264  if (!(params->CUDASOAintegrate && params->useDeviceMigration)) {
265  if (doneMigration) {
266  int i = findPid(pid);
267  if (i == -1)
268  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
269  updatePatch(i);
270  }
271  CmiLock(lock);
272  Compute::patchReady(pid, doneMigration, seq);
273  CmiUnlock(lock);
274  }
275 }

◆ registerComputePair()

void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID pid,
int *  trans 
)

Definition at line 198 of file CudaComputeNonbonded.C.

References PatchMap::center(), Compute::cid, PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

198  {
199  computesChanged = true;
200  addPatch(pid[0]);
201  addPatch(pid[1]);
202  PatchMap* patchMap = PatchMap::Object();
203  int t1 = trans[0];
204  int t2 = trans[1];
205  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
206  offset.x += (t1%3-1) - (t2%3-1);
207  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
208  offset.z += (t1/9-1) - (t2/9-1);
209  addCompute(cid, pid[0], pid[1], offset);
210 }
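The arithmetic on trans[0] and trans[1] above decodes a periodic-image transform index: a value t in [0, 27) encodes a shift of -1, 0 or +1 cells along each axis as t%3-1, (t/3)%3-1 and t/9-1, and the stored offset is the patch-center separation corrected by the difference of the two images' shifts. A small illustrative decoder (Shift and decodeTrans are hypothetical names, not NAMD API):

struct Shift { int x, y, z; };

// Decode a 3x3x3 image-transform index into per-axis shifts in {-1, 0, +1}.
inline Shift decodeTrans(int t) {
  return { t % 3 - 1, (t / 3) % 3 - 1, t / 9 - 1 };
}

// Example: decodeTrans(13) == {0, 0, 0}  (home image)
//          decodeTrans(14) == {+1, 0, 0} (image shifted one cell in +x)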

◆ registerComputeSelf()

void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 188 of file CudaComputeNonbonded.C.

References Compute::cid.

188  {
189  computesChanged = true;
190  addPatch(pid);
191  addCompute(cid, pid, pid, 0.);
192 }

◆ reSortTileLists()

void CudaComputeNonbonded::reSortTileLists ( )

Definition at line 2018 of file CudaComputeNonbonded.C.

References cudaCheck, Node::Object(), CudaTileListKernel::reSortTileLists(), Node::simParameters, and simParams.

Referenced by launchWork().

2018  {
2019  // Re-sort tile lists
2021  cudaCheck(cudaSetDevice(deviceID));
2022 #ifdef NAMD_HIP
2023  tileListKernel.reSortTileLists(simParams->GBISOn, simParams->CUDASOAintegrateMode, stream);
2024 #else
2025  tileListKernel.reSortTileLists(simParams->GBISOn, stream);
2026 #endif
2027 }

◆ skipPatchesOnPe()

void CudaComputeNonbonded::skipPatchesOnPe ( )

Definition at line 814 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

814  {
815  if (rankPatches[CkMyRank()].size() == 0)
816  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
817  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
818  skipPatch(rankPatches[CkMyRank()][i]);
819  }
820  bool done = false;
821  CmiLock(lock);
822  patchesCounter -= rankPatches[CkMyRank()].size();
823  if (patchesCounter == 0) {
824  patchesCounter = getNumPatches();
825  done = true;
826  }
827  CmiUnlock(lock);
828  if (done) {
829  // Reduction must be done on masterPe
830  computeMgr->sendFinishReductions(masterPe, this);
831  }
832 }

◆ unregisterBoxesOnPe()

void CudaComputeNonbonded::unregisterBoxesOnPe ( )

Definition at line 176 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

176  {
177  if (rankPatches[CkMyRank()].size() == 0)
178  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
179  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
180  unregisterBox(rankPatches[CkMyRank()][i]);
181  }
182 }

◆ updatePatchOrder()

void CudaComputeNonbonded::updatePatchOrder ( const std::vector< CudaLocalRecord > &  data)

Definition at line 613 of file CudaComputeNonbonded.C.

613  {
614  // DMC: This vector of CudaLocalRecords doesn't have the correct number of peer records.
615  std::map<int, int> pidMap;
616  for (int i=0; i < data.size(); ++i) {
617  pidMap[data[i].patchID] = i;
618  }
619 
620  std::vector<PatchRecord> copy = patches;
621 
622  for (int i=0; i < copy.size(); i++) {
623  const int new_idx = pidMap[copy[i].patchID];
624  patches[new_idx] = copy[i];
625  }
626 
627  for (int i=0; i < rankPatches.size(); i++) {
628  rankPatches[i].clear();
629  }
630  for (int i=0; i < patches.size(); ++i) {
631  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
632  }
633 
634  // Setup computes using pidMap
635  for (int i=0;i < computes.size();i++) {
636  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
637  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
638  }
639  // TODO do we need to call sendAssignPatchesOnPe with the new order?
640 }
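The body above is a permutation by key: build a map from patchID to its index in the externally supplied CudaLocalRecord list, scatter the existing PatchRecord entries into that order, rebuild the per-rank patch lists, and remap each compute's patch indices. A minimal sketch of the scatter step (PatchRec and reorderByPatchID are illustrative stand-ins, not NAMD API):

#include <map>
#include <vector>

struct PatchRec { int patchID; /* ... */ };

void reorderByPatchID(std::vector<PatchRec>& patches,
                      const std::vector<int>& newOrderPatchIDs) {
  std::map<int, int> pidMap;                       // patchID -> new index
  for (int i = 0; i < (int)newOrderPatchIDs.size(); ++i)
    pidMap[newOrderPatchIDs[i]] = i;

  std::vector<PatchRec> old = patches;             // scatter old entries into the new order
  for (const PatchRec& rec : old)
    patches[pidMap.at(rec.patchID)] = rec;
}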

The documentation for this class was generated from the following files: