NAMD
CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded:
[diagram: CudaComputeNonbonded inherits from Compute and ComputeNonbondedUtil]

Classes

struct  ComputeRecord
 
struct  PatchRecord
 

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 
 ~CudaComputeNonbonded ()
 
void registerComputeSelf (ComputeID cid, PatchID pid)
 
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
 
void assignPatches (ComputeMgr *computeMgrIn)
 
virtual void initialize ()
 
virtual void atomUpdate ()
 
virtual int noWork ()
 
virtual void doWork ()
 
void launchWork ()
 
void finishReductions ()
 
void unregisterBoxesOnPe ()
 
void assignPatchesOnPe ()
 
void openBoxesOnPe ()
 
void skipPatchesOnPe ()
 
void finishPatchesOnPe ()
 
void finishPatchOnPe (int i)
 
void messageEnqueueWork ()
 
virtual void patchReady (PatchID, int doneMigration, int seq)
 
virtual void gbisP2PatchReady (PatchID, int seq)
 
virtual void gbisP3PatchReady (PatchID, int seq)
 
- Public Member Functions inherited from Compute
 Compute (ComputeID)
 
int type ()
 
virtual ~Compute ()
 
void setNumPatches (int n)
 
int getNumPatches ()
 
int sequence (void)
 
int priority (void)
 
int getGBISPhase (void)
 
- Public Member Functions inherited from ComputeNonbondedUtil
 ComputeNonbondedUtil ()
 
 ~ComputeNonbondedUtil ()
 
void calcGBIS (nonbonded *params, GBISParamStruct *gbisParams)
 

Additional Inherited Members

- Public Types inherited from ComputeNonbondedUtil
enum  {
  exclChecksumIndex, pairlistWarningIndex, electEnergyIndex, fullElectEnergyIndex,
  vdwEnergyIndex, goNativeEnergyIndex, goNonnativeEnergyIndex, groLJEnergyIndex,
  groGaussEnergyIndex, electEnergyIndex_s, fullElectEnergyIndex_s, vdwEnergyIndex_s,
  electEnergyIndex_ti_1, fullElectEnergyIndex_ti_1, vdwEnergyIndex_ti_1, electEnergyIndex_ti_2,
  fullElectEnergyIndex_ti_2, vdwEnergyIndex_ti_2, TENSOR(virialIndex), TENSOR(fullElectVirialIndex),
  VECTOR(pairVDWForceIndex), VECTOR(pairElectForceIndex), reductionDataSize
}
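
The TENSOR and VECTOR entries above are macro-generated: each reserves a block of consecutive slots in the reduction buffer (nine for a 3x3 tensor, three for a vector), so reductionDataSize comes out as the total buffer length. A minimal sketch of the idea; the macro bodies below are an illustrative assumption, not the actual NAMD definitions:

    // Illustrative only: one way such index-block macros can be written.
    #define TENSOR(name) name, name##_end = name + 8   // 9 slots: xx..zz
    #define VECTOR(name) name, name##_end = name + 2   // 3 slots: x, y, z

    enum {
      electEnergyIndex,             // one scalar slot
      TENSOR(virialIndex),          // reserves virialIndex .. virialIndex+8
      VECTOR(pairVDWForceIndex),    // reserves three consecutive slots
      reductionDataSize             // total number of BigReal slots
    };

Doxygen cannot expand these macros, which is why the index shows TENSOR(...) and VECTOR(...) rather than the individual enumerators.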
 
- Static Public Member Functions inherited from ComputeNonbondedUtil
static void select (void)
 
static void submitReductionData (BigReal *, SubmitReduction *)
 
static void submitPressureProfileData (BigReal *, SubmitReduction *)
 
static BigReal square (const BigReal &x, const BigReal &y, const BigReal &z)
 
static void calc_error (nonbonded *)
 
static void calc_pair (nonbonded *)
 
static void calc_pair_energy (nonbonded *)
 
static void calc_pair_fullelect (nonbonded *)
 
static void calc_pair_energy_fullelect (nonbonded *)
 
static void calc_pair_merge_fullelect (nonbonded *)
 
static void calc_pair_energy_merge_fullelect (nonbonded *)
 
static void calc_pair_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_slow_fullelect (nonbonded *)
 
static void calc_self (nonbonded *)
 
static void calc_self_energy (nonbonded *)
 
static void calc_self_fullelect (nonbonded *)
 
static void calc_self_energy_fullelect (nonbonded *)
 
static void calc_self_merge_fullelect (nonbonded *)
 
static void calc_self_energy_merge_fullelect (nonbonded *)
 
static void calc_self_slow_fullelect (nonbonded *)
 
static void calc_self_energy_slow_fullelect (nonbonded *)
 
static void calc_pair_energy_fep (nonbonded *)
 
static void calc_pair_energy_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_self_energy_fep (nonbonded *)
 
static void calc_self_energy_fullelect_fep (nonbonded *)
 
static void calc_self_energy_merge_fullelect_fep (nonbonded *)
 
static void calc_self_energy_slow_fullelect_fep (nonbonded *)
 
static void calc_pair_energy_ti (nonbonded *)
 
static void calc_pair_ti (nonbonded *)
 
static void calc_pair_energy_fullelect_ti (nonbonded *)
 
static void calc_pair_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_merge_fullelect_ti (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_slow_fullelect_ti (nonbonded *)
 
static void calc_self_energy_ti (nonbonded *)
 
static void calc_self_ti (nonbonded *)
 
static void calc_self_energy_fullelect_ti (nonbonded *)
 
static void calc_self_fullelect_ti (nonbonded *)
 
static void calc_self_energy_merge_fullelect_ti (nonbonded *)
 
static void calc_self_merge_fullelect_ti (nonbonded *)
 
static void calc_self_energy_slow_fullelect_ti (nonbonded *)
 
static void calc_self_slow_fullelect_ti (nonbonded *)
 
static void calc_pair_les (nonbonded *)
 
static void calc_pair_energy_les (nonbonded *)
 
static void calc_pair_fullelect_les (nonbonded *)
 
static void calc_pair_energy_fullelect_les (nonbonded *)
 
static void calc_pair_merge_fullelect_les (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_les (nonbonded *)
 
static void calc_pair_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_les (nonbonded *)
 
static void calc_self_les (nonbonded *)
 
static void calc_self_energy_les (nonbonded *)
 
static void calc_self_fullelect_les (nonbonded *)
 
static void calc_self_energy_fullelect_les (nonbonded *)
 
static void calc_self_merge_fullelect_les (nonbonded *)
 
static void calc_self_energy_merge_fullelect_les (nonbonded *)
 
static void calc_self_slow_fullelect_les (nonbonded *)
 
static void calc_self_energy_slow_fullelect_les (nonbonded *)
 
static void calc_pair_energy_int (nonbonded *)
 
static void calc_pair_energy_fullelect_int (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_int (nonbonded *)
 
static void calc_self_energy_int (nonbonded *)
 
static void calc_self_energy_fullelect_int (nonbonded *)
 
static void calc_self_energy_merge_fullelect_int (nonbonded *)
 
static void calc_pair_pprof (nonbonded *)
 
static void calc_pair_energy_pprof (nonbonded *)
 
static void calc_pair_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_fullelect_pprof (nonbonded *)
 
static void calc_pair_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_pair_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_pprof (nonbonded *)
 
static void calc_self_energy_pprof (nonbonded *)
 
static void calc_self_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_fullelect_pprof (nonbonded *)
 
static void calc_self_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_merge_fullelect_pprof (nonbonded *)
 
static void calc_self_slow_fullelect_pprof (nonbonded *)
 
static void calc_self_energy_slow_fullelect_pprof (nonbonded *)
 
static void calc_pair_tabener (nonbonded *)
 
static void calc_pair_energy_tabener (nonbonded *)
 
static void calc_pair_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_fullelect_tabener (nonbonded *)
 
static void calc_pair_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_pair_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_tabener (nonbonded *)
 
static void calc_self_energy_tabener (nonbonded *)
 
static void calc_self_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_fullelect_tabener (nonbonded *)
 
static void calc_self_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_merge_fullelect_tabener (nonbonded *)
 
static void calc_self_slow_fullelect_tabener (nonbonded *)
 
static void calc_self_energy_slow_fullelect_tabener (nonbonded *)
 
static void calc_pair_go (nonbonded *)
 
static void calc_pair_energy_go (nonbonded *)
 
static void calc_pair_fullelect_go (nonbonded *)
 
static void calc_pair_energy_fullelect_go (nonbonded *)
 
static void calc_pair_merge_fullelect_go (nonbonded *)
 
static void calc_pair_energy_merge_fullelect_go (nonbonded *)
 
static void calc_pair_slow_fullelect_go (nonbonded *)
 
static void calc_pair_energy_slow_fullelect_go (nonbonded *)
 
static void calc_self_go (nonbonded *)
 
static void calc_self_energy_go (nonbonded *)
 
static void calc_self_fullelect_go (nonbonded *)
 
static void calc_self_energy_fullelect_go (nonbonded *)
 
static void calc_self_merge_fullelect_go (nonbonded *)
 
static void calc_self_energy_merge_fullelect_go (nonbonded *)
 
static void calc_self_slow_fullelect_go (nonbonded *)
 
static void calc_self_energy_slow_fullelect_go (nonbonded *)
 
- Public Attributes inherited from Compute
const ComputeID cid
 
LDObjHandle ldObjHandle
 
LocalWorkMsg *const localWorkMsg
 
- Static Public Attributes inherited from ComputeNonbondedUtil
static void(* calcPair )(nonbonded *)
 
static void(* calcPairEnergy )(nonbonded *)
 
static void(* calcSelf )(nonbonded *)
 
static void(* calcSelfEnergy )(nonbonded *)
 
static void(* calcFullPair )(nonbonded *)
 
static void(* calcFullPairEnergy )(nonbonded *)
 
static void(* calcFullSelf )(nonbonded *)
 
static void(* calcFullSelfEnergy )(nonbonded *)
 
static void(* calcMergePair )(nonbonded *)
 
static void(* calcMergePairEnergy )(nonbonded *)
 
static void(* calcMergeSelf )(nonbonded *)
 
static void(* calcMergeSelfEnergy )(nonbonded *)
 
static void(* calcSlowPair )(nonbonded *)
 
static void(* calcSlowPairEnergy )(nonbonded *)
 
static void(* calcSlowSelf )(nonbonded *)
 
static void(* calcSlowSelfEnergy )(nonbonded *)
 
static Bool commOnly
 
static Bool fixedAtomsOn
 
static Bool qmForcesOn
 
static BigReal cutoff
 
static BigReal cutoff2
 
static float cutoff2_f
 
static BigReal dielectric_1
 
static const LJTable * ljTable = 0
 
static const Molecule * mol
 
static BigReal r2_delta
 
static BigReal r2_delta_1
 
static int rowsize
 
static int columnsize
 
static int r2_delta_exp
 
static BigReal * table_alloc = 0
 
static BigReal * table_ener = 0
 
static BigReal * table_short
 
static BigReal * table_noshort
 
static BigReal * fast_table
 
static BigReal * scor_table
 
static BigReal * slow_table
 
static BigReal * corr_table
 
static BigReal * full_table
 
static BigReal * vdwa_table
 
static BigReal * vdwb_table
 
static BigReal * r2_table
 
static int table_length
 
static BigReal scaling
 
static BigReal scale14
 
static BigReal switchOn
 
static BigReal switchOn_1
 
static BigReal switchOn2
 
static BigReal v_vdwa
 
static BigReal v_vdwb
 
static BigReal k_vdwa
 
static BigReal k_vdwb
 
static BigReal cutoff_3
 
static BigReal cutoff_6
 
static float v_vdwa_f
 
static float v_vdwb_f
 
static float k_vdwa_f
 
static float k_vdwb_f
 
static float cutoff_3_f
 
static float cutoff_6_f
 
static float switchOn_f
 
static float A6_f
 
static float B6_f
 
static float C6_f
 
static float A12_f
 
static float B12_f
 
static float C12_f
 
static BigReal c0
 
static BigReal c1
 
static BigReal c3
 
static BigReal c5
 
static BigReal c6
 
static BigReal c7
 
static BigReal c8
 
static Bool alchFepOn
 
static Bool alchThermIntOn
 
static Bool alchWCAOn
 
static BigReal alchVdwShiftCoeff
 
static Bool vdwForceSwitching
 
static Bool alchDecouple
 
static Bool lesOn
 
static int lesFactor
 
static BigReal lesScaling
 
static BigReal * lambda_table = 0
 
static Bool pairInteractionOn
 
static Bool pairInteractionSelf
 
static Bool pressureProfileOn
 
static int pressureProfileSlabs
 
static int pressureProfileAtomTypes
 
static BigReal pressureProfileThickness
 
static BigReal pressureProfileMin
 
static Bool accelMDOn
 
static Bool drudeNbthole
 
static BigReal ewaldcof
 
static BigReal pi_ewaldcof
 
static int vdw_switch_mode
 
static Bool goGroPair
 
static Bool goForcesOn
 
static int goMethod
 
- Protected Member Functions inherited from Compute
void enqueueWork ()
 
- Protected Attributes inherited from Compute
int computeType
 
int basePriority
 
int gbisPhase
 
int gbisPhasePriority [3]
 

Detailed Description

Definition at line 27 of file CudaComputeNonbonded.h.
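
No further description is given in the header. As orientation, here is a hypothetical sketch of the call order implied by the member documentation below; the variable names and the surrounding ComputeMgr/Charm++ setup are assumptions, not NAMD code:

    // Hypothetical lifecycle sketch: only the CudaComputeNonbonded calls are
    // taken from this page; the variables (computeID, deviceID, tables,
    // computeMgr, patch IDs) are assumed context supplied by NAMD's startup.
    CudaComputeNonbonded* c =
        new CudaComputeNonbonded(computeID, deviceID, tables, /*doStreaming=*/true);

    // Registration: one self compute per patch, one pair compute per pair of
    // neighboring patches (trans[] encodes each patch's periodic-image shift).
    c->registerComputeSelf(selfCid, pid);
    c->registerComputePair(pairCid, pidPair, trans);   // pidPair[2], trans[2]

    // Wiring: distribute the registered patches over the PEs sharing this
    // device, then allocate the CUDA buffers, stream, and force-done event.
    c->assignPatches(computeMgr);
    c->initialize();

    // Per timestep, driven by the scheduler: noWork() forwards doWork() to the
    // master PE; doWork() opens the patch boxes and triggers launchWork(),
    // which runs the kernels; finishReductions() submits energies/virials.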

Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 37 of file CudaComputeNonbonded.C.

References cudaCheck, SimParameters::GBISOn, Compute::gbisPhase, NAMD_die(), Node::Object(), SimParameters::pressureProfileOn, Node::simParameters, and simParams.

38  :
39 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
40 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
41 
42  cudaCheck(cudaSetDevice(deviceID));
43 
44  exclusionsByAtom = NULL;
45 
46  vdwTypes = NULL;
47  vdwTypesSize = 0;
48 
49  exclIndexMaxDiff = NULL;
50  exclIndexMaxDiffSize = 0;
51 
52  atomIndex = NULL;
53  atomIndexSize = 0;
54 
55  atomStorageSize = 0;
56 
57  // Atom and charge storage
58  atoms = NULL;
59  atomsSize = 0;
60 
61  // Force storage
62  h_forces = NULL;
63  h_forcesSize = 0;
64  h_forcesSlow = NULL;
65  h_forcesSlowSize = 0;
66 
67  d_forces = NULL;
68  d_forcesSize = 0;
69  d_forcesSlow = NULL;
70  d_forcesSlowSize = 0;
71 
72  // GBIS
73  intRad0H = NULL;
74  intRad0HSize = 0;
75  intRadSH = NULL;
76  intRadSHSize = 0;
77  psiSumH = NULL;
78  psiSumHSize = 0;
79  bornRadH = NULL;
80  bornRadHSize = 0;
81  dEdaSumH = NULL;
82  dEdaSumHSize = 0;
83  dHdrPrefixH = NULL;
84  dHdrPrefixHSize = 0;
85  maxShmemPerBlock = 0;
86  cudaPatches = NULL;
87 
88  atomsChangedIn = true;
89  atomsChanged = true;
90  computesChanged = true;
91 
92  forceDoneEventRecord = false;
93 
94  SimParameters *simParams = Node::Object()->simParameters;
95  if (simParams->pressureProfileOn) {
96  NAMD_die("CudaComputeNonbonded, pressure profile not supported");
97  }
98 
99  if (simParams->GBISOn) gbisPhase = 3;
100 
101  doSkip = false;
102 }
CudaComputeNonbonded::~CudaComputeNonbonded ( )

Definition at line 107 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

107  {
108  cudaCheck(cudaSetDevice(deviceID));
109  if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
110  if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
111  if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
112  if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
113  if (h_forces != NULL) deallocate_host<float4>(&h_forces);
114  if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
115  if (d_forces != NULL) deallocate_device<float4>(&d_forces);
116  if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
117 
118  // GBIS
119  if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
120  if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
121  if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
122  if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
123  if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
124  if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
125 
126  if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
127 
128  if (patches.size() > 0) {
129  deallocate_host<VirialEnergy>(&h_virialEnergy);
130  deallocate_device<VirialEnergy>(&d_virialEnergy);
131  cudaCheck(cudaStreamDestroy(stream));
132  cudaCheck(cudaEventDestroy(forceDoneEvent));
133  CmiDestroyLock(lock);
134  delete reduction;
135  }
136 
137  // NOTE: unregistering happens in [sync] -entry method
138  computeMgr->sendUnregisterBoxesOnPe(pes, this);
139 
140 }

Member Function Documentation

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn)

Definition at line 364 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), NAMD_bug(), PatchMap::Object(), ComputePmeCUDAMgr::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), Compute::setNumPatches(), and sort.

Referenced by ComputeMgr::createComputes().

364  {
365  // Remove duplicate patches
366  std::sort(patches.begin(), patches.end());
367  std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
368  patches.erase(last, patches.end());
369  // Set number of patches
370  setNumPatches(patches.size());
371  masterPe = CkMyPe();
372  computeMgr = computeMgrIn;
373  // Start patch counter
374  patchesCounter = getNumPatches();
375  // Patch ID map
376  std::map<PatchID, int> pidMap;
377 #if 1
378  //-------------------------------------------------------
379  // Copied in from ComputeNonbondedCUDA::assignPatches()
380  //-------------------------------------------------------
381 
382  std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
383  int numPesOnNodeSharingDevice = 0;
384  int masterIndex = -1;
385  for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
386  int pe = deviceCUDA->getPesSharingDevice(i);
387  if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
388  if ( CkNodeOf(pe) == CkMyNode() ) {
389  pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
390  }
391  }
392 
393  std::vector<int> count(patches.size(), 0);
394  std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
395  std::vector<int> rankpcount(CkMyNodeSize(), 0);
396  std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
397 
398  PatchMap* patchMap = PatchMap::Object();
399 
400  int unassignedpatches = patches.size();
401 
402  for (int i=0;i < patches.size(); ++i) {
403  patches[i].pe = -1;
404  }
405 
406  // assign if home pe and build table of natural proxies
407  for (int i=0;i < patches.size(); ++i) {
408  int pid = patches[i].patchID;
409  // homePe = PE where the patch currently resides
410  int homePe = patchMap->node(pid);
411  for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
412  int pe = pesOnNodeSharingDevice[j];
413  // If homePe is sharing this device, assign this patch to homePe
414  if ( pe == homePe ) {
415  patches[i].pe = pe;
416  --unassignedpatches;
417  pcount[j] += 1;
418  }
419  if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
420  table[i*numPesOnNodeSharingDevice + j] = 1;
421  }
422  }
423  // Assign this patch to homePe, if it resides on the same node
424  if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
425  patches[i].pe = homePe;
426  --unassignedpatches;
427  rankpcount[CkRankOf(homePe)] += 1;
428  }
429  }
430  // assign if only one pe has a required proxy
431  for (int i=0; i < patches.size(); ++i) {
432  int pid = patches[i].patchID;
433  if ( patches[i].pe != -1 ) continue;
434  int c = 0;
435  int lastj;
436  for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
437  if ( table[i*numPesOnNodeSharingDevice + j] ) {
438  ++c;
439  lastj = j;
440  }
441  }
442  count[i] = c;
443  if ( c == 1 ) {
444  patches[i].pe = pesOnNodeSharingDevice[lastj];
445  --unassignedpatches;
446  pcount[lastj] += 1;
447  }
448  }
449  int assignj = 0;
450  while ( unassignedpatches ) {
451  int i;
452  for (i=0;i < patches.size(); ++i) {
453  if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
454  int pid = patches[i].patchID;
455  // patch_record &pr = patchRecords[pid];
456  if ( patches[i].pe != -1 ) continue;
457  patches[i].pe = pesOnNodeSharingDevice[assignj];
458  --unassignedpatches;
459  pcount[assignj] += 1;
460  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
461  break;
462  }
463  if (i < patches.size() ) continue; // start search again
464  for ( i=0;i < patches.size(); ++i ) {
465  int pid = patches[i].patchID;
466  // patch_record &pr = patchRecords[pid];
467  if ( patches[i].pe != -1 ) continue;
468  if ( count[i] ) continue;
469  patches[i].pe = pesOnNodeSharingDevice[assignj];
470  --unassignedpatches;
471  pcount[assignj] += 1;
472  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
473  break;
474  }
475  if ( i < patches.size() ) continue; // start search again
476  if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
477  }
478 
479  // For each rank, list of patches
480  rankPatches.resize(CkMyNodeSize());
481  for (int i=0; i < patches.size(); ++i) {
482  rankPatches[CkRankOf(patches[i].pe)].push_back(i);
483  pidMap[patches[i].patchID] = i;
484  }
485 
486  // for ( int i=0; i < patches.size(); ++i ) {
487  // CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
488  // }
489 
490 /*
491  slavePes = new int[CkMyNodeSize()];
492  slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
493  numSlaves = 0;
494  for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
495  int pe = pesOnNodeSharingDevice[j];
496  int rank = pe - CkNodeFirst(CkMyNode());
497  // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
498  // CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
499  if ( pe == CkMyPe() ) continue;
500  if ( ! pcount[j] && ! rankpcount[rank] ) continue;
501  rankpcount[rank] = 0; // skip in rank loop below
502  slavePes[numSlaves] = pe;
503  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
504  ++numSlaves;
505  }
506  for ( int j=0; j<CkMyNodeSize(); ++j ) {
507  int pe = CkNodeFirst(CkMyNode()) + j;
508  // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
509  // CkMyPe(),j,pe,rankpcount[j]);
510  if ( ! rankpcount[j] ) continue;
511  if ( pe == CkMyPe() ) continue;
512  slavePes[numSlaves] = pe;
513  computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
514  ++numSlaves;
515  }
516 */
517 
518 #else
519  // For each rank, list of patches
520  rankPatches.resize(CkMyNodeSize());
521  // For each rank, list of home patch IDs
522  PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
523  for (int i=0;i < CkMyNodeSize();i++) {
524  int pe = CkNodeFirst(CkMyNode()) + i;
525  PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
526  }
527  std::vector<int> proxyPatchPes;
528  std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
529  //--------------------------------------------------------
530  // Build a list of PEs to avoid
531  std::vector<int> pesToAvoid;
532 #if 0
533  // Avoid other GPUs' master PEs
534  for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
535  int pe = deviceCUDA->getMasterPeForDeviceID(i);
536  if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
537  }
538  // Avoid PEs that are involved in PME
539  ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
540  for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
541  if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
542  }
543  // Set counters of avoidable PEs to high numbers
544  for (int i=0;i < pesToAvoid.size();i++) {
545  int pe = pesToAvoid[i];
546  peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);
547  }
548 #endif
549  // Avoid master Pe somewhat
550  peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
551  //--------------------------------------------------------
552  for (int i=0;i < patches.size();i++) {
553  PatchID pid = patches[i].patchID;
554  int pe = findHomePatchPe(rankHomePatchIDs, pid);
555  if (pe == -1) {
556  // Patch not present on this node => try finding a ProxyPatch
557  findProxyPatchPes(proxyPatchPes, pid);
558  if (proxyPatchPes.size() == 0) {
559  // No ProxyPatch => create one on rank that has the least ProxyPatches
560  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
561  pe = CkNodeFirst(CkMyNode()) + rank;
562  peProxyPatchCounter[rank]++;
563  } else {
564  // Choose ProxyPatch, try to avoid masterPe (current Pe) and Pes that already have a ProxyPatch,
565  // this is done by finding the entry with minimum peProxyPatchCounter -value
566  // Find minimum among proxyPatchPes, i.e., find the minimum among
567  // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
568  // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
569  // [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
570  // - proxyPatchPes.begin();
571  // pe = proxyPatchPes[pppi];
572  int minCounter = (1 << 30);
573  for (int j=0;j < proxyPatchPes.size();j++) {
574  if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
575  pe = proxyPatchPes[j];
576  minCounter = peProxyPatchCounter[CkRankOf(pe)];
577  }
578  }
579  if (pe == -1)
580  NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
581  peProxyPatchCounter[CkRankOf(pe)]++;
582  }
583  } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
584  // Found home patch on this node, but it's on PE that should be avoided => find a new one
585  int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
586  pe = CkNodeFirst(CkMyNode()) + rank;
587  peProxyPatchCounter[rank]++;
588  }
589  if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
590  NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
591  rankPatches[CkRankOf(pe)].push_back(i);
592  pidMap[pid] = i;
593  }
594 
595  delete [] rankHomePatchIDs;
596 #endif
597  // Setup computes using pidMap
598  for (int i=0;i < computes.size();i++) {
599  computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
600  computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
601  }
602  for (int i=0;i < CkMyNodeSize();i++) {
603  if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
604  }
605  computeMgr->sendAssignPatchesOnPe(pes, this);
606 }
void CudaComputeNonbonded::assignPatchesOnPe ( )

Definition at line 302 of file CudaComputeNonbonded.C.

References ResizeArray< T >::add(), NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< T >::size(), and sort.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

302  {
303  if (rankPatches[CkMyRank()].size() == 0)
304  NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
305 
306  // calculate priority rank of local home patch within pe
307  {
308  PatchMap* patchMap = PatchMap::Object();
309  ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
310  for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
311  int i = rankPatches[CkMyRank()][k];
312  int pid = patches[i].patchID;
313  int homePe = patchMap->node(pid);
314  if ( CkNodeOf(homePe) == CkMyNode() ) {
315  int2 pid_index;
316  pid_index.x = pid;
317  pid_index.y = i;
318  homePatchByRank[CkRankOf(homePe)].add(pid_index);
319  }
320  }
321  for ( int i=0; i<CkMyNodeSize(); ++i ) {
323  std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
324  int masterBoost = ( CkMyRank() == i ? 2 : 0 );
325  for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
326  int index = homePatchByRank[i][j].y;
327  patches[index].reversePriorityRankInPe = j + masterBoost;
328  }
329  }
330  }
331 
332  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
333  assignPatch(rankPatches[CkMyRank()][i]);
334  }
335 }
void CudaComputeNonbonded::atomUpdate ( void  )
virtual

Reimplemented from Compute.

Definition at line 645 of file CudaComputeNonbonded.C.

645  {
646  atomsChangedIn = true;
647 }
void CudaComputeNonbonded::doWork ( void  )
virtual

Reimplemented from Compute.

Definition at line 922 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, SimParameters::GBISOn, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), and Node::simParameters.

922  {
923  if (CkMyPe() != masterPe)
924  NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
925 
926  // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
927  // atomsChangedIn can be set to true by any Pe
928  // atomsChanged can only be set by masterPe
929  // This use of two variables makes sure we don't have a race condition
930  atomsChanged = atomsChangedIn;
931  atomsChangedIn = false;
932 
933  SimParameters *simParams = Node::Object()->simParameters;
934 
935  if (patches.size() == 0) return; // No work to do
936 
937  // Take the flags from the first patch on this Pe
938  // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
939  Flags &flags = patches[0].patch->flags;
940 
941  doSlow = flags.doFullElectrostatics;
942  doEnergy = flags.doEnergy;
943  doVirial = flags.doVirial;
944 
945  if (flags.doNonbonded) {
946 
947  if (simParams->GBISOn) {
948  gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
949  }
950 
951  if (!simParams->GBISOn || gbisPhase == 1) {
952  if ( computesChanged ) {
953  updateComputes();
954  }
955  if (atomsChanged) {
956  // Re-calculate patch atom numbers and storage
957  updatePatches();
958  reSortDone = false;
959  }
960  reallocateArrays();
961  }
962 
963  // Open boxes on Pes and launch work to masterPe
964  computeMgr->sendOpenBoxesOnPe(pes, this);
965 
966  } else {
967  // No work to do, skip
968  skip();
969  }
970 
971 }
void CudaComputeNonbonded::finishPatchesOnPe ( )

Definition at line 1379 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

1379  {
1380  finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
1381 }
void CudaComputeNonbonded::finishPatchOnPe ( int  i)

Definition at line 1386 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

1386  {
1387  std::vector<int> v(1, i);
1388  finishSetOfPatchesOnPe(v);
1389 }
void CudaComputeNonbonded::finishReductions ( )

Definition at line 1213 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, SimParameters::GBISOn, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, Node::simParameters, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

1213  {
1214 
1215  if (CkMyPe() != masterPe)
1216  NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
1217 
1218  // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
1219 
1220  if (!doSkip) {
1221 
1222  if (doStreaming && (doVirial || doEnergy)) {
1223  // For streaming kernels, we must wait for virials and forces to be copied back to CPU
1224  if (!forceDoneEventRecord)
1225  NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
1226  cudaCheck(cudaEventSynchronize(forceDoneEvent));
1227  forceDoneEventRecord = false;
1228  }
1229 
1230  if (doVirial) {
1231  Tensor virialTensor;
1232  virialTensor.xx = h_virialEnergy->virial[0];
1233  virialTensor.xy = h_virialEnergy->virial[1];
1234  virialTensor.xz = h_virialEnergy->virial[2];
1235  virialTensor.yx = h_virialEnergy->virial[3];
1236  virialTensor.yy = h_virialEnergy->virial[4];
1237  virialTensor.yz = h_virialEnergy->virial[5];
1238  virialTensor.zx = h_virialEnergy->virial[6];
1239  virialTensor.zy = h_virialEnergy->virial[7];
1240  virialTensor.zz = h_virialEnergy->virial[8];
1241  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1242  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1243  // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1244  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
1245  if (doSlow) {
1246  Tensor virialTensor;
1247  virialTensor.xx = h_virialEnergy->virialSlow[0];
1248  virialTensor.xy = h_virialEnergy->virialSlow[1];
1249  virialTensor.xz = h_virialEnergy->virialSlow[2];
1250  virialTensor.yx = h_virialEnergy->virialSlow[3];
1251  virialTensor.yy = h_virialEnergy->virialSlow[4];
1252  virialTensor.yz = h_virialEnergy->virialSlow[5];
1253  virialTensor.zx = h_virialEnergy->virialSlow[6];
1254  virialTensor.zy = h_virialEnergy->virialSlow[7];
1255  virialTensor.zz = h_virialEnergy->virialSlow[8];
1256  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
1257  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
1258  // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
1259  ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
1260  }
1261  }
1262  if (doEnergy) {
1263  // if (doSlow)
1264  // printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
1266  reduction->item(REDUCTION_LJ_ENERGY) += h_virialEnergy->energyVdw;
1267  reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
1268  // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
1269  if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
1270  // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
1271  }
1272 
1273  reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
1274  }
1275  reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
1276  reduction->submit();
1277 
1278  // Reset flags
1279  doSkip = false;
1280  computesChanged = false;
1281 }
void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 245 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

245  {
246  CmiLock(lock);
247  Compute::gbisP2PatchReady(pid, seq);
248  CmiUnlock(lock);
249 }
void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  pid,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 251 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

251  {
252  CmiLock(lock);
253  Compute::gbisP3PatchReady(pid, seq);
254  CmiUnlock(lock);
255 }
void CudaComputeNonbonded::initialize ( void  )
virtual

Reimplemented from Compute.

Definition at line 608 of file CudaComputeNonbonded.C.

References ATOMIC_BINS, cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

608  {
609  if (patches.size() > 0) {
610  // Allocate CUDA version of patches
611  cudaCheck(cudaSetDevice(deviceID));
612  allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
613 
614  allocate_host<VirialEnergy>(&h_virialEnergy, 1);
615  allocate_device<VirialEnergy>(&d_virialEnergy, ATOMIC_BINS);
616 
617  /* JM: Queries for maximum sharedMemoryPerBlock on deviceID
618  */
619  cudaDeviceProp props;
620  cudaCheck(cudaGetDeviceProperties(&props, deviceID)); //Gets properties of 'deviceID device'
621  maxShmemPerBlock = props.sharedMemPerBlock;
622 
623 #if CUDA_VERSION >= 5050 || defined(NAMD_HIP)
624  int leastPriority, greatestPriority;
625  cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
626  int priority = (doStreaming) ? leastPriority : greatestPriority;
627  // int priority = greatestPriority;
628  cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
629 #else
630  cudaCheck(cudaStreamCreate(&stream));
631 #endif
632  cudaCheck(cudaEventCreate(&forceDoneEvent));
633 
634  buildExclusions();
635 
636  lock = CmiCreateLock();
637 
638  reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
639  }
640 }
void CudaComputeNonbonded::launchWork ( )

Definition at line 973 of file CudaComputeNonbonded.C.

References CudaComputeNonbonded::PatchRecord::atomStart, cudaCheck, ComputeNonbondedUtil::cutoff, SimParameters::GBISOn, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), CudaComputeNonbonded::PatchRecord::numAtoms, Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

973  {
974  if (CkMyPe() != masterPe)
975  NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
976 
977  beforeForceCompute = CkWallTimer();
978 
979  cudaCheck(cudaSetDevice(deviceID));
980  SimParameters *simParams = Node::Object()->simParameters;
981 
982  //execute only during GBIS phase 1, or if not using GBIS
983  if (!simParams->GBISOn || gbisPhase == 1) {
984 
985  if ( atomsChanged || computesChanged ) {
986  // Invalidate pair lists
987  pairlistsValid = false;
988  pairlistTolerance = 0.0f;
989  }
990 
991  // Get maximum atom movement and patch tolerance
992  float maxAtomMovement = 0.0f;
993  float maxPatchTolerance = 0.0f;
994  getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
995  // Update pair-list cutoff
996  Flags &flags = patches[0].patch->flags;
997  savePairlists = false;
998  usePairlists = false;
999  if ( flags.savePairlists ) {
1000  savePairlists = true;
1001  usePairlists = true;
1002  } else if ( flags.usePairlists ) {
1003  if ( ! pairlistsValid ||
1004  ( 2. * maxAtomMovement > pairlistTolerance ) ) {
1005  reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
1006  } else {
1007  usePairlists = true;
1008  }
1009  }
1010  if ( ! usePairlists ) {
1011  pairlistsValid = false;
1012  }
1013  float plcutoff = cutoff;
1014  if ( savePairlists ) {
1015  pairlistsValid = true;
1016  pairlistTolerance = 2. * maxPatchTolerance;
1017  plcutoff += pairlistTolerance;
1018  }
1019  plcutoff2 = plcutoff * plcutoff;
1020 
1021  // if (atomsChanged)
1022  // CkPrintf("plcutoff = %f listTolerance = %f save = %d use = %d\n",
1023  // plcutoff, pairlistTolerance, savePairlists, usePairlists);
1024 
1025  } // if (!simParams->GBISOn || gbisPhase == 1)
1026 
1027  // Calculate PME & VdW forces
1028  if (!simParams->GBISOn || gbisPhase == 1) {
1029  doForce();
1030  if (doStreaming) {
1031  patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
1032  patchReadyQueueLen = tileListKernel.getNumPatches();
1033  patchReadyQueueNext = 0;
1034  // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
1035  int numEmptyPatches = tileListKernel.getNumEmptyPatches();
1036  int* emptyPatches = tileListKernel.getEmptyPatches();
1037  for (int i=0;i < numEmptyPatches;i++) {
1038  PatchRecord &pr = patches[emptyPatches[i]];
1039  memset(h_forces+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1040  if (doSlow) memset(h_forcesSlow+pr.atomStart, 0, sizeof(float4)*pr.numAtoms);
1041  patchReadyQueue[i] = emptyPatches[i];
1042  }
1043  if (patchReadyQueueLen != patches.size())
1044  NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
1045  }
1046  }
1047 
1048  // For GBIS phase 1 at pairlist update, we must re-sort tile list
1049  // before calling doGBISphase1().
1050  if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
1051  // In this code path doGBISphase1() is called in forceDone()
1052  forceDoneSetCallback();
1053  return;
1054  }
1055 
1056  // GBIS Phases
1057  if (simParams->GBISOn) {
1058  if (gbisPhase == 1) {
1059  doGBISphase1();
1060  } else if (gbisPhase == 2) {
1061  doGBISphase2();
1062  } else if (gbisPhase == 3) {
1063  doGBISphase3();
1064  }
1065  }
1066 
1067  // Copy forces to host
1068  if (!simParams->GBISOn || gbisPhase == 3) {
1069  if (!doStreaming) {
1070  copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
1071  if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
1072  }
1073  }
1074 
1075  if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
1076  // For GBIS, energies are ready after phase 2
1077  nonbondedKernel.reduceVirialEnergy(tileListKernel,
1078  atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
1079  d_forces, d_forcesSlow, d_virialEnergy, stream);
1080  copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
1081  }
1082 
1083  // Setup call back
1084  forceDoneSetCallback();
1085 }
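launchWork() extends the cutoff by a pair-list tolerance whenever lists are saved (plcutoff = cutoff + 2 * maxPatchTolerance) and reuses saved lists only while no atom has moved more than half that tolerance. A worked example with made-up numbers; only the formulas come from the listing above:

    // Illustrative pair-list cutoff arithmetic (values are examples only).
    float cutoff            = 12.0f;  // nonbonded cutoff (Angstrom)
    float maxPatchTolerance = 1.5f;   // from getMaxMovementTolerance()

    float pairlistTolerance = 2.0f * maxPatchTolerance;    // 3.0
    float plcutoff  = cutoff + pairlistTolerance;          // 15.0, used to build lists
    float plcutoff2 = plcutoff * plcutoff;                 // 225.0, compared against r^2

    // On later steps the saved lists stay valid only while
    //   2 * maxAtomMovement <= pairlistTolerance,
    // i.e. while no atom has moved more than 1.5 A since the lists were built;
    // otherwise REDUCTION_PAIRLIST_WARNINGS is incremented and the lists are rebuilt.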
void CudaComputeNonbonded::messageEnqueueWork ( )

Definition at line 862 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

862  {
863  if (masterPe != CkMyPe())
864  NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
865  WorkDistrib::messageEnqueueWork(this);
866 }
int CudaComputeNonbonded::noWork ( )
virtual

Reimplemented from Compute.

Definition at line 887 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

887  {
888  // Simply enqueue doWork on masterPe and return "no work"
889  computeMgr->sendMessageEnqueueWork(masterPe, this);
890  return 1;
891 }
void CudaComputeNonbonded::openBoxesOnPe ( )

Definition at line 868 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

868  {
869  if (rankPatches[CkMyRank()].size() == 0)
870  NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
871  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
872  openBox(rankPatches[CkMyRank()][i]);
873  }
874  bool done = false;
875  CmiLock(lock);
876  patchesCounter -= rankPatches[CkMyRank()].size();
877  if (patchesCounter == 0) {
878  patchesCounter = getNumPatches();
879  done = true;
880  }
881  CmiUnlock(lock);
882  if (done) {
883  computeMgr->sendLaunchWork(masterPe, this);
884  }
885 }
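openBoxesOnPe() above and skipPatchesOnPe() below share the same node-level countdown: every rank subtracts its patch count from patchesCounter under the lock, and the rank that reaches zero re-arms the counter and notifies masterPe. A stripped-down sketch of that pattern using the names from the listings:

    // Countdown-barrier pattern shared by openBoxesOnPe()/skipPatchesOnPe().
    bool lastRankDone(int patchesOnThisRank) {
      bool done = false;
      CmiLock(lock);
      patchesCounter -= patchesOnThisRank;  // this rank's contribution
      if (patchesCounter == 0) {
        patchesCounter = getNumPatches();   // re-arm for the next round
        done = true;                        // exactly one rank sees zero
      }
      CmiUnlock(lock);
      return done;  // the winning rank then messages masterPe,
                    // e.g. via sendLaunchWork() or sendFinishReductions()
    }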
void CudaComputeNonbonded::patchReady ( PatchID  pid,
int  doneMigration,
int  seq 
)
virtual

Reimplemented from Compute.

Definition at line 233 of file CudaComputeNonbonded.C.

References NAMD_bug(), and Compute::patchReady().

233  {
234  if (doneMigration) {
235  int i = findPid(pid);
236  if (i == -1)
237  NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
238  updatePatch(i);
239  }
240  CmiLock(lock);
241  Compute::patchReady(pid, doneMigration, seq);
242  CmiUnlock(lock);
243 }
void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID *  pid,
int *  trans 
)

Definition at line 174 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

174  {
175  computesChanged = true;
176  addPatch(pid[0]);
177  addPatch(pid[1]);
178  PatchMap* patchMap = PatchMap::Object();
179  int t1 = trans[0];
180  int t2 = trans[1];
181  Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
182  offset.x += (t1%3-1) - (t2%3-1);
183  offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
184  offset.z += (t1/9-1) - (t2/9-1);
185  addCompute(cid, pid[0], pid[1], offset);
186 }
void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 164 of file CudaComputeNonbonded.C.

164  {
165  computesChanged = true;
166  addPatch(pid);
167  addCompute(cid, pid, pid, 0.);
168 }
void CudaComputeNonbonded::skipPatchesOnPe ( )

Definition at line 687 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

687  {
688  if (rankPatches[CkMyRank()].size() == 0)
689  NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
690  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
691  skipPatch(rankPatches[CkMyRank()][i]);
692  }
693  bool done = false;
694  CmiLock(lock);
695  patchesCounter -= rankPatches[CkMyRank()].size();
696  if (patchesCounter == 0) {
697  patchesCounter = getNumPatches();
698  done = true;
699  }
700  CmiUnlock(lock);
701  if (done) {
702  // Reduction must be done on masterPe
703  computeMgr->sendFinishReductions(masterPe, this);
704  }
705 }
void CudaComputeNonbonded::unregisterBoxesOnPe ( )

Definition at line 152 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

152  {
153  if (rankPatches[CkMyRank()].size() == 0)
154  NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
155  for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
156  unregisterBox(rankPatches[CkMyRank()][i]);
157  }
158 }

The documentation for this class was generated from the following files:

CudaComputeNonbonded.h
CudaComputeNonbonded.C