CudaComputeNonbonded Class Reference

#include <CudaComputeNonbonded.h>

Inheritance diagram for CudaComputeNonbonded:

Inherits Compute and ComputeNonbondedUtil.

Public Member Functions

 CudaComputeNonbonded (ComputeID c, int deviceID, CudaNonbondedTables &cudaNonbondedTables, bool doStreaming)
 ~CudaComputeNonbonded ()
void registerComputeSelf (ComputeID cid, PatchID pid)
void registerComputePair (ComputeID cid, PatchID *pid, int *trans)
void assignPatches (ComputeMgr *computeMgrIn)
virtual void initialize ()
virtual void atomUpdate ()
virtual int noWork ()
virtual void doWork ()
void launchWork ()
void finishReductions ()
void unregisterBoxesOnPe ()
void assignPatchesOnPe ()
void openBoxesOnPe ()
void skipPatchesOnPe ()
void finishPatchesOnPe ()
void finishPatchOnPe (int i)
void messageEnqueueWork ()
virtual void patchReady (PatchID, int doneMigration, int seq)
virtual void gbisP2PatchReady (PatchID, int seq)
virtual void gbisP3PatchReady (PatchID, int seq)

Classes

struct  ComputeRecord
struct  PatchRecord

Detailed Description

Definition at line 21 of file CudaComputeNonbonded.h.


Constructor & Destructor Documentation

CudaComputeNonbonded::CudaComputeNonbonded ( ComputeID  c,
int  deviceID,
CudaNonbondedTables &  cudaNonbondedTables,
bool  doStreaming 
)

Definition at line 36 of file CudaComputeNonbonded.C.

References CUDA_DEBUG_EVENT, CUDA_GBIS1_KERNEL, CUDA_GBIS2_KERNEL, CUDA_GBIS3_KERNEL, CUDA_VDW_KERNEL, cudaCheck, Compute::gbisPhase, NAMD_die(), Node::Object(), Node::simParameters, and simParams.

00037                                                               : 
00038 Compute(c), deviceID(deviceID), doStreaming(doStreaming), nonbondedKernel(deviceID, cudaNonbondedTables, doStreaming),
00039 tileListKernel(deviceID, doStreaming), GBISKernel(deviceID) {
00040 
00041   cudaCheck(cudaSetDevice(deviceID));
00042 
00043   exclusionsByAtom = NULL;
00044 
00045   vdwTypes = NULL;
00046   vdwTypesSize = 0;
00047 
00048   exclIndexMaxDiff = NULL;
00049   exclIndexMaxDiffSize = 0;
00050 
00051   atomIndex = NULL;
00052   atomIndexSize = 0;
00053 
00054   atomStorageSize = 0;
00055 
00056   // Atom and charge storage
00057   atoms = NULL;
00058   atomsSize = 0;
00059 
00060   // Force storage
00061   h_forces = NULL;
00062   h_forcesSize = 0;
00063   h_forcesSlow = NULL;
00064   h_forcesSlowSize = 0;
00065 
00066   d_forces = NULL;
00067   d_forcesSize = 0;
00068   d_forcesSlow = NULL;
00069   d_forcesSlowSize = 0;
00070 
00071   // GBIS
00072   intRad0H = NULL;
00073   intRad0HSize = 0;
00074   intRadSH = NULL;
00075   intRadSHSize = 0;
00076   psiSumH = NULL;
00077   psiSumHSize = 0;
00078   bornRadH = NULL;
00079   bornRadHSize = 0;
00080   dEdaSumH = NULL;
00081   dEdaSumHSize = 0;
00082   dHdrPrefixH = NULL;
00083   dHdrPrefixHSize = 0;
00084 
00085   cudaPatches = NULL;
00086 
00087   atomsChangedIn = true;
00088   atomsChanged = true;
00089   computesChanged = true;
00090 
00091   forceDoneEventRecord = false;
00092 
00093   SimParameters *simParams = Node::Object()->simParameters;
00094   if (simParams->pressureProfileOn) {
00095     NAMD_die("CudaComputeNonbonded, pressure profile not supported");
00096   }
00097 
00098   if (simParams->GBISOn) gbisPhase = 3;
00099 
00100   doSkip = false;
00101 
00102 #define CUDA_DEBUG_EVENT 171
00103   traceRegisterUserEvent("CUDA DEBUG", CUDA_DEBUG_EVENT);
00104 #define CUDA_VDW_KERNEL 172
00105   traceRegisterUserEvent("CUDA VdW kernel", CUDA_VDW_KERNEL);
00106 #define CUDA_GBIS1_KERNEL 173
00107   traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL);
00108 #define CUDA_GBIS2_KERNEL 174
00109   traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL);
00110 #define CUDA_GBIS3_KERNEL 175
00111   traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL);
00112 }

CudaComputeNonbonded::~CudaComputeNonbonded (  ) 

Definition at line 117 of file CudaComputeNonbonded.C.

References cudaCheck, and ComputeMgr::sendUnregisterBoxesOnPe().

00117                                             {
00118   cudaCheck(cudaSetDevice(deviceID));
00119   if (exclusionsByAtom != NULL) delete [] exclusionsByAtom;
00120   if (vdwTypes != NULL) deallocate_host<int>(&vdwTypes);
00121   if (exclIndexMaxDiff != NULL) deallocate_host<int2>(&exclIndexMaxDiff);
00122   if (atoms != NULL) deallocate_host<CudaAtom>(&atoms);
00123   if (h_forces != NULL) deallocate_host<float4>(&h_forces);
00124   if (h_forcesSlow != NULL) deallocate_host<float4>(&h_forcesSlow);
00125   if (d_forces != NULL) deallocate_device<float4>(&d_forces);
00126   if (d_forcesSlow != NULL) deallocate_device<float4>(&d_forcesSlow);
00127 
00128   // GBIS
00129   if (intRad0H != NULL) deallocate_host<float>(&intRad0H);
00130   if (intRadSH != NULL) deallocate_host<float>(&intRadSH);
00131   if (psiSumH != NULL) deallocate_host<GBReal>(&psiSumH);
00132   if (bornRadH != NULL) deallocate_host<float>(&bornRadH);
00133   if (dEdaSumH != NULL) deallocate_host<GBReal>(&dEdaSumH);
00134   if (dHdrPrefixH != NULL) deallocate_host<float>(&dHdrPrefixH);
00135 
00136   if (cudaPatches != NULL) deallocate_host<CudaPatchRecord>(&cudaPatches);
00137 
00138   if (patches.size() > 0) {
00139     deallocate_host<VirialEnergy>(&h_virialEnergy);
00140     deallocate_device<VirialEnergy>(&d_virialEnergy);
00141     cudaCheck(cudaStreamDestroy(stream));
00142     cudaCheck(cudaEventDestroy(forceDoneEvent));
00143     CmiDestroyLock(lock);
00144     delete reduction;
00145   }
00146 
00147   // NOTE: unregistering happens in the [sync] entry method
00148   computeMgr->sendUnregisterBoxesOnPe(pes, this);
00149 
00150 }


Member Function Documentation

void CudaComputeNonbonded::assignPatches ( ComputeMgr *  computeMgrIn  ) 

Definition at line 374 of file CudaComputeNonbonded.C.

References PatchMap::basePatchIDList(), deviceCUDA, findHomePatchPe(), findProxyPatchPes(), DeviceCUDA::getDeviceCount(), DeviceCUDA::getMasterPeForDeviceID(), Compute::getNumPatches(), DeviceCUDA::getNumPesSharingDevice(), DeviceCUDA::getPesSharingDevice(), ComputePmeCUDAMgr::isPmePe(), j, NAMD_bug(), ComputePmeCUDAMgr::Object(), PatchMap::Object(), PatchMap::ObjectOnPe(), ComputeMgr::sendAssignPatchesOnPe(), and Compute::setNumPatches().

Referenced by ComputeMgr::createComputes().

00374                                                                  {
00375   // Remove duplicate patches
00376   std::sort(patches.begin(), patches.end());
00377   std::vector<PatchRecord>::iterator last = std::unique(patches.begin(), patches.end());
00378   patches.erase(last, patches.end());
00379   // Set number of patches
00380   setNumPatches(patches.size());
00381   masterPe = CkMyPe();
00382   computeMgr = computeMgrIn;
00383   // Start patch counter
00384   patchesCounter = getNumPatches();
00385   // Patch ID map
00386   std::map<PatchID, int> pidMap;
00387 #if 1
00388   //-------------------------------------------------------
00389   // Copied in from ComputeNonbondedCUDA::assignPatches()
00390   //-------------------------------------------------------
00391 
00392   std::vector<int> pesOnNodeSharingDevice(CkMyNodeSize());
00393   int numPesOnNodeSharingDevice = 0;
00394   int masterIndex = -1;
00395   for ( int i=0; i<deviceCUDA->getNumPesSharingDevice(); ++i ) {
00396     int pe = deviceCUDA->getPesSharingDevice(i);
00397     if ( pe == CkMyPe() ) masterIndex = numPesOnNodeSharingDevice;
00398     if ( CkNodeOf(pe) == CkMyNode() ) {
00399       pesOnNodeSharingDevice[numPesOnNodeSharingDevice++] = pe;
00400     }
00401   }
00402 
00403   std::vector<int> count(patches.size(), 0);
00404   std::vector<int> pcount(numPesOnNodeSharingDevice, 0);
00405   std::vector<int> rankpcount(CkMyNodeSize(), 0);
00406   std::vector<char> table(patches.size()*numPesOnNodeSharingDevice, 0);
00407 
00408   PatchMap* patchMap = PatchMap::Object();
00409 
00410   int unassignedpatches = patches.size();
00411 
00412   for (int i=0;i < patches.size(); ++i) {
00413     patches[i].pe = -1;
00414   }
00415 
00416   // assign if home pe and build table of natural proxies
00417   for (int i=0;i < patches.size(); ++i) {
00418     int pid = patches[i].patchID;
00419     // homePe = PE where the patch currently resides
00420     int homePe = patchMap->node(pid);
00421     for ( int j=0; j < numPesOnNodeSharingDevice; ++j ) {
00422       int pe = pesOnNodeSharingDevice[j];
00423       // If homePe is sharing this device, assign this patch to homePe
00424       if ( pe == homePe ) {
00425         patches[i].pe = pe;
00426         --unassignedpatches;
00427         pcount[j] += 1;
00428       }
00429       if ( PatchMap::ObjectOnPe(pe)->patch(pid) ) {
00430         table[i*numPesOnNodeSharingDevice + j] = 1;
00431       }
00432     }
00433     // Assign this patch to homePe, if it resides on the same node
00434     if ( patches[i].pe == -1 && CkNodeOf(homePe) == CkMyNode() ) {
00435       patches[i].pe = homePe;
00436       --unassignedpatches;
00437       rankpcount[CkRankOf(homePe)] += 1;
00438     }
00439   }
00440   // assign if only one pe has a required proxy
00441   for (int i=0; i < patches.size(); ++i) {
00442     int pid = patches[i].patchID;
00443     if ( patches[i].pe != -1 ) continue;
00444     int c = 0;
00445     int lastj;
00446     for (int j=0; j < numPesOnNodeSharingDevice; ++j) {
00447       if ( table[i*numPesOnNodeSharingDevice + j] ) {
00448         ++c;
00449         lastj = j;
00450       }
00451     }
00452     count[i] = c;
00453     if ( c == 1 ) {
00454       patches[i].pe = pesOnNodeSharingDevice[lastj];
00455       --unassignedpatches;
00456       pcount[lastj] += 1;
00457     }
00458   }
00459   int assignj = 0;
00460   while ( unassignedpatches ) {
00461     int i;
00462     for (i=0;i < patches.size(); ++i) {
00463       if ( ! table[i*numPesOnNodeSharingDevice + assignj] ) continue;
00464       int pid = patches[i].patchID;
00465       // patch_record &pr = patchRecords[pid];
00466       if ( patches[i].pe != -1 ) continue;
00467       patches[i].pe = pesOnNodeSharingDevice[assignj];
00468       --unassignedpatches;
00469       pcount[assignj] += 1;
00470       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00471       break;
00472     }
00473     if (i < patches.size() ) continue;  // start search again
00474     for ( i=0;i < patches.size(); ++i ) {
00475       int pid = patches[i].patchID;
00476       // patch_record &pr = patchRecords[pid];
00477       if ( patches[i].pe != -1 ) continue;
00478       if ( count[i] ) continue;
00479       patches[i].pe = pesOnNodeSharingDevice[assignj];
00480       --unassignedpatches;
00481       pcount[assignj] += 1;
00482       if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00483       break;
00484     }
00485     if ( i < patches.size() ) continue;  // start search again
00486     if ( ++assignj == numPesOnNodeSharingDevice ) assignj = 0;
00487   }
00488 
00489   // For each rank, list of patches
00490   rankPatches.resize(CkMyNodeSize());
00491   for (int i=0; i < patches.size(); ++i) {
00492     rankPatches[CkRankOf(patches[i].pe)].push_back(i);
00493     pidMap[patches[i].patchID] = i;
00494   }
00495 
00496   // for ( int i=0; i < patches.size(); ++i ) {
00497   //   CkPrintf("Pe %d patch %d hostPe %d\n", CkMyPe(), patches[i].patchID, patches[i].pe);
00498   // }
00499 
00500 /*
00501   slavePes = new int[CkMyNodeSize()];
00502   slaves = new ComputeNonbondedCUDA*[CkMyNodeSize()];
00503   numSlaves = 0;
00504   for ( int j=0; j<numPesOnNodeSharingDevice; ++j ) {
00505     int pe = pesOnNodeSharingDevice[j];
00506     int rank = pe - CkNodeFirst(CkMyNode());
00507     // CkPrintf("host %d sharing %d pe %d rank %d pcount %d rankpcount %d\n",
00508     //          CkMyPe(),j,pe,rank,pcount[j],rankpcount[rank]);
00509     if ( pe == CkMyPe() ) continue;
00510     if ( ! pcount[j] && ! rankpcount[rank] ) continue;
00511     rankpcount[rank] = 0;  // skip in rank loop below
00512     slavePes[numSlaves] = pe;
00513     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00514     ++numSlaves;
00515   }
00516   for ( int j=0; j<CkMyNodeSize(); ++j ) {
00517     int pe = CkNodeFirst(CkMyNode()) + j;
00518     // CkPrintf("host %d rank %d pe %d rankpcount %d\n",
00519     //          CkMyPe(),j,pe,rankpcount[j]);
00520     if ( ! rankpcount[j] ) continue;
00521     if ( pe == CkMyPe() ) continue;
00522     slavePes[numSlaves] = pe;
00523     computeMgr->sendCreateNonbondedCUDASlave(pe,numSlaves);
00524     ++numSlaves;
00525   }
00526 */
00527 
00528 #else
00529   // For each rank, list of patches
00530   rankPatches.resize(CkMyNodeSize());
00531   // For each rank, list of home patch IDs
00532   PatchIDList* rankHomePatchIDs = new PatchIDList[CkMyNodeSize()];
00533   for (int i=0;i < CkMyNodeSize();i++) {
00534     int pe = CkNodeFirst(CkMyNode()) + i;
00535     PatchMap::Object()->basePatchIDList(pe, rankHomePatchIDs[i]);
00536   }
00537   std::vector<int> proxyPatchPes;
00538   std::vector<int> peProxyPatchCounter(CkMyNodeSize(), 0);
00539   //--------------------------------------------------------
00540   // Build a list of PEs to avoid
00541   std::vector<int> pesToAvoid;
00542 #if 0
00543   // Avoid other GPUs' master PEs
00544   for (int i=0;i < deviceCUDA->getDeviceCount();i++) {
00545     int pe = deviceCUDA->getMasterPeForDeviceID(i);
00546     if (pe != -1 && pe != masterPe) pesToAvoid.push_back(pe);
00547   }
00548   // Avoid PEs that are involved in PME
00549   ComputePmeCUDAMgr *computePmeCUDAMgr = ComputePmeCUDAMgr::Object();
00550   for (int pe=CkNodeFirst(CkMyNode());pe < CkNodeFirst(CkMyNode()) + CkMyNodeSize();pe++) {
00551     if (computePmeCUDAMgr->isPmePe(pe)) pesToAvoid.push_back(pe);
00552   }
00553   // Set counters of avoidable PEs to high numbers
00554   for (int i=0;i < pesToAvoid.size();i++) {
00555     int pe = pesToAvoid[i];
00556     peProxyPatchCounter[CkRankOf(pe)] = (1 << 20);    
00557   }
00558 #endif
00559   // Avoid master Pe somewhat
00560   peProxyPatchCounter[CkRankOf(masterPe)] = 2; // patches.size();
00561   //--------------------------------------------------------
00562   for (int i=0;i < patches.size();i++) {
00563     PatchID pid = patches[i].patchID;
00564     int pe = findHomePatchPe(rankHomePatchIDs, pid);
00565     if (pe == -1) {
00566       // Patch not present on this node => try finding a ProxyPatch
00567       findProxyPatchPes(proxyPatchPes, pid);
00568       if (proxyPatchPes.size() == 0) {
00569         // No ProxyPatch => create one on rank that has the least ProxyPatches
00570         int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00571         pe = CkNodeFirst(CkMyNode()) + rank;
00572         peProxyPatchCounter[rank]++;
00573       } else {
00574         // Choose a ProxyPatch, trying to avoid masterPe (the current Pe) and
00575         // Pes that already have a ProxyPatch. This is done by finding the entry
00576         // with the minimum peProxyPatchCounter value, i.e., the minimum among
00577         // peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]
00578         // int pppi = std::min_element(proxyPatchPes.begin(), proxyPatchPes.end(),
00579         //   [&](int i, int j) {return peProxyPatchCounter[CkRankOf(i)] < peProxyPatchCounter[CkRankOf(j)];})
00580         //   - proxyPatchPes.begin();
00581         // pe = proxyPatchPes[pppi];
00582         int minCounter = (1 << 30);
00583         for (int j=0;j < proxyPatchPes.size();j++) {
00584           if (minCounter > peProxyPatchCounter[CkRankOf(proxyPatchPes[j])]) {
00585             pe = proxyPatchPes[j];
00586             minCounter = peProxyPatchCounter[CkRankOf(pe)];
00587           }
00588         }
00589         if (pe == -1)
00590           NAMD_bug("CudaComputeNonbonded::assignPatches, Unable to choose PE with proxy patch");
00591         peProxyPatchCounter[CkRankOf(pe)]++;
00592       }
00593     } else if (std::find(pesToAvoid.begin(), pesToAvoid.end(), pe) != pesToAvoid.end()) {
00594       // Found the home patch on this node, but it's on a PE that should be avoided => find a new one
00595       int rank = std::min_element(peProxyPatchCounter.begin(), peProxyPatchCounter.end()) - peProxyPatchCounter.begin();
00596       pe = CkNodeFirst(CkMyNode()) + rank;
00597       peProxyPatchCounter[rank]++;
00598     }
00599     if (pe < CkNodeFirst(CkMyNode()) || pe >= CkNodeFirst(CkMyNode()) + CkMyNodeSize() )
00600       NAMD_bug("CudaComputeNonbonded::assignPatches, Invalid PE for a patch");
00601     rankPatches[CkRankOf(pe)].push_back(i);
00602     pidMap[pid] = i;
00603   }
00604 
00605   delete [] rankHomePatchIDs;
00606 #endif
00607   // Setup computes using pidMap
00608   for (int i=0;i < computes.size();i++) {
00609     computes[i].patchInd[0] = pidMap[computes[i].pid[0]];
00610     computes[i].patchInd[1] = pidMap[computes[i].pid[1]];
00611   }
00612   for (int i=0;i < CkMyNodeSize();i++) {
00613     if (rankPatches[i].size() > 0) pes.push_back(CkNodeFirst(CkMyNode()) + i);
00614   }
00615   computeMgr->sendAssignPatchesOnPe(pes, this);
00616 }
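
The assignment logic above runs in three passes: patches whose home PE shares the device go to that home PE; patches whose required proxy exists on exactly one device-sharing PE go there; everything left is distributed round-robin. The standalone sketch below restates those passes on made-up data (the container names and sample values are illustrative only, not NAMD structures):

#include <cstdio>
#include <vector>

int main() {
  // PEs on this node that share the device (illustrative values)
  std::vector<int> pes = {0, 1, 2};
  // homePe[i] = home PE of patch i (-1 if the home PE is elsewhere)
  std::vector<int> homePe = {1, -1, -1, -1};
  // hasProxy[i*pes.size()+j] != 0 if PE pes[j] holds a proxy of patch i
  std::vector<char> hasProxy = {0,1,0,  0,0,1,  0,0,0,  0,0,0};

  std::vector<int> assigned(homePe.size(), -1);
  // Pass 1: the home PE wins if it shares the device
  for (size_t i = 0; i < homePe.size(); i++)
    for (size_t j = 0; j < pes.size(); j++)
      if (pes[j] == homePe[i]) assigned[i] = pes[j];
  // Pass 2: a patch whose proxy lives on exactly one PE goes there
  for (size_t i = 0; i < homePe.size(); i++) {
    if (assigned[i] != -1) continue;
    int c = 0, lastj = -1;
    for (size_t j = 0; j < pes.size(); j++)
      if (hasProxy[i*pes.size() + j]) { c++; lastj = (int)j; }
    if (c == 1) assigned[i] = pes[lastj];
  }
  // Pass 3: round-robin whatever is left across the device's PEs
  int rr = 0;
  for (size_t i = 0; i < homePe.size(); i++)
    if (assigned[i] == -1) assigned[i] = pes[rr++ % pes.size()];

  for (size_t i = 0; i < homePe.size(); i++)
    printf("patch %zu -> pe %d\n", i, assigned[i]);
  return 0;
}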

void CudaComputeNonbonded::assignPatchesOnPe (  ) 

Definition at line 312 of file CudaComputeNonbonded.C.

References ResizeArray< Elem >::add(), j, NAMD_bug(), PatchMap::node(), PatchMap::Object(), ResizeArray< Elem >::size(), and y.

Referenced by ComputeMgr::recvAssignPatchesOnPe().

00312                                              {
00313   if (rankPatches[CkMyRank()].size() == 0)
00314     NAMD_bug("CudaComputeNonbonded::assignPatchesOnPe, empty rank");
00315 
00316   // calculate priority rank of local home patch within pe
00317   {
00318     PatchMap* patchMap = PatchMap::Object();
00319     ResizeArray< ResizeArray<int2> > homePatchByRank(CkMyNodeSize());
00320     for ( int k=0; k < rankPatches[CkMyRank()].size(); ++k ) {
00321       int i = rankPatches[CkMyRank()][k];
00322       int pid = patches[i].patchID;
00323       int homePe = patchMap->node(pid);
00324       if ( CkNodeOf(homePe) == CkMyNode() ) {
00325         int2 pid_index;
00326         pid_index.x = pid;
00327         pid_index.y = i;
00328         homePatchByRank[CkRankOf(homePe)].add(pid_index);
00329       }
00330     }
00331     for ( int i=0; i<CkMyNodeSize(); ++i ) {
00332       pid_sortop_reverse_priority so;
00333       std::sort(homePatchByRank[i].begin(),homePatchByRank[i].end(),so);
00334       int masterBoost = ( CkMyRank() == i ? 2 : 0 );
00335       for ( int j=0; j<homePatchByRank[i].size(); ++j ) {
00336         int index = homePatchByRank[i][j].y;
00337         patches[index].reversePriorityRankInPe = j + masterBoost;
00338       }
00339     }
00340   }
00341 
00342   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00343     assignPatch(rankPatches[CkMyRank()][i]);
00344   }
00345 }

void CudaComputeNonbonded::atomUpdate (  )  [virtual]

Reimplemented from Compute.

Definition at line 649 of file CudaComputeNonbonded.C.

00649                                       {
00650   atomsChangedIn = true;
00651 }

void CudaComputeNonbonded::doWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 925 of file CudaComputeNonbonded.C.

References Flags::doEnergy, Flags::doFullElectrostatics, Flags::doNonbonded, Flags::doVirial, Compute::gbisPhase, NAMD_bug(), Node::Object(), ComputeMgr::sendOpenBoxesOnPe(), Node::simParameters, and simParams.

00925                                   {
00926   if (CkMyPe() != masterPe)
00927     NAMD_bug("CudaComputeNonbonded::doWork() called on non masterPe");
00928 
00929   // Read value of atomsChangedIn, which is set in atomUpdate(), and reset it.
00930   // atomsChangedIn can be set to true by any Pe
00931   // atomsChanged can only be set by masterPe
00932   // Using two separate variables makes sure we don't have a race condition
00933   atomsChanged = atomsChangedIn;
00934   atomsChangedIn = false;
00935 
00936   SimParameters *simParams = Node::Object()->simParameters;
00937 
00938   if (patches.size() == 0) return;  // No work to do
00939 
00940   // Take the flags from the first patch on this Pe
00941   // Flags &flags = patches[rankPatches[CkMyRank()][0]].patch->flags;
00942   Flags &flags = patches[0].patch->flags;
00943 
00944   doSlow = flags.doFullElectrostatics;
00945   doEnergy = flags.doEnergy;
00946   doVirial = flags.doVirial;
00947 
00948   if (flags.doNonbonded) {
00949 
00950     if (simParams->GBISOn) {
00951       gbisPhase = 1 + (gbisPhase % 3);//1->2->3->1...
00952     }
00953 
00954     if (!simParams->GBISOn || gbisPhase == 1) {
00955       if ( computesChanged ) {
00956         updateComputes();
00957       }
00958       if (atomsChanged) {
00959         // Re-calculate patch atom numbers and storage
00960         updatePatches();
00961         reSortDone = false;
00962       }
00963       reallocateArrays();
00964     }
00965 
00966     // Open boxes on Pes and launch work to masterPe
00967     computeMgr->sendOpenBoxesOnPe(pes, this);
00968 
00969   } else {
00970     // No work to do, skip
00971     skip();
00972   }
00973 
00974 }
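
The atomsChangedIn/atomsChanged handoff at the top of doWork() is a simple producer/consumer flag pattern: any Pe may set the incoming flag, and only masterPe reads and resets it. A minimal sketch follows; the std::atomic and the trivial main() are stand-ins for the Charm++ machinery (the real code relies on the runtime's scheduling rather than atomics):

#include <atomic>
#include <cstdio>

// Any Pe may flag an atom update; only the master Pe consumes it.
std::atomic<bool> atomsChangedIn{false};
bool atomsChanged = false;   // written only by the master Pe

void atomUpdate() {          // may run on any Pe
  atomsChangedIn = true;
}

void doWorkOnMaster() {      // runs only on the master Pe
  // Read-and-reset: updates arriving later set the flag for the next step.
  atomsChanged = atomsChangedIn.exchange(false);
  if (atomsChanged)
    printf("re-count atoms, reallocate arrays, rebuild tile lists\n");
}

int main() {
  atomUpdate();
  doWorkOnMaster();  // sees the change exactly once
  doWorkOnMaster();  // nothing to do this time
  return 0;
}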

void CudaComputeNonbonded::finishPatchesOnPe (  ) 

Definition at line 1371 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchesOnPe().

01371                                              {
01372   finishSetOfPatchesOnPe(rankPatches[CkMyRank()]);
01373 }

void CudaComputeNonbonded::finishPatchOnPe ( int  i  ) 

Definition at line 1378 of file CudaComputeNonbonded.C.

Referenced by ComputeMgr::recvFinishPatchOnPe().

01378                                                 {
01379   std::vector<int> v(1, i);
01380   finishSetOfPatchesOnPe(v);
01381 }

void CudaComputeNonbonded::finishReductions (  ) 

Definition at line 1212 of file CudaComputeNonbonded.C.

References ADD_TENSOR_OBJECT, cudaCheck, VirialEnergy::energyElec, VirialEnergy::energyGBIS, VirialEnergy::energySlow, VirialEnergy::energyVdw, CudaTileListKernel::getNumExcluded(), SubmitReduction::item(), NAMD_bug(), Node::Object(), REDUCTION_COMPUTE_CHECKSUM, REDUCTION_ELECT_ENERGY, REDUCTION_ELECT_ENERGY_SLOW, REDUCTION_EXCLUSION_CHECKSUM_CUDA, REDUCTION_LJ_ENERGY, REDUCTION_VIRIAL_NBOND, REDUCTION_VIRIAL_SLOW, Node::simParameters, simParams, SubmitReduction::submit(), VirialEnergy::virial, VirialEnergy::virialSlow, Tensor::xx, Tensor::xy, Tensor::xz, Tensor::yx, Tensor::yy, Tensor::yz, Tensor::zx, Tensor::zy, and Tensor::zz.

Referenced by ComputeMgr::recvFinishReductions().

01212                                             {
01213 
01214   if (CkMyPe() != masterPe)
01215     NAMD_bug("CudaComputeNonbonded::finishReductions() called on non masterPe");
01216   
01217   // fprintf(stderr, "%d finishReductions doSkip %d doVirial %d doEnergy %d\n", CkMyPe(), doSkip, doVirial, doEnergy);
01218 
01219   if (!doSkip) {
01220 
01221     if (doStreaming && (doVirial || doEnergy)) {
01222       // For streaming kernels, we must wait for virials and forces to be copied back to CPU
01223       if (!forceDoneEventRecord)
01224         NAMD_bug("CudaComputeNonbonded::finishReductions, forceDoneEvent not being recorded");
01225       cudaCheck(cudaEventSynchronize(forceDoneEvent));
01226       forceDoneEventRecord = false;
01227     }
01228 
01229     if (doVirial) {
01230       Tensor virialTensor;
01231       virialTensor.xx = h_virialEnergy->virial[0];
01232       virialTensor.xy = h_virialEnergy->virial[1];
01233       virialTensor.xz = h_virialEnergy->virial[2];
01234       virialTensor.yx = h_virialEnergy->virial[3];
01235       virialTensor.yy = h_virialEnergy->virial[4];
01236       virialTensor.yz = h_virialEnergy->virial[5];
01237       virialTensor.zx = h_virialEnergy->virial[6];
01238       virialTensor.zy = h_virialEnergy->virial[7];
01239       virialTensor.zz = h_virialEnergy->virial[8];
01240       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01241       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01242       // fprintf(stderr, "virialTensor %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01243       ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_NBOND, virialTensor);
01244       if (doSlow) {
01245         Tensor virialTensor;
01246         virialTensor.xx = h_virialEnergy->virialSlow[0];
01247         virialTensor.xy = h_virialEnergy->virialSlow[1];
01248         virialTensor.xz = h_virialEnergy->virialSlow[2];
01249         virialTensor.yx = h_virialEnergy->virialSlow[3];
01250         virialTensor.yy = h_virialEnergy->virialSlow[4];
01251         virialTensor.yz = h_virialEnergy->virialSlow[5];
01252         virialTensor.zx = h_virialEnergy->virialSlow[6];
01253         virialTensor.zy = h_virialEnergy->virialSlow[7];
01254         virialTensor.zz = h_virialEnergy->virialSlow[8];
01255         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.xx, virialTensor.xy, virialTensor.xz);
01256         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.yx, virialTensor.yy, virialTensor.yz);
01257         // fprintf(stderr, "virialTensor (slow) %lf %lf %lf\n", virialTensor.zx, virialTensor.zy, virialTensor.zz);
01258         ADD_TENSOR_OBJECT(reduction, REDUCTION_VIRIAL_SLOW, virialTensor);
01259       }
01260     }
01261     if (doEnergy) {
01262       // if (doSlow)
01263       //   printf("energyElec %lf energySlow %lf energyGBIS %lf\n", h_virialEnergy->energyElec, h_virialEnergy->energySlow, h_virialEnergy->energyGBIS);
01264       SimParameters *simParams = Node::Object()->simParameters;
01265       reduction->item(REDUCTION_LJ_ENERGY)    += h_virialEnergy->energyVdw;
01266       reduction->item(REDUCTION_ELECT_ENERGY) += h_virialEnergy->energyElec + ((simParams->GBISOn) ? h_virialEnergy->energyGBIS : 0.0);
01267       // fprintf(stderr, "energyGBIS %lf\n", h_virialEnergy->energyGBIS);
01268       if (doSlow) reduction->item(REDUCTION_ELECT_ENERGY_SLOW) += h_virialEnergy->energySlow;
01269       // fprintf(stderr, "h_virialEnergy->energyElec %lf\n", h_virialEnergy->energyElec);
01270     }
01271 
01272     reduction->item(REDUCTION_EXCLUSION_CHECKSUM_CUDA) += tileListKernel.getNumExcluded();
01273   }
01274   reduction->item(REDUCTION_COMPUTE_CHECKSUM) += 1.;
01275   reduction->submit();
01276 
01277   // Reset flags
01278   doSkip = false;
01279   computesChanged = false;
01280 }
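
Both virial blocks above unpack a flat 9-element, row-major array into a Tensor. A factored-out sketch, with a minimal stand-in for NAMD's Tensor type:

#include <cstdio>

struct Tensor { double xx, xy, xz, yx, yy, yz, zx, zy, zz; };

// Row-major unpacking: v[0..2] is row x, v[3..5] row y, v[6..8] row z.
Tensor unpackVirial(const double v[9]) {
  Tensor t;
  t.xx = v[0]; t.xy = v[1]; t.xz = v[2];
  t.yx = v[3]; t.yy = v[4]; t.yz = v[5];
  t.zx = v[6]; t.zy = v[7]; t.zz = v[8];
  return t;
}

int main() {
  const double v[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  Tensor t = unpackVirial(v);
  printf("%g %g %g\n%g %g %g\n%g %g %g\n",
         t.xx, t.xy, t.xz, t.yx, t.yy, t.yz, t.zx, t.zy, t.zz);
  return 0;
}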

void CudaComputeNonbonded::gbisP2PatchReady ( PatchID  ,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 255 of file CudaComputeNonbonded.C.

References Compute::gbisP2PatchReady().

00255                                                                 {
00256   CmiLock(lock);
00257   Compute::gbisP2PatchReady(pid, seq);
00258   CmiUnlock(lock);
00259 }

void CudaComputeNonbonded::gbisP3PatchReady ( PatchID  ,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 261 of file CudaComputeNonbonded.C.

References Compute::gbisP3PatchReady().

00261                                                                 {
00262   CmiLock(lock);
00263   Compute::gbisP3PatchReady(pid, seq);
00264   CmiUnlock(lock);
00265 }

void CudaComputeNonbonded::initialize (  )  [virtual]

Reimplemented from Compute.

Definition at line 618 of file CudaComputeNonbonded.C.

References cudaCheck, ReductionMgr::Object(), Compute::priority(), REDUCTIONS_BASIC, and ReductionMgr::willSubmit().

Referenced by ComputeMgr::createComputes().

00618                                       {
00619   if (patches.size() > 0) {
00620     // Allocate CUDA version of patches
00621     cudaCheck(cudaSetDevice(deviceID));
00622     allocate_host<CudaPatchRecord>(&cudaPatches, patches.size());
00623 
00624     allocate_host<VirialEnergy>(&h_virialEnergy, 1);
00625     allocate_device<VirialEnergy>(&d_virialEnergy, 1);
00626 
00627 #if CUDA_VERSION >= 5050
00628     int leastPriority, greatestPriority;
00629     cudaCheck(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority));
00630     int priority = (doStreaming) ? leastPriority : greatestPriority;
00631     // int priority = greatestPriority;
00632     cudaCheck(cudaStreamCreateWithPriority(&stream,cudaStreamDefault, priority));
00633 #else
00634     cudaCheck(cudaStreamCreate(&stream));
00635 #endif
00636     cudaCheck(cudaEventCreate(&forceDoneEvent));
00637 
00638     buildExclusions();
00639 
00640     lock = CmiCreateLock();
00641 
00642     reduction = ReductionMgr::Object()->willSubmit(REDUCTIONS_BASIC);
00643   }  
00644 }

void CudaComputeNonbonded::launchWork (  ) 

Definition at line 976 of file CudaComputeNonbonded.C.

References cudaCheck, ComputeNonbondedUtil::cutoff, Compute::gbisPhase, CudaTileListKernel::getEmptyPatches(), CudaTileListKernel::getNumEmptyPatches(), CudaTileListKernel::getNumPatches(), CudaComputeNonbondedKernel::getPatchReadyQueue(), SubmitReduction::item(), NAMD_bug(), Node::Object(), CudaComputeNonbondedKernel::reduceVirialEnergy(), REDUCTION_PAIRLIST_WARNINGS, Flags::savePairlists, Node::simParameters, simParams, and Flags::usePairlists.

Referenced by ComputeMgr::recvLaunchWork().

00976                                       {
00977   if (CkMyPe() != masterPe)
00978     NAMD_bug("CudaComputeNonbonded::launchWork() called on non masterPe");
00979 
00980   beforeForceCompute = CkWallTimer();
00981 
00982   cudaCheck(cudaSetDevice(deviceID));
00983   SimParameters *simParams = Node::Object()->simParameters;
00984 
00985   //execute only during GBIS phase 1, or if not using GBIS
00986   if (!simParams->GBISOn || gbisPhase == 1) {
00987 
00988     if ( atomsChanged || computesChanged ) {
00989       // Invalidate pair lists
00990       pairlistsValid = false;
00991       pairlistTolerance = 0.0f;
00992     }
00993 
00994     // Get maximum atom movement and patch tolerance
00995     float maxAtomMovement = 0.0f;
00996     float maxPatchTolerance = 0.0f;
00997     getMaxMovementTolerance(maxAtomMovement, maxPatchTolerance);
00998     // Update pair-list cutoff
00999     Flags &flags = patches[0].patch->flags;
01000     savePairlists = false;
01001     usePairlists = false;
01002     if ( flags.savePairlists ) {
01003       savePairlists = true;
01004       usePairlists = true;
01005     } else if ( flags.usePairlists ) {
01006       if ( ! pairlistsValid ||
01007            ( 2. * maxAtomMovement > pairlistTolerance ) ) {
01008         reduction->item(REDUCTION_PAIRLIST_WARNINGS) += 1;
01009       } else {
01010         usePairlists = true;
01011       }
01012     }
01013     if ( ! usePairlists ) {
01014       pairlistsValid = false;
01015     }
01016     float plcutoff = cutoff;
01017     if ( savePairlists ) {
01018       pairlistsValid = true;
01019       pairlistTolerance = 2. * maxPatchTolerance;
01020       plcutoff += pairlistTolerance;
01021     }
01022     plcutoff2 = plcutoff * plcutoff;
01023 
01024     // if (atomsChanged)
01025     //   CkPrintf("plcutoff = %f  listTolerance = %f  save = %d  use = %d\n",
01026     //     plcutoff, pairlistTolerance, savePairlists, usePairlists);
01027 
01028   } // if (!simParams->GBISOn || gbisPhase == 1)
01029 
01030   // Calculate PME & VdW forces
01031   if (!simParams->GBISOn || gbisPhase == 1) {
01032     doForce();
01033     if (doStreaming) {
01034       patchReadyQueue = nonbondedKernel.getPatchReadyQueue();
01035       patchReadyQueueLen = tileListKernel.getNumPatches();
01036       patchReadyQueueNext = 0;
01037       // Fill in empty patches [0 ... patchReadyQueueNext-1] at the top
01038       int numEmptyPatches = tileListKernel.getNumEmptyPatches();
01039       int* emptyPatches = tileListKernel.getEmptyPatches();
01040       for (int i=0;i < numEmptyPatches;i++) {
01041         patchReadyQueue[i] = emptyPatches[i];
01042       }
01043       if (patchReadyQueueLen != patches.size())
01044         NAMD_bug("CudaComputeNonbonded::launchWork, invalid patchReadyQueueLen");
01045     }
01046   }
01047 
01048   // For GBIS phase 1 at pairlist update, we must re-sort the tile list
01049   // before calling doGBISphase1().
01050   if (atomsChanged && simParams->GBISOn && gbisPhase == 1) {
01051     // In this code path doGBISphase1() is called in forceDone()
01052     forceDoneSetCallback();
01053     return;
01054   }
01055 
01056   // GBIS Phases
01057   if (simParams->GBISOn) {
01058     if (gbisPhase == 1) {
01059       doGBISphase1();
01060     } else if (gbisPhase == 2) {
01061       doGBISphase2(); 
01062     } else if (gbisPhase == 3) {
01063       doGBISphase3(); 
01064     }
01065   }
01066 
01067   // Copy forces to host
01068   if (!simParams->GBISOn || gbisPhase == 3) {
01069     if (!doStreaming) {
01070       copy_DtoH<float4>(d_forces, h_forces, atomStorageSize, stream);
01071       if (doSlow) copy_DtoH<float4>(d_forcesSlow, h_forcesSlow, atomStorageSize, stream);
01072     }
01073   }
01074 
01075   if ((!simParams->GBISOn || gbisPhase == 2) && (doEnergy || doVirial)) {
01076     // For GBIS, energies are ready after phase 2
01077     nonbondedKernel.reduceVirialEnergy(tileListKernel,
01078       atomStorageSize, doEnergy, doVirial, doSlow, simParams->GBISOn,
01079       d_forces, d_forcesSlow, d_virialEnergy, stream);
01080     copy_DtoH<VirialEnergy>(d_virialEnergy, h_virialEnergy, 1, stream);
01081   }
01082 
01083   // Setup call back
01084   forceDoneSetCallback();
01085 }
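
The pairlist bookkeeping in the first half of launchWork() follows the usual Verlet-list rule: when lists are saved, the list cutoff is padded by twice the maximum patch tolerance, and a saved list remains usable only while twice the maximum atom movement stays within that padding. A condensed restatement (plain C++ with the flags passed in explicitly; the pairlist-warning reduction is omitted):

#include <cstdio>

struct PairlistState {
  bool  valid = false;
  float tolerance = 0.0f;   // set to 2 * maxPatchTolerance at save time
};

// Returns the squared list cutoff for this step and updates the state.
float pairlistCutoff2(PairlistState &s, float cutoff,
                      float maxAtomMovement, float maxPatchTolerance,
                      bool saveLists, bool useLists) {
  bool use = saveLists;
  if (!use && useLists && s.valid && 2.0f * maxAtomMovement <= s.tolerance)
    use = true;               // the saved list still covers the motion
  if (!use) s.valid = false;  // otherwise fall back to the plain cutoff
  float plcutoff = cutoff;
  if (saveLists) {
    s.valid = true;
    s.tolerance = 2.0f * maxPatchTolerance;
    plcutoff += s.tolerance;  // pad the cutoff by the tolerance margin
  }
  return plcutoff * plcutoff;
}

int main() {
  PairlistState s;
  // Save step: cutoff 12 padded by 2*0.5 -> plcutoff 13, squared 169.
  printf("%g\n", pairlistCutoff2(s, 12.0f, 0.0f, 0.5f, true, false));
  // Reuse step: atoms moved 0.4, and 2*0.4 <= 1.0, so the list stays valid.
  pairlistCutoff2(s, 12.0f, 0.4f, 0.5f, false, true);
  printf("valid after reuse check: %d\n", s.valid);
  return 0;
}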

void CudaComputeNonbonded::messageEnqueueWork (  ) 

Definition at line 866 of file CudaComputeNonbonded.C.

References WorkDistrib::messageEnqueueWork(), and NAMD_bug().

Referenced by ComputeMgr::recvMessageEnqueueWork().

00866                                               {
00867   if (masterPe != CkMyPe())
00868     NAMD_bug("CudaComputeNonbonded::messageEnqueueWork() must be called from masterPe");
00869   WorkDistrib::messageEnqueueWork(this);
00870 }

int CudaComputeNonbonded::noWork (  )  [virtual]

Reimplemented from Compute.

Definition at line 891 of file CudaComputeNonbonded.C.

References ComputeMgr::sendMessageEnqueueWork().

00891                                  {
00892   // Simply enqueue doWork on masterPe and return "no work"
00893   computeMgr->sendMessageEnqueueWork(masterPe, this);
00894   return 1;
00895 }
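
Note the inversion here: noWork() always reports "no work" to the local scheduler, while the message sent to masterPe is what eventually triggers doWork() there (via messageEnqueueWork()). The toy, single-threaded sketch below shows just that control flow; the queue and dispatcher are hypothetical stand-ins for ComputeMgr and the Charm++ scheduler:

#include <cstdio>
#include <functional>
#include <queue>

// Toy single-threaded stand-in for the masterPe message queue.
std::queue<std::function<void()>> masterPeQueue;

struct Compute {
  virtual int  noWork() { return 0; }  // 0 = scheduler enqueues doWork locally
  virtual void doWork() {}
  virtual ~Compute() {}
};

struct MasterDelegatingCompute : Compute {
  int noWork() override {
    masterPeQueue.push([this] { doWork(); });  // forward the real work
    return 1;  // report "no work" so the local enqueue is skipped
  }
  void doWork() override { printf("doWork() on masterPe\n"); }
};

int main() {
  MasterDelegatingCompute c;
  if (!c.noWork()) c.doWork();      // local path never taken
  while (!masterPeQueue.empty()) {  // masterPe drains its queue
    masterPeQueue.front()();
    masterPeQueue.pop();
  }
  return 0;
}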

void CudaComputeNonbonded::openBoxesOnPe (  ) 

Definition at line 872 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendLaunchWork().

Referenced by ComputeMgr::recvOpenBoxesOnPe().

00872                                          {
00873   if (rankPatches[CkMyRank()].size() == 0)
00874     NAMD_bug("CudaComputeNonbonded::openBoxesOnPe, empty rank");
00875   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00876     openBox(rankPatches[CkMyRank()][i]);
00877   }
00878   bool done = false;
00879   CmiLock(lock);
00880   patchesCounter -= rankPatches[CkMyRank()].size();
00881   if (patchesCounter == 0) {
00882     patchesCounter = getNumPatches();
00883     done = true;
00884   }
00885   CmiUnlock(lock);
00886   if (done) {
00887     computeMgr->sendLaunchWork(masterPe, this);
00888   }
00889 }
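
openBoxesOnPe() and skipPatchesOnPe() share the same completion idiom: each rank subtracts its patch count from a node-wide counter under a lock, and the rank that drives the counter to zero re-arms it and notifies masterPe. A distilled sketch, with std::mutex standing in for CmiLock:

#include <cstdio>
#include <mutex>

struct PatchCounter {
  int        count;   // patches remaining this step
  const int  total;   // getNumPatches()
  std::mutex lock;    // stands in for the CmiLock
  explicit PatchCounter(int n) : count(n), total(n) {}

  // Returns true for exactly one caller per step: the rank that
  // completes the countdown and should notify masterPe.
  bool finish(int patchesOnThisRank) {
    std::lock_guard<std::mutex> guard(lock);
    count -= patchesOnThisRank;
    if (count == 0) { count = total; return true; }  // re-arm for next step
    return false;
  }
};

int main() {
  PatchCounter pc(8);           // 8 patches split over two ranks (5 + 3)
  printf("%d\n", pc.finish(5)); // 0: not done yet
  printf("%d\n", pc.finish(3)); // 1: this rank notifies masterPe
  return 0;
}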

void CudaComputeNonbonded::patchReady ( PatchID  ,
int  doneMigration,
int  seq 
) [virtual]

Reimplemented from Compute.

Definition at line 243 of file CudaComputeNonbonded.C.

References NAMD_bug(), and Compute::patchReady().

00243                                                                              {
00244   if (doneMigration) {
00245     int i = findPid(pid);
00246     if (i == -1)
00247       NAMD_bug("CudaComputeNonbonded::patchReady, Patch ID not found");
00248     updatePatch(i);
00249   }
00250   CmiLock(lock);
00251   Compute::patchReady(pid, doneMigration, seq);
00252   CmiUnlock(lock);
00253 }

void CudaComputeNonbonded::registerComputePair ( ComputeID  cid,
PatchID *  pid,
int *  trans 
)

Definition at line 184 of file CudaComputeNonbonded.C.

References PatchMap::center(), PatchMap::Object(), Vector::x, Vector::y, and Vector::z.

00184                                                                                       {
00185   computesChanged = true;
00186   addPatch(pid[0]);
00187   addPatch(pid[1]);
00188   PatchMap* patchMap = PatchMap::Object();
00189   int t1 = trans[0];
00190   int t2 = trans[1];
00191   Vector offset = patchMap->center(pid[0]) - patchMap->center(pid[1]);
00192   offset.x += (t1%3-1) - (t2%3-1);
00193   offset.y += ((t1/3)%3-1) - ((t2/3)%3-1);
00194   offset.z += (t1/9-1) - (t2/9-1);
00195   addCompute(cid, pid[0], pid[1], offset);
00196 }
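
The offset arithmetic above decodes each transform code t (0..26) into a lattice shift in {-1,0,1}^3 via x = t%3-1, y = (t/3)%3-1, z = t/9-1; the pair offset is then the difference of the two shifts added to the patch-center separation. A quick check of the decoding:

#include <cstdio>

int main() {
  for (int t = 0; t < 27; t++) {
    int x = t % 3 - 1;        // fastest-varying axis
    int y = (t / 3) % 3 - 1;
    int z = t / 9 - 1;        // slowest-varying axis
    printf("trans %2d -> (%2d, %2d, %2d)\n", t, x, y, z);
  }
  // trans 13 -> (0, 0, 0): the identity transform sits at the center.
  return 0;
}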

void CudaComputeNonbonded::registerComputeSelf ( ComputeID  cid,
PatchID  pid 
)

Definition at line 174 of file CudaComputeNonbonded.C.

00174                                                                          {
00175   computesChanged = true;
00176   addPatch(pid);
00177   addCompute(cid, pid, pid, 0.);
00178 }

void CudaComputeNonbonded::skipPatchesOnPe (  ) 

Definition at line 691 of file CudaComputeNonbonded.C.

References Compute::getNumPatches(), NAMD_bug(), and ComputeMgr::sendFinishReductions().

Referenced by ComputeMgr::recvSkipPatchesOnPe().

00691                                            {
00692   if (rankPatches[CkMyRank()].size() == 0)
00693     NAMD_bug("CudaComputeNonbonded::skipPatchesOnPe, empty rank");
00694   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00695     skipPatch(rankPatches[CkMyRank()][i]);
00696   }
00697   bool done = false;
00698   CmiLock(lock);
00699   patchesCounter -= rankPatches[CkMyRank()].size();
00700   if (patchesCounter == 0) {
00701     patchesCounter = getNumPatches();
00702     done = true;
00703   }
00704   CmiUnlock(lock);
00705   if (done) {
00706     // Reduction must be done on masterPe
00707     computeMgr->sendFinishReductions(masterPe, this);
00708   }
00709 }

void CudaComputeNonbonded::unregisterBoxesOnPe (  ) 

Definition at line 162 of file CudaComputeNonbonded.C.

References NAMD_bug().

Referenced by ComputeMgr::recvUnregisterBoxesOnPe().

00162                                                {
00163   if (rankPatches[CkMyRank()].size() == 0)
00164     NAMD_bug("CudaComputeNonbonded::unregisterBoxesOnPe, empty rank");
00165   for (int i=0;i < rankPatches[CkMyRank()].size();i++) {
00166     unregisterBox(rankPatches[CkMyRank()][i]);
00167   }
00168 }


The documentation for this class was generated from the following files:

CudaComputeNonbonded.h
CudaComputeNonbonded.C