NAMD
Public Member Functions | Static Public Member Functions | List of all members
ComputeCUDAMgr Class Reference

#include <ComputeCUDAMgr.h>

Inheritance diagram for ComputeCUDAMgr:

Public Member Functions

 ComputeCUDAMgr ()
 
 ComputeCUDAMgr (CkMigrateMessage *)
 
 ~ComputeCUDAMgr ()
 
void initialize (CkQdMsg *msg)
 
void initialize_devices (CkQdMsg *msg)
 
void update ()
 
CudaComputeNonbonded * createCudaComputeNonbonded (ComputeID c)
 
CudaComputeNonbonded * getCudaComputeNonbonded ()
 
CudaPmeOneDevice * createCudaPmeOneDevice ()
 
CudaPmeOneDevice * getCudaPmeOneDevice ()
 
std::shared_ptr< CudaGlobalMasterServer > createCudaGlobalMaster ()
 
std::shared_ptr< CudaGlobalMasterServer > getCudaGlobalMaster ()
 

Static Public Member Functions

static ComputeCUDAMgr * getComputeCUDAMgr ()
 

Detailed Description

Definition at line 16 of file ComputeCUDAMgr.h.

Constructor & Destructor Documentation

◆ ComputeCUDAMgr() [1/2]

ComputeCUDAMgr::ComputeCUDAMgr ( )

Definition at line 18 of file ComputeCUDAMgr.C.

18  {
19  // __sdag_init();
20  numDevices = 0;
21  // numNodesContributed = 0;
22  // numDevicesMax = 0;
23  cudaPmeOneDevice = NULL; // XXX is this needed?
24 #ifdef NAMD_CUDA
25  cudaGlobalMasterObject = nullptr;
26 #endif // NAMD_CUDA
27 }

◆ ComputeCUDAMgr() [2/2]

ComputeCUDAMgr::ComputeCUDAMgr ( CkMigrateMessage *  )

Definition at line 32 of file ComputeCUDAMgr.C.

References NAMD_bug().

32  {
33  // __sdag_init();
34  NAMD_bug("ComputeCUDAMgr cannot be migrated");
35  numDevices = 0;
36  // numNodesContributed = 0;
37  // numDevicesMax = 0;
38  cudaPmeOneDevice = NULL; // XXX is this needed?
39 #ifdef NAMD_CUDA
40  cudaGlobalMasterObject = nullptr;
41 #endif // NAMD_CUDA
42 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195

◆ ~ComputeCUDAMgr()

ComputeCUDAMgr::~ComputeCUDAMgr ( )

Definition at line 47 of file ComputeCUDAMgr.C.

47  {
48  for (int i=0;i < numDevices;i++) {
49  if (cudaNonbondedTablesList[i] != NULL) delete cudaNonbondedTablesList[i];
50  if (cudaComputeNonbondedList[i] != NULL) delete cudaComputeNonbondedList[i];
51 #ifdef BONDED_CUDA
52  if (computeBondedCUDAList[i] != NULL) delete computeBondedCUDAList[i];
53 #endif
54  }
55  delete cudaPmeOneDevice;
56 }

Member Function Documentation

◆ createCudaComputeNonbonded()

CudaComputeNonbonded * ComputeCUDAMgr::createCudaComputeNonbonded ( ComputeID  c)

Definition at line 238 of file ComputeCUDAMgr.C.

References SimParameters::CUDASOAintegrate, deviceCUDA, SimParameters::GBISOn, DeviceCUDA::getDeviceID(), DeviceCUDA::getNoStreaming(), NAMD_bug(), Node::Object(), and Node::simParameters.

Referenced by createCudaComputeNonbonded().

238  {
239  int deviceID = deviceCUDA->getDeviceID();
240  if (cudaComputeNonbondedList.at(deviceID) != NULL)
241  NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded called twice");
242  if (cudaNonbondedTablesList.at(deviceID) == NULL)
243  NAMD_bug("ComputeCUDAMgr::createCudaComputeNonbonded, non-bonded CUDA tables not created");
244  //bool doStreaming = !deviceCUDA->getNoStreaming() && !Node::Object()->simParameters->GBISOn && !Node::Object()->simParameters->CUDASOAintegrate;
246  cudaComputeNonbondedList[deviceID] = new CudaComputeNonbonded(c, deviceID, *cudaNonbondedTablesList[deviceID], doStreaming);
247  return cudaComputeNonbondedList[deviceID];
248 }
static Node * Object()
Definition: Node.h:86
SimParameters * simParameters
Definition: Node.h:181
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int getDeviceID()
Definition: DeviceCUDA.h:144
int getNoStreaming()
Definition: DeviceCUDA.h:130
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

◆ createCudaGlobalMaster()

std::shared_ptr< CudaGlobalMasterServer > ComputeCUDAMgr::createCudaGlobalMaster ( )

Definition at line 294 of file ComputeCUDAMgr.C.

References deviceCUDA, endi(), DeviceCUDA::getGlobalDevice(), iINFO(), iout, Node::Object(), Node::simParameters, and simParams.

Referenced by ComputeMgr::recvCudaGlobalMasterCreateMsg().

294  {
295  iout << iINFO << "Creating CUDAGlobalMaster on PE " << CkMyPe() << '\n' << endi;
296  if (cudaGlobalMasterObject) {
297  return cudaGlobalMasterObject;
298  }
299  const int deviceID = deviceCUDA->getGlobalDevice();
301  cudaGlobalMasterObject = std::make_shared<CudaGlobalMasterServer>(deviceID, simParams->cudaGlobalProfilingFreq);
302  return cudaGlobalMasterObject;
303 }
static Node * Object()
Definition: Node.h:86
std::ostream & iINFO(std::ostream &s)
Definition: InfoStream.C:81
SimParameters * simParameters
Definition: Node.h:181
std::ostream & endi(std::ostream &s)
Definition: InfoStream.C:54
#define iout
Definition: InfoStream.h:51
int getGlobalDevice() const
Definition: DeviceCUDA.h:171
#define simParams
Definition: Output.C:129
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

◆ createCudaPmeOneDevice()

CudaPmeOneDevice * ComputeCUDAMgr::createCudaPmeOneDevice ( )

Definition at line 202 of file ComputeCUDAMgr.C.

References PmeGrid::block1, PmeGrid::block2, PmeGrid::block3, deviceCUDA, PmeGrid::dim2, PmeGrid::dim3, DeviceCUDA::getPmeDevice(), DeviceCUDA::getPmeDeviceIndex(), PmeGrid::K1, PmeGrid::K2, PmeGrid::K3, Node::Object(), PmeGrid::order, Node::simParameters, simParams, PmeGrid::xBlocks, PmeGrid::yBlocks, and PmeGrid::zBlocks.

202  {
203  // initialize pmeGrid from simParams
205  PmeGrid pmeGrid;
206  pmeGrid.K1 = simParams->PMEGridSizeX;
207  pmeGrid.K2 = simParams->PMEGridSizeY;
208  pmeGrid.K3 = simParams->PMEGridSizeZ;
209  pmeGrid.order = simParams->PMEInterpOrder;
210  pmeGrid.dim2 = pmeGrid.K2;
211  pmeGrid.dim3 = 2 * (pmeGrid.K3/2 + 1);
212  // override settings for PME pencils
213  pmeGrid.xBlocks = 1;
214  pmeGrid.yBlocks = 1;
215  pmeGrid.zBlocks = 1;
216  pmeGrid.block1 = pmeGrid.K1;
217  pmeGrid.block2 = pmeGrid.K2;
218  pmeGrid.block3 = pmeGrid.K3;
219  // use shared deviceID class
220  int deviceID = 0;
221  int deviceIndex = 0;
222 #ifdef NODEGROUP_FORCE_REGISTER
223  deviceID = deviceCUDA->getPmeDevice();
224  deviceIndex = deviceCUDA->getPmeDeviceIndex();
225 #endif
226  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
227  cudaPmeOneDevice = new CudaPmeOneDevice(pmeGrid, deviceID, deviceIndex);
228  return cudaPmeOneDevice;
229 }
static Node * Object()
Definition: Node.h:86
int dim2
Definition: PmeBase.h:22
int zBlocks
Definition: PmeBase.h:25
int dim3
Definition: PmeBase.h:22
int K2
Definition: PmeBase.h:21
SimParameters * simParameters
Definition: Node.h:181
int K1
Definition: PmeBase.h:21
int getPmeDevice()
Definition: DeviceCUDA.h:165
int block1
Definition: PmeBase.h:24
int block2
Definition: PmeBase.h:24
int yBlocks
Definition: PmeBase.h:25
int getPmeDeviceIndex()
Definition: DeviceCUDA.h:167
int order
Definition: PmeBase.h:23
int block3
Definition: PmeBase.h:24
#define simParams
Definition: Output.C:129
int K3
Definition: PmeBase.h:21
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23
int xBlocks
Definition: PmeBase.h:25

◆ getComputeCUDAMgr()

ComputeCUDAMgr * ComputeCUDAMgr::getComputeCUDAMgr ( )
static

Definition at line 193 of file ComputeCUDAMgr.C.

References NAMD_bug().

Referenced by createCudaComputeNonbonded(), getCudaComputeNonbonded(), ComputeMgr::recvCudaGlobalMasterCreateMsg(), ComputeMgr::recvCudaGlobalMasterRemoveMsg(), ComputeMgr::recvCudaGlobalMasterUpdateMsg(), and SimParameters::scriptSet().

193  {
194  // Get pointer to ComputeCUDAMgr on this node
195  CProxy_ComputeCUDAMgr computeCUDAMgrProxy = CkpvAccess(BOCclass_group).computeCUDAMgr;
196  ComputeCUDAMgr* computeCUDAMgr = computeCUDAMgrProxy.ckLocalBranch();
197  if (computeCUDAMgr == NULL)
198  NAMD_bug("getComputeCUDAMgr, unable to locate local branch of BOC entry ComputeCUDAMgr");
199  return computeCUDAMgr;
200 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195

◆ getCudaComputeNonbonded()

CudaComputeNonbonded * ComputeCUDAMgr::getCudaComputeNonbonded ( )

Definition at line 253 of file ComputeCUDAMgr.C.

References deviceCUDA, DeviceCUDA::getDeviceID(), and NAMD_bug().

Referenced by getCudaComputeNonbonded().

253  {
254  // Get device ID for this Pe
255  int deviceID = deviceCUDA->getDeviceID();
256  CudaComputeNonbonded* p = cudaComputeNonbondedList[deviceID];
257  if (p == NULL)
258  NAMD_bug("ComputeCUDAMgr::getCudaComputeNonbonded(), device not created yet");
259  return p;
260 }
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int getDeviceID()
Definition: DeviceCUDA.h:144
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

◆ getCudaGlobalMaster()

std::shared_ptr< CudaGlobalMasterServer > ComputeCUDAMgr::getCudaGlobalMaster ( )

Definition at line 290 of file ComputeCUDAMgr.C.

Referenced by ComputeMgr::recvCudaGlobalMasterCreateMsg(), ComputeMgr::recvCudaGlobalMasterRemoveMsg(), and ComputeMgr::recvCudaGlobalMasterUpdateMsg().

290  {
291  return cudaGlobalMasterObject;
292 }

◆ getCudaPmeOneDevice()

CudaPmeOneDevice * ComputeCUDAMgr::getCudaPmeOneDevice ( )

Definition at line 231 of file ComputeCUDAMgr.C.

231  {
232  return cudaPmeOneDevice;
233 }

◆ initialize()

void ComputeCUDAMgr::initialize ( CkQdMsg *  msg)

Definition at line 62 of file ComputeCUDAMgr.C.

References cudaCheck, deviceCUDA, DeviceCUDA::getDeviceCount(), DeviceCUDA::getDeviceIDbyRank(), DeviceCUDA::getNumDevice(), and DeviceCUDA::isGpuReservedPme().

62  {
63  if (msg != NULL) delete msg;
64 
65  numDevices = deviceCUDA->getDeviceCount();
66 #ifdef NODEGROUP_FORCE_REGISTER
67  CProxy_PatchData cpdata(CkpvAccess(BOCclass_group).patchData);
68  PatchData *pdata = cpdata.ckLocalBranch();
69  int ndevs = deviceCUDA->getNumDevice() + 1*deviceCUDA->isGpuReservedPme();
70  pdata->devData.resize(numDevices);
71 
72  {
73  // Pointers to SOA integration data
74  allocate_host<bool*>(&(pdata->h_devHasForces),ndevs);
75  allocate_host<double*>(&(pdata->h_soa_fb_x), ndevs);
76  allocate_host<double*>(&(pdata->h_soa_fb_y), ndevs);
77  allocate_host<double*>(&(pdata->h_soa_fb_z), ndevs);
78 
79  allocate_host<double*>(&(pdata->h_soa_fn_x), ndevs);
80  allocate_host<double*>(&(pdata->h_soa_fn_y), ndevs);
81  allocate_host<double*>(&(pdata->h_soa_fn_z), ndevs);
82 
83  allocate_host<double*>(&(pdata->h_soa_fs_x), ndevs);
84  allocate_host<double*>(&(pdata->h_soa_fs_y), ndevs);
85  allocate_host<double*>(&(pdata->h_soa_fs_z), ndevs);
86 
87  allocate_host<double*>(&(pdata->h_soa_pos_x), ndevs);
88  allocate_host<double*>(&(pdata->h_soa_pos_y), ndevs);
89  allocate_host<double*>(&(pdata->h_soa_pos_z), ndevs);
90 
91  allocate_host<double*>(&(pdata->h_soa_vel_x), deviceCUDA->getNumDevice());
92  allocate_host<double*>(&(pdata->h_soa_vel_y), deviceCUDA->getNumDevice());
93  allocate_host<double*>(&(pdata->h_soa_vel_z), deviceCUDA->getNumDevice());
94 
95  allocate_host<float*> (&(pdata->h_soa_charge), deviceCUDA->getNumDevice());
96 
97  allocate_host<int*> (&(pdata->h_soa_id), deviceCUDA->getNumDevice());
98  allocate_host<int*> (&(pdata->h_soa_vdwType), deviceCUDA->getNumDevice());
99  allocate_host<int*> (&(pdata->h_soa_sortOrder), deviceCUDA->getNumDevice());
100  allocate_host<int*> (&(pdata->h_soa_unsortOrder), deviceCUDA->getNumDevice());
101  allocate_host<double3*>(&(pdata->h_soa_patchCenter), deviceCUDA->getNumDevice());
102  allocate_host<int4*> (&(pdata->h_soa_migrationDestination), deviceCUDA->getNumDevice());
103  allocate_host<int*> (&(pdata->h_soa_sortSoluteIndex), deviceCUDA->getNumDevice());
104 
105  allocate_host<int*> (&(pdata->h_soa_partition), deviceCUDA->getNumDevice());
106 
107  allocate_host<FullAtom*>(&(pdata->h_atomdata_AoS), deviceCUDA->getNumDevice());
108  allocate_host<CudaLocalRecord*>(&(pdata->h_peer_record), deviceCUDA->getNumDevice());
109 
110  allocate_host<int*>(&(pdata->h_tupleCount.bond), deviceCUDA->getNumDevice());
111  allocate_host<int*>(&(pdata->h_tupleCount.angle), deviceCUDA->getNumDevice());
112  allocate_host<int*>(&(pdata->h_tupleCount.dihedral), deviceCUDA->getNumDevice());
113  allocate_host<int*>(&(pdata->h_tupleCount.improper), deviceCUDA->getNumDevice());
114  allocate_host<int*>(&(pdata->h_tupleCount.modifiedExclusion), deviceCUDA->getNumDevice());
115  allocate_host<int*>(&(pdata->h_tupleCount.exclusion), deviceCUDA->getNumDevice());
116  allocate_host<int*>(&(pdata->h_tupleCount.crossterm), deviceCUDA->getNumDevice());
117 
118  allocate_host<int*>(&(pdata->h_tupleOffset.bond), deviceCUDA->getNumDevice());
119  allocate_host<int*>(&(pdata->h_tupleOffset.angle), deviceCUDA->getNumDevice());
120  allocate_host<int*>(&(pdata->h_tupleOffset.dihedral), deviceCUDA->getNumDevice());
121  allocate_host<int*>(&(pdata->h_tupleOffset.improper), deviceCUDA->getNumDevice());
122  allocate_host<int*>(&(pdata->h_tupleOffset.modifiedExclusion), deviceCUDA->getNumDevice());
123  allocate_host<int*>(&(pdata->h_tupleOffset.exclusion), deviceCUDA->getNumDevice());
124  allocate_host<int*>(&(pdata->h_tupleOffset.crossterm), deviceCUDA->getNumDevice());
125 
126  allocate_host<CudaBondStage*>(&(pdata->h_tupleDataStage.bond), deviceCUDA->getNumDevice());
127  allocate_host<CudaAngleStage*>(&(pdata->h_tupleDataStage.angle), deviceCUDA->getNumDevice());
128  allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.dihedral), deviceCUDA->getNumDevice());
129  allocate_host<CudaDihedralStage*>(&(pdata->h_tupleDataStage.improper), deviceCUDA->getNumDevice());
130  allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.modifiedExclusion), deviceCUDA->getNumDevice());
131  allocate_host<CudaExclusionStage*>(&(pdata->h_tupleDataStage.exclusion), deviceCUDA->getNumDevice());
132  allocate_host<CudaCrosstermStage*>(&(pdata->h_tupleDataStage.crossterm), deviceCUDA->getNumDevice());
133  }
134 
135  // Allocate the work queues
136  allocate_host<unsigned int*>(&(pdata->d_queues), ndevs);
137  allocate_host<unsigned int>(&(pdata->d_queueCounters), ndevs);
138 
139  cudaCheck(cudaMemset(pdata->d_queueCounters, 0, sizeof(unsigned int)*ndevs));
140 
141  pdata->migrationFlagPerDevice.resize(deviceCUDA->getNumDevice());
142 
143  pdata->tupleReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
144  pdata->atomReallocationFlagPerDevice.resize(deviceCUDA->getNumDevice());
145  pdata->maxNumBonds.store(0);
146  pdata->maxNumAngles.store(0);
147  pdata->maxNumDihedrals.store(0);
148  pdata->maxNumImpropers.store(0);
149  pdata->maxNumModifiedExclusions.store(0);
150  pdata->maxNumExclusions.store(0);
151  pdata->maxNumCrossterms.store(0);
152  pdata->devicePatchMapFlag.resize(CkNumPes(), 0);
153 #ifdef NAMD_NCCL_ALLREDUCE
154  // Allocate NCCL-related stuff
155  deviceCUDA->setupNcclUniqueId();
156  // After I do this, I can go ahead and register it in patchData
157  pdata->ncclId = deviceCUDA->getNcclUniqueId(); // registered in ngroup
158 #endif
159 #endif
160 
161  // Create pointers to devices
162  cudaNonbondedTablesList.resize(numDevices, NULL);
163  cudaComputeNonbondedList.resize(numDevices, NULL);
164 #ifdef BONDED_CUDA
165  computeBondedCUDAList.resize(numDevices, NULL);
166 #endif
167  if (cudaPmeOneDevice != NULL) delete cudaPmeOneDevice;
168  cudaPmeOneDevice = NULL;
169 
170  // Create CUDA non-bonded tables for all devices that are used for computation
171  for (int i=0;i < deviceCUDA->getNumDevice();i++) {
172  int deviceID = deviceCUDA->getDeviceIDbyRank(i);
173  cudaNonbondedTablesList[deviceID] = new CudaNonbondedTables(deviceID);
174  }
175 
176 
177 
178 }
int getDeviceCount()
Definition: DeviceCUDA.h:124
int getNumDevice()
Definition: DeviceCUDA.h:125
bool isGpuReservedPme()
Definition: DeviceCUDA.h:164
int getDeviceIDbyRank(int rank)
Definition: DeviceCUDA.h:145
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

◆ initialize_devices()

void ComputeCUDAMgr::initialize_devices ( CkQdMsg *  msg)

◆ update()

void ComputeCUDAMgr::update ( )

Definition at line 184 of file ComputeCUDAMgr.C.

References deviceCUDA, DeviceCUDA::getDeviceIDbyRank(), DeviceCUDA::getNumDevice(), and NAMD_bug().

Referenced by SimParameters::scriptSet().

184  {
185  if ( CkMyRank() ) NAMD_bug("ComputeCUDAMgr::update() should be called only by rank 0");
186  for (int i=0; i < deviceCUDA->getNumDevice(); i++) {
187  int deviceID = deviceCUDA->getDeviceIDbyRank(i);
188  // calls update function from CudaNonbondedTables
189  cudaNonbondedTablesList[deviceID]->updateTables();
190  }
191 }
int getNumDevice()
Definition: DeviceCUDA.h:125
void NAMD_bug(const char *err_msg)
Definition: common.C:195
int getDeviceIDbyRank(int rank)
Definition: DeviceCUDA.h:145
__thread DeviceCUDA * deviceCUDA
Definition: DeviceCUDA.C:23

The documentation for this class was generated from the following files: