NAMD
Public Member Functions | List of all members
DeviceCUDA Class Reference

#include <DeviceCUDA.h>

Public Member Functions

 DeviceCUDA ()
 
 ~DeviceCUDA ()
 
void initialize ()
 
int getDeviceCount ()
 
int getNumDevice ()
 
bool device_shared_with_pe (int pe)
 
bool one_device_per_node ()
 
int getNoStreaming ()
 
int getNoMergeGrids ()
 
int getMergeGrids ()
 
void setMergeGrids (const int val)
 
bool getSharedGpu ()
 
int getNextPeSharingGpu ()
 
int getMasterPe ()
 
int getNumPesSharingDevice ()
 
int getPesSharingDevice (const int i)
 
int getGpuIsMine ()
 
void setGpuIsMine (const int val)
 
int getDeviceID ()
 
int getDeviceIDbyRank (int rank)
 
int getDeviceIDforPe (int pe)
 
int getMasterPeForDeviceID (int deviceID)
 
int getMaxNumThreads ()
 
int getMaxNumBlocks ()
 

Detailed Description

Definition at line 38 of file DeviceCUDA.h.

Constructor & Destructor Documentation

DeviceCUDA::DeviceCUDA ( )

Definition at line 84 of file DeviceCUDA.C.

84 : deviceProps(NULL), devices(NULL) {}
DeviceCUDA::~DeviceCUDA ( )

Definition at line 369 of file DeviceCUDA.C.

369  {
370  if (deviceProps != NULL) delete [] deviceProps;
371  if (devices != NULL) delete [] devices;
372  delete [] pesSharingDevice;
373 }

Member Function Documentation

bool DeviceCUDA::device_shared_with_pe ( int  pe)

Definition at line 392 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

392  {
393  for ( int i=0; i<numPesSharingDevice; ++i ) {
394  if ( pesSharingDevice[i] == pe ) return true;
395  }
396  return false;
397 }
int DeviceCUDA::getDeviceCount ( )
inline

Definition at line 92 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

92 {return deviceCount;}
int DeviceCUDA::getDeviceID ( )
inline
int DeviceCUDA::getDeviceIDbyRank ( int  rank)
inline
int DeviceCUDA::getDeviceIDforPe ( int  pe)

Definition at line 378 of file DeviceCUDA.C.

References deviceIDList.

378  {
379  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
380 }
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:72
int DeviceCUDA::getGpuIsMine ( )
inline

Definition at line 109 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::doWork().

109 {return gpuIsMine;}
int DeviceCUDA::getMasterPe ( )
inline
int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID)

Definition at line 385 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

385  {
386  return masterPeList[deviceID % deviceCount] - 1;
387 }
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:75
int DeviceCUDA::getMaxNumBlocks ( )
int DeviceCUDA::getMaxNumThreads ( )

Definition at line 413 of file DeviceCUDA.C.

References cudaCheck.

413  {
414  int dev;
415  cudaCheck(cudaGetDevice(&dev));
416  return deviceProps[dev].maxThreadsPerBlock;
417 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
int DeviceCUDA::getMergeGrids ( )
inline
int DeviceCUDA::getNextPeSharingGpu ( )
inline

Definition at line 104 of file DeviceCUDA.h.

Referenced by cuda_check_local_calc(), and cuda_check_remote_calc().

104 {return nextPeSharingGpu;}
int DeviceCUDA::getNoMergeGrids ( )
inline

Definition at line 99 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

99 {return nomergegrids;}
int DeviceCUDA::getNoStreaming ( )
inline
int DeviceCUDA::getNumDevice ( )
inline
int DeviceCUDA::getNumPesSharingDevice ( )
inline

Definition at line 106 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::assignPatches(), and CudaComputeNonbonded::assignPatches().

106 {return numPesSharingDevice;}
int DeviceCUDA::getPesSharingDevice ( const int  i)
inline

Definition at line 107 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::assignPatches(), and CudaComputeNonbonded::assignPatches().

107 {return pesSharingDevice[i];}
bool DeviceCUDA::getSharedGpu ( )
inline

Definition at line 103 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

103 {return sharedGpu;}
void DeviceCUDA::initialize ( void  )

Definition at line 89 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, dummy(), cuda_args_t::ignoresharing, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

89  {
90  // Copy command-line arguments into class
91  this->devicelist = cuda_args.devicelist;
92  this->usedevicelist = cuda_args.usedevicelist;
93  this->devicesperreplica = cuda_args.devicesperreplica;
94  this->ignoresharing = cuda_args.ignoresharing;
95  this->mergegrids = cuda_args.mergegrids;
96  this->nomergegrids = cuda_args.nomergegrids;
97  this->nostreaming = cuda_args.nostreaming;
98 
99  if (CkMyPe() == 0) register_user_events();
100 #if defined(CUDA_VERSION)
101  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
102 #endif
103  char host[128];
104  gethostname(host, 128); host[127] = 0;
105 
106  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
107  int myRankInPhysicalNode;
108  int numPesOnPhysicalNode;
109  int *pesOnPhysicalNode;
110  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
111  &pesOnPhysicalNode,&numPesOnPhysicalNode);
112 
113  {
114  int i;
115  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
116  if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
117  i = numPesOnPhysicalNode;
118  break;
119  }
120  if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
121  }
122  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
123  CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
124  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
125  CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
126  i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
127  }
128  myRankInPhysicalNode = 0;
129  numPesOnPhysicalNode = 1;
130  pesOnPhysicalNode = new int[1];
131  pesOnPhysicalNode[0] = CkMyPe();
132  } else {
133  myRankInPhysicalNode = i;
134  }
135  }
136  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
137 
138  deviceCount = 0;
139  cudaCheck(cudaGetDeviceCount(&deviceCount));
140  if ( deviceCount <= 0 ) {
141  cudaDie("No CUDA devices found.");
142  }
143 
144  // Store all device props
145  deviceProps = new cudaDeviceProp[deviceCount];
146  for ( int i=0; i<deviceCount; ++i ) {
147  cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
148  }
149 
150  ndevices = 0;
151  int nexclusive = 0;
152  if ( usedevicelist ) {
153  devices = new int[strlen(devicelist)];
154  int i = 0;
155  while ( devicelist[i] ) {
156  ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
157  while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
158  while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
159  }
160  } else {
161  if ( ! CkMyPe() ) {
162  CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
163  }
164  devices = new int[deviceCount];
165  for ( int i=0; i<deviceCount; ++i ) {
166  int dev = i % deviceCount;
167 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
168  cudaDeviceProp deviceProp;
169  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
170  if ( deviceProp.computeMode != cudaComputeModeProhibited
171  && (deviceProp.major >= 3)
172  && deviceProp.canMapHostMemory
173  && ( (deviceProp.multiProcessorCount > 2) ||
174  ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
175  ) {
176  devices[ndevices++] = dev;
177  }
178  if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
179  ++nexclusive;
180  }
181 #else
182  devices[ndevices++] = dev;
183 #endif
184  }
185  }
186 
187  if ( ! ndevices ) {
188  cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
189  }
190 
191  if ( devicesperreplica > 0 ) {
192  if ( devicesperreplica > ndevices ) {
193  NAMD_die("More devices per partition requested than devices are available");
194  }
195  int *olddevices = devices;
196  devices = new int[devicesperreplica];
197  for ( int i=0; i<devicesperreplica; ++i ) {
198  int mypart = CmiMyPartition();
199  devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
200  }
201  ndevices = devicesperreplica;
202  delete [] olddevices;
203  }
204 
205  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
206  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
207 
208  // catch multiple processes per device
209  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
210  char msg[1024];
211  sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
212  "Sharing devices between processes is inefficient. "
213  "Specify +ignoresharing (each process uses all visible devices) if "
214  "not all devices are visible to each process, otherwise "
215  "adjust number of processes to evenly divide number of devices, "
216  "specify subset of devices with +devices argument (e.g., +devices 0,2), "
217  "or multiply list shared devices (e.g., +devices 0,1,2,0).",
218  ndevices, numPesForDevice / CkMyNodeSize() );
219  NAMD_die(msg);
220  }
221 
222  {
223  // build list of devices actually used by this node
224  nodedevices = new int[ndevices];
225  nnodedevices = 0;
226  int pe = CkNodeFirst(CkMyNode());
227  int dr = -1;
228  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
229  int rank = ignoresharing ? i : CmiPhysicalRank(pe);
230  int peDeviceRank = rank * ndevices / numPesForDevice;
231  if ( peDeviceRank != dr ) {
232  dr = peDeviceRank;
233  nodedevices[nnodedevices++] = devices[dr];
234  }
235  }
236  }
237 
238  {
239  // check for devices used twice by this node
240  for ( int i=0; i<nnodedevices; ++i ) {
241  for ( int j=i+1; j<nnodedevices; ++j ) {
242  if ( nodedevices[i] == nodedevices[j] ) {
243  char msg[1024];
244  sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
245  NAMD_die(msg);
246  }
247  }
248  }
249  }
250 
251  sharedGpu = 0;
252  gpuIsMine = 1;
253  int firstPeSharingGpu = CkMyPe();
254  nextPeSharingGpu = CkMyPe();
255 
256  {
257  int dev;
258  if ( numPesForDevice > 1 ) {
259  int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
260  dev = devices[myDeviceRank];
261  masterPe = CkMyPe();
262  {
263  pesSharingDevice = new int[numPesForDevice];
264  masterPe = -1;
265  numPesSharingDevice = 0;
266  for ( int i = 0; i < numPesForDevice; ++i ) {
267  if ( i * ndevices / numPesForDevice == myDeviceRank ) {
268  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
269  pesSharingDevice[numPesSharingDevice++] = thisPe;
270  if ( masterPe < 1 ) masterPe = thisPe;
271  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
272  }
273  }
274  for ( int j = 0; j < ndevices; ++j ) {
275  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
276  }
277  }
278  if ( sharedGpu && masterPe == CkMyPe() ) {
279  if ( CmiPhysicalNodeID(masterPe) < 2 )
280  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
281  }
282  } else { // in case phys node code is lying
283  dev = devices[CkMyPe() % ndevices];
284  masterPe = CkMyPe();
285  pesSharingDevice = new int[1];
286  pesSharingDevice[0] = CkMyPe();
287  numPesSharingDevice = 1;
288  }
289 
290  deviceID = dev;
291 
292  // Store device IDs to node-wide list
293  if (CkMyRank() >= MAX_NUM_RANKS)
294  NAMD_die("Maximum number of ranks (2048) per node exceeded");
295  deviceIDList[CkMyRank()] = deviceID;
296 
297  if ( masterPe != CkMyPe() ) {
298  if ( CmiPhysicalNodeID(masterPe) < 2 )
299  CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
300  CkMyPe(), myRankInPhysicalNode, masterPe);
301  // for PME only
302  cudaCheck(cudaSetDevice(dev));
303  return;
304  }
305 
306  // Store master PEs for every device ID to node-wide list
307  if (deviceID >= MAX_NUM_DEVICES)
308  NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
309  masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros
310 
311  // disable token-passing but don't submit local until remote finished
312  // if shared_gpu is true, otherwise submit all work immediately
313  firstPeSharingGpu = CkMyPe();
314  nextPeSharingGpu = CkMyPe();
315 
316  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
317 
318  if ( dev >= deviceCount ) {
319  char buf[256];
320  sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
321  CkMyPe(), dev, host, deviceCount);
322  NAMD_die(buf);
323  }
324 
325  cudaDeviceProp deviceProp;
326  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
327  if ( CmiPhysicalNodeID(masterPe) < 2 )
328  CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
329  CkMyPe(), myRankInPhysicalNode, dev, host,
330  deviceProp.name,
331  (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
332  deviceProp.major, deviceProp.minor,
333  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
334 
335  cudaCheck(cudaSetDevice(dev));
336 
337  } // just let CUDA pick a device for us
338 
339  {
340  // if only one device then already initialized in cuda_affinity_initialize()
341  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
342  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
343  cudaGetLastError();
344  } else {
345  cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
346  }
347 
348  int dev;
349  cudaCheck(cudaGetDevice(&dev));
350  deviceID = dev;
351  cudaDeviceProp deviceProp;
352  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
353  if ( deviceProp.computeMode == cudaComputeModeProhibited )
354  cudaDie("device in prohibited mode");
355  if ( deviceProp.major < 3 )
356  cudaDie("device not of compute capability 3.0 or higher");
357  if ( ! deviceProp.canMapHostMemory )
358  cudaDie("device cannot map host memory");
359 
360  // initialize the device on this thread
361  int *dummy;
362  cudaCheck(cudaMalloc(&dummy, 4));
363  }
364 }
int devicesperreplica
Definition: DeviceCUDA.C:46
#define MAX_NUM_DEVICES
Definition: DeviceCUDA.C:74
int nomergegrids
Definition: DeviceCUDA.C:49
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:75
static __thread cuda_args_t cuda_args
Definition: DeviceCUDA.C:53
int usedevicelist
Definition: DeviceCUDA.C:45
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:72
int mergegrids
Definition: DeviceCUDA.C:48
int nostreaming
Definition: DeviceCUDA.C:50
void cudaDie(const char *msg, cudaError_t err=cudaSuccess)
Definition: CudaUtils.C:9
void NAMD_die(const char *err_msg)
Definition: common.C:85
#define MAX_NUM_RANKS
Definition: DeviceCUDA.C:71
void dummy()
#define cudaCheck(stmt)
Definition: CudaUtils.h:95
int ignoresharing
Definition: DeviceCUDA.C:47
char * devicelist
Definition: DeviceCUDA.C:44
bool DeviceCUDA::one_device_per_node ( )

Definition at line 402 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

402  {
403  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
404  int numPesOnNodeSharingDevice = 0;
405  for ( int i=0; i<numPesSharingDevice; ++i ) {
406  if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
407  ++numPesOnNodeSharingDevice;
408  }
409  }
410  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
411 }
void DeviceCUDA::setGpuIsMine ( const int  val)
inline

Definition at line 110 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA(), and ComputeNonbondedCUDA::recvYieldDevice().

110 {gpuIsMine = val;}
void DeviceCUDA::setMergeGrids ( const int  val)
inline

Definition at line 101 of file DeviceCUDA.h.

Referenced by ComputeNonbondedCUDA::ComputeNonbondedCUDA().

101 {mergegrids = val;}

The documentation for this class was generated from the following files: DeviceCUDA.h and DeviceCUDA.C.