NAMD
DeviceCUDA Class Reference

#include <DeviceCUDA.h>

Public Member Functions

 DeviceCUDA ()
 
 ~DeviceCUDA ()
 
void initialize ()
 
int getDeviceCount ()
 
int getNumDevice ()
 
bool device_shared_with_pe (int pe)
 
bool one_device_per_node ()
 
int getNoStreaming ()
 
int getNoMergeGrids ()
 
int getMergeGrids ()
 
void setMergeGrids (const int val)
 
bool getSharedGpu ()
 
int getNextPeSharingGpu ()
 
int getMasterPe ()
 
int getNumPesSharingDevice ()
 
int getPesSharingDevice (const int i)
 
int getGpuIsMine ()
 
void setGpuIsMine (const int val)
 
int getDeviceID ()
 
int getDeviceIDbyRank (int rank)
 
int getDeviceIDforPe (int pe)
 
int getMasterPeForDeviceID (int deviceID)
 
int getMaxNumThreads ()
 
int getMaxNumBlocks ()
 
void setupDevicePeerAccess ()
 
bool isGpuReservedPme ()
 
int getPmeDevice ()
 
int getDeviceIndex ()
 
int getPmeDeviceIndex ()
 
bool getIsPmeDevice ()
 
bool getIsMasterDevice ()
 
int getGlobalDevice () const
 
bool getIsGlobalDevice () const
 
const int * allDevices () const
 

Detailed Description

Definition at line 54 of file DeviceCUDA.h.
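
DeviceCUDA encapsulates the per-process CUDA device setup in NAMD: it enumerates the visible devices, binds each Charm++ PE to a device, and records which PEs share a device, which PE is the master for each device, and which device is used for PME or CudaGlobalMaster work. The fragment below is a minimal usage sketch, not NAMD source; the caller and the way the DeviceCUDA instance is obtained are hypothetical.

    #include "DeviceCUDA.h"

    // Hypothetical caller: report the binding chosen during DeviceCUDA::initialize().
    // Assumes Charm++ headers (CkPrintf, CkMyPe) are available, as in NAMD itself.
    void reportDeviceBinding(DeviceCUDA *dc) {
      const int dev      = dc->getDeviceID();              // CUDA device bound to this PE
      const int masterPe = dc->getMasterPe();              // PE that launches work on that device
      const int nSharing = dc->getNumPesSharingDevice();   // PEs mapped to the same device
      CkPrintf("Pe %d bound to CUDA device %d (master pe %d, %d PEs sharing)\n",
               CkMyPe(), dev, masterPe, nSharing);
    }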

Constructor & Destructor Documentation

◆ DeviceCUDA()

DeviceCUDA::DeviceCUDA ( )

Definition at line 104 of file DeviceCUDA.C.

104 : deviceProps(NULL), devices(NULL) {}

◆ ~DeviceCUDA()

DeviceCUDA::~DeviceCUDA ( )

Definition at line 516 of file DeviceCUDA.C.

516  {
517  if (deviceProps != NULL) delete [] deviceProps;
518  if (devices != NULL) delete [] devices;
519  delete [] pesSharingDevice;
520 }

Member Function Documentation

◆ allDevices()

const int* DeviceCUDA::allDevices ( ) const
inline

Definition at line 173 of file DeviceCUDA.h.

173 {return devices;}

◆ device_shared_with_pe()

bool DeviceCUDA::device_shared_with_pe ( int  pe)

Definition at line 539 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

539  {
540  for ( int i=0; i<numPesSharingDevice; ++i ) {
541  if ( pesSharingDevice[i] == pe ) return true;
542  }
543  return false;
544 }
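
A minimal usage sketch; dc stands for a DeviceCUDA instance and homePe is a hypothetical PE, standing in for logic like that in ComputeMgr::createComputes():

    // Hypothetical check: does homePe drive work on the same CUDA device as this PE?
    if ( dc->device_shared_with_pe(homePe) ) {
      // homePe is listed in pesSharingDevice, so the two PEs share one device
    }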

◆ getDeviceCount()

int DeviceCUDA::getDeviceCount ( )
inline

Definition at line 124 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

124 {return deviceCount;}

◆ getDeviceID()

int DeviceCUDA::getDeviceID ( )
inline

◆ getDeviceIDbyRank()

int DeviceCUDA::getDeviceIDbyRank ( int  rank)
inline

◆ getDeviceIDforPe()

int DeviceCUDA::getDeviceIDforPe ( int  pe)

Definition at line 525 of file DeviceCUDA.C.

References deviceIDList.

525  {
526  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
527 }
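
deviceIDList is a node-wide table filled during initialize() (deviceIDList[CkMyRank()] = deviceID on each rank), so the lookup is only meaningful for PEs in the caller's node. A minimal sketch, with dc standing for a DeviceCUDA instance and pe a hypothetical PE on this node:

    // Which CUDA device was assigned to pe's rank within this node?
    int dev = dc->getDeviceIDforPe(pe);   // deviceIDList[CkRankOf(pe) % CkMyNodeSize()]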

◆ getDeviceIndex()

int DeviceCUDA::getDeviceIndex ( )
inline

Definition at line 166 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::initialize().

166 { return deviceIndex; }

◆ getGlobalDevice()

int DeviceCUDA::getGlobalDevice ( ) const
inline

Definition at line 171 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaGlobalMaster(), and ComputeMgr::recvCudaGlobalMasterCreateMsg().

171 {return globalDevice;}

◆ getGpuIsMine()

int DeviceCUDA::getGpuIsMine ( )
inline

Definition at line 141 of file DeviceCUDA.h.

141 {return gpuIsMine;}

◆ getIsGlobalDevice()

bool DeviceCUDA::getIsGlobalDevice ( ) const
inline

◆ getIsMasterDevice()

bool DeviceCUDA::getIsMasterDevice ( )

Definition at line 642 of file DeviceCUDA.C.

Referenced by GlobalGPUMgr::initialize().

642  {
643  return (CkMyPe() == masterPe) && isMasterDevice;
644 }

◆ getIsPmeDevice()

bool DeviceCUDA::getIsPmeDevice ( )
inline

Definition at line 168 of file DeviceCUDA.h.

168 { return isPmeDevice; }

◆ getMasterPe()

int DeviceCUDA::getMasterPe ( )
inline

◆ getMasterPeForDeviceID()

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID)

Definition at line 532 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

532  {
533  return masterPeList[deviceID % deviceCount] - 1;
534 }
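
initialize() stores masterPe + 1 into masterPeList (the array is pre-initialized to zeros, so zero marks a device with no registered master), and this accessor undoes the offset. A minimal sketch, with dc standing for a DeviceCUDA instance and dev a hypothetical device ID:

    // Master PE that drives work on device dev; yields -1 if no master PE
    // was registered for that device (masterPeList entry still zero).
    int pe = dc->getMasterPeForDeviceID(dev);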

◆ getMaxNumBlocks()

int DeviceCUDA::getMaxNumBlocks ( )

Definition at line 566 of file DeviceCUDA.C.

References cudaCheck.

566  {
567  int dev;
568  cudaCheck(cudaGetDevice(&dev));
569  return deviceProps[dev].maxGridSize[0];
570 }

◆ getMaxNumThreads()

int DeviceCUDA::getMaxNumThreads ( )

Definition at line 560 of file DeviceCUDA.C.

References cudaCheck.

560  {
561  int dev;
562  cudaCheck(cudaGetDevice(&dev));
563  return deviceProps[dev].maxThreadsPerBlock;
564 }
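
Both accessors read the cudaDeviceProp entry cached for the currently selected device. A minimal sketch of clamping a launch configuration against these limits; the block size, numAtoms, and the kernel are hypothetical:

    #include <algorithm>

    // Clamp a hypothetical launch configuration to the cached device limits.
    int nthreads = std::min(256, dc->getMaxNumThreads());
    int nblocks  = std::min((numAtoms + nthreads - 1) / nthreads,
                            dc->getMaxNumBlocks());
    // myKernel<<<nblocks, nthreads>>>(...);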

◆ getMergeGrids()

int DeviceCUDA::getMergeGrids ( )
inline

Definition at line 132 of file DeviceCUDA.h.

132 {return mergegrids;}

◆ getNextPeSharingGpu()

int DeviceCUDA::getNextPeSharingGpu ( )
inline

Definition at line 136 of file DeviceCUDA.h.

136 {return nextPeSharingGpu;}

◆ getNoMergeGrids()

int DeviceCUDA::getNoMergeGrids ( )
inline

Definition at line 131 of file DeviceCUDA.h.

131 {return nomergegrids;}

◆ getNoStreaming()

int DeviceCUDA::getNoStreaming ( )
inline

Definition at line 130 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaComputeNonbonded().

130 {return nostreaming;}

◆ getNumDevice()

int DeviceCUDA::getNumDevice ( )
inline

◆ getNumPesSharingDevice()

int DeviceCUDA::getNumPesSharingDevice ( )
inline

Definition at line 138 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

138 {return numPesSharingDevice;}

◆ getPesSharingDevice()

int DeviceCUDA::getPesSharingDevice ( const int  i)
inline

Definition at line 139 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

139 {return pesSharingDevice[i];}

◆ getPmeDevice()

int DeviceCUDA::getPmeDevice ( )
inline

Definition at line 165 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice().

165 { return pmeDevice; }

◆ getPmeDeviceIndex()

int DeviceCUDA::getPmeDeviceIndex ( )
inline

Definition at line 167 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice(), and GlobalGPUMgr::getIsPmeDevice().

167 { return pmeDeviceIndex; }

◆ getSharedGpu()

bool DeviceCUDA::getSharedGpu ( )
inline

Definition at line 135 of file DeviceCUDA.h.

135 {return sharedGpu;}

◆ initialize()

void DeviceCUDA::initialize ( void  )

Definition at line 109 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::globaldevice, cuda_args_t::ignoresharing, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, cuda_args_t::pmedevice, cuda_args_t::pmePes, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

109  {
110  // Copy command-line arguments into class
111  this->devicelist = cuda_args.devicelist;
112  this->usedevicelist = cuda_args.usedevicelist;
113  this->devicesperreplica = cuda_args.devicesperreplica;
114  this->ignoresharing = cuda_args.ignoresharing;
115  this->mergegrids = cuda_args.mergegrids;
116  this->nomergegrids = cuda_args.nomergegrids;
117  this->nostreaming = cuda_args.nostreaming;
118 #ifdef NODEGROUP_FORCE_REGISTER
119  reservePme = 0;
120  isPmeDevice = 0;
121  isGlobalDevice = false;
122 #endif
123 
124  if (CkMyPe() == 0) register_user_events();
125 
126 #if defined(CUDA_VERSION)
127  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
128 #endif
129 
130  char host[128];
131  gethostname(host, 128); host[127] = 0;
132 
133  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
134  int myRankInPhysicalNode;
135  int numPesOnPhysicalNode;
136  int *pesOnPhysicalNode;
137  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
138  &pesOnPhysicalNode,&numPesOnPhysicalNode);
139 
140  {
141  int i;
142  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
143  if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
144  i = numPesOnPhysicalNode;
145  break;
146  }
147  if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
148  }
149  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
150  CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
151  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
152  CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
153  i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
154  }
155  myRankInPhysicalNode = 0;
156  numPesOnPhysicalNode = 1;
157  pesOnPhysicalNode = new int[1];
158  pesOnPhysicalNode[0] = CkMyPe();
159  } else {
160  myRankInPhysicalNode = i;
161  }
162  }
163  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
164 
165  deviceCount = 0;
166  cudaCheck(cudaGetDeviceCount(&deviceCount));
167  if ( deviceCount <= 0 ) {
168  cudaDie("No CUDA devices found.");
169  }
170 
171  // Store all device props
172  deviceProps = new cudaDeviceProp[deviceCount];
173  for ( int i=0; i<deviceCount; ++i ) {
174  cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
175  }
176 
177  ndevices = 0;
178  int nexclusive = 0;
179  if ( usedevicelist ) {
180  devices = new int[strlen(devicelist)];
181  int i = 0;
182  while ( devicelist[i] ) {
183  ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
184  while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
185  while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
186  }
187  } else {
188  if ( ! CkMyPe() ) {
189  CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
190  }
191  devices = new int[deviceCount];
192  for ( int i=0; i<deviceCount; ++i ) {
193  int dev = i % deviceCount;
194 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
195  cudaDeviceProp deviceProp;
196  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
197  if ( deviceProp.computeMode != cudaComputeModeProhibited
198  && (deviceProp.major >= 3)
199  && deviceProp.canMapHostMemory
200  && ( (deviceProp.multiProcessorCount > 2) ||
201  ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
202  ) {
203  devices[ndevices++] = dev;
204  }
205  if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
206  ++nexclusive;
207  }
208 #else
209  devices[ndevices++] = dev;
210 #endif
211  }
212  }
213 
214  if ( ! ndevices ) {
215  cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
216  }
217 
218  if ( devicesperreplica > 0 ) {
219  if ( devicesperreplica > ndevices ) {
220  NAMD_die("More devices per partition requested than devices are available");
221  }
222  int *olddevices = devices;
223  devices = new int[devicesperreplica];
224  for ( int i=0; i<devicesperreplica; ++i ) {
225  int mypart = CmiMyPartition();
226  devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
227  }
228  ndevices = devicesperreplica;
229  delete [] olddevices;
230  }
231 
232  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
233  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
234 
235  // This option allows users to specify the number of PEs given to the PME device.
236  // This can be used to improve loadbalancing.
237  #ifdef NODEGROUP_FORCE_REGISTER
238  const int pmePes = (cuda_args.pmePes == NULL) ? -1 : cuda_args.pmePes;
239  #else
240  const int pmePes = -1;
241  #endif
242 
243  // catch multiple processes per device
244  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
245  char msg[1024];
246  sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
247  "Sharing devices between processes is inefficient. "
248  "Specify +ignoresharing (each process uses all visible devices) if "
249  "not all devices are visible to each process, otherwise "
250  "adjust number of processes to evenly divide number of devices, "
251  "specify subset of devices with +devices argument (e.g., +devices 0,2), "
252  "or multiply list shared devices (e.g., +devices 0,1,2,0).",
253  ndevices, numPesForDevice / CkMyNodeSize() );
254  NAMD_die(msg);
255  }
256 
257  {
258  // build list of devices actually used by this node
259  nodedevices = new int[ndevices];
260  nnodedevices = 0;
261  int pe = CkNodeFirst(CkMyNode());
262  int dr = -1;
263  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
264  int rank = ignoresharing ? i : CmiPhysicalRank(pe);
265  int peDeviceRank = rank * ndevices / numPesForDevice;
266  if ( peDeviceRank != dr ) {
267  dr = peDeviceRank;
268  nodedevices[nnodedevices++] = devices[dr];
269  }
270  }
271  }
272 
273  {
274  // check for devices used twice by this node
275  for ( int i=0; i<nnodedevices; ++i ) {
276  for ( int j=i+1; j<nnodedevices; ++j ) {
277  if ( nodedevices[i] == nodedevices[j] ) {
278  char msg[1024];
279  sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
280  NAMD_die(msg);
281  }
282  }
283  }
284  }
285 
286  sharedGpu = 0;
287  gpuIsMine = 1;
288  int firstPeSharingGpu = CkMyPe();
289  nextPeSharingGpu = CkMyPe();
290 
291  {
292  int dev;
293  if (pmePes != -1) {
294  int myDeviceRank;
295  if (myRankForDevice < pmePes) {
296  myDeviceRank = 0;
297  } else {
298  myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
299  }
300 
301  dev = devices[myDeviceRank];
302  masterPe = CkMyPe();
303  if (myRankForDevice >= pmePes) {
304  pesSharingDevice = new int[numPesForDevice];
305  masterPe = -1;
306  numPesSharingDevice = 0;
307  for ( int i = pmePes; i < numPesForDevice; ++i ) {
308  if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
309  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
310  pesSharingDevice[numPesSharingDevice++] = thisPe;
311  if ( masterPe < 1 ) masterPe = thisPe;
312  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
313  }
314  }
315  for ( int j = 0; j < ndevices; ++j ) {
316  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
317  }
318  } else {
319  #ifdef NODEGROUP_FORCE_REGISTER
320  pesSharingDevice = new int[pmePes];
321  #else
322  pesSharingDevice = NULL;
323  #endif
324  masterPe = -1;
325  numPesSharingDevice = 0;
326  for (int i = 0; i < pmePes; ++i) {
327  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
328  pesSharingDevice[numPesSharingDevice++] = thisPe;
329  if ( masterPe < 1 ) masterPe = thisPe;
330  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
331  }
332  }
333  if ( sharedGpu && masterPe == CkMyPe() ) {
334  if ( CmiPhysicalNodeID(masterPe) < 2 )
335  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
336  }
337  } else if ( numPesForDevice > 1 ) {
338  int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
339  dev = devices[myDeviceRank];
340  masterPe = CkMyPe();
341  {
342  pesSharingDevice = new int[numPesForDevice];
343  masterPe = -1;
344  numPesSharingDevice = 0;
345  for ( int i = 0; i < numPesForDevice; ++i ) {
346  if ( i * ndevices / numPesForDevice == myDeviceRank ) {
347  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
348  pesSharingDevice[numPesSharingDevice++] = thisPe;
349  if ( masterPe < 1 ) masterPe = thisPe;
350  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
351  }
352  }
353  for ( int j = 0; j < ndevices; ++j ) {
354  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
355  }
356  }
357  if ( sharedGpu && masterPe == CkMyPe() ) {
358  if ( CmiPhysicalNodeID(masterPe) < 2 )
359  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
360  }
361  } else { // in case phys node code is lying
362  dev = devices[CkMyPe() % ndevices];
363  masterPe = CkMyPe();
364  pesSharingDevice = new int[1];
365  pesSharingDevice[0] = CkMyPe();
366  numPesSharingDevice = 1;
367  }
368 
369  deviceID = dev;
370 
371  // Setting PME device for single-node scheme
372  // Sanity check in order to see if pmeDevice in contained in the device list
373  bool contained = false;
374  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
375  for(int i = 0; i < ndevices; i++){
376  if(!contained) {
377  contained = devices[i] == pmeDevice;
378  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
379  }
380  if(deviceID == devices[i]) deviceIndex = i;
381  }
382 
383  masterDevice = devices[0]; // head of device list responsible for printing stuff
384  isMasterDevice = deviceID == masterDevice;
385  if(!contained){
386  // Uses a particular device to do PME and reserves it (no other force terms on it)
387  reservePme = true;
388  pmeDeviceIndex = nnodedevices; // PME device index always at the tail of the list
389  isPmeDevice = isMasterDevice; // Master device launches work on the PME device as well
390  }else{
391  reservePme = false;
392  isPmeDevice = pmeDevice == deviceID;
393  }
394 
395  // Device for CudaGlobalMaster
396  globalDevice = (cuda_args.globaldevice < 0) ? devices[0]: cuda_args.globaldevice;
397  // Sanity check in order to see if globalDevice is contained in the device list
398  contained = false;
399  for (int i = 0; i < ndevices; ++i) {
400  if(!contained) {
401  contained = devices[i] == globalDevice;
402  }
403  }
404  if (!contained) {
405  NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
406  }
407  isGlobalDevice = globalDevice == deviceID;
408 
409  // Store device IDs to node-wide list
410  if (CkMyRank() >= MAX_NUM_RANKS)
411  NAMD_die("Maximum number of ranks (2048) per node exceeded");
412  deviceIDList[CkMyRank()] = deviceID;
413 
414  if ( masterPe != CkMyPe() ) {
415  if ( CmiPhysicalNodeID(masterPe) < 2 )
416  CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
417  CkMyPe(), myRankInPhysicalNode, masterPe);
418  // for PME only
419  cudaCheck(cudaSetDevice(dev));
420  return;
421  }
422 
423  // Store master PEs for every device ID to node-wide list
424  if (deviceID >= MAX_NUM_DEVICES)
425  NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
426  masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros
427 
428  // disable token-passing but don't submit local until remote finished
429  // if shared_gpu is true, otherwise submit all work immediately
430  firstPeSharingGpu = CkMyPe();
431  nextPeSharingGpu = CkMyPe();
432 
433  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
434 
435  if ( dev >= deviceCount ) {
436  char buf[256];
437  sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
438  CkMyPe(), dev, host, deviceCount);
439  NAMD_die(buf);
440  }
441 
442  cudaDeviceProp deviceProp;
443  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
444  if ( CmiPhysicalNodeID(masterPe) < 2 )
445  CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
446  CkMyPe(), myRankInPhysicalNode, dev, host,
447  deviceProp.name,
448  (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
449  deviceProp.major, deviceProp.minor,
450  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
451 
452  cudaCheck(cudaSetDevice(dev));
453 
454  } // just let CUDA pick a device for us
455 
456  {
457  // if only one device then already initialized in cuda_affinity_initialize()
458  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
459  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
460  cudaGetLastError();
461  } else {
462  cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
463  }
464 
465  int dev;
466  cudaCheck(cudaGetDevice(&dev));
467  deviceID = dev;
468  cudaDeviceProp deviceProp;
469  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
470  if ( deviceProp.computeMode == cudaComputeModeProhibited )
471  cudaDie("device in prohibited mode");
472  if ( deviceProp.major < 3 )
473  cudaDie("device not of compute capability 3.0 or higher");
474  if ( ! deviceProp.canMapHostMemory )
475  cudaDie("device cannot map host memory");
476 
477  // initialize the device on this thread
478  int *dummy;
479  cudaCheck(cudaMalloc(&dummy, 4));
480  }
481 
482 #if NODEGROUP_FORCE_REGISTER
483 
484  {
485  // Setting PME device for single-node scheme
486 
487  // Sanity check in order to see if pmeDevice in contained in the device list
488  bool contained = false;
489  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
490  for(int i = 0; i < ndevices; i++){
491  if(!contained) {
492  contained = devices[i] == pmeDevice;
493  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
494  }
495  if(deviceID == devices[i]) deviceIndex = i;
496  }
497 
498  if(!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
499  // Everything is OK, sets flags
500 
501  isPmeDevice = pmeDevice == deviceID;
502  masterDevice = devices[0]; // head of device list responsible for printing stuff
503  isMasterDevice = deviceID == masterDevice;
504 
505  if (pmeDeviceIndex != 0 && pmePes != -1) {
506  NAMD_die("PME device must be index 0 if pmePes is set");
507  }
508  }
509 
510 #endif
511 }
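
Outside of the PME-reserved case, the PE-to-device assignment above reduces to the integer arithmetic at source lines 265, 338, and 346: ranks are spread evenly over the usable devices, and all ranks that land on the same quotient share a device. A standalone sketch of that mapping, detached from the NAMD data structures:

    // Sketch of the default rank -> device mapping used by initialize().
    int deviceRankFor(int rank, int numPes, int ndevices) {
      return rank * ndevices / numPes;   // integer division buckets ranks evenly
    }
    // Example: 8 PEs over 2 usable devices
    //   ranks 0..3 -> devices[0], ranks 4..7 -> devices[1]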

◆ isGpuReservedPme()

bool DeviceCUDA::isGpuReservedPme ( )
inline

Definition at line 164 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize().

164 { return reservePme; }

◆ one_device_per_node()

bool DeviceCUDA::one_device_per_node ( )

Definition at line 549 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

549  {
550  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
551  int numPesOnNodeSharingDevice = 0;
552  for ( int i=0; i<numPesSharingDevice; ++i ) {
553  if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
554  ++numPesOnNodeSharingDevice;
555  }
556  }
557  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
558 }

◆ setGpuIsMine()

void DeviceCUDA::setGpuIsMine ( const int  val)
inline

Definition at line 142 of file DeviceCUDA.h.

142 {gpuIsMine = val;}

◆ setMergeGrids()

void DeviceCUDA::setMergeGrids ( const int  val)
inline

Definition at line 133 of file DeviceCUDA.h.

133 {mergegrids = val;}

◆ setupDevicePeerAccess()

void DeviceCUDA::setupDevicePeerAccess ( )
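
No definition is reproduced for this member. As a general illustration only, not NAMD's implementation, enabling peer access among the devices a process uses with the CUDA runtime typically looks like the sketch below; devices and ndevices stand in for the class members of the same names:

    // Generic peer-access setup sketch (illustrative, not the NAMD source).
    for (int i = 0; i < ndevices; ++i) {
      cudaCheck(cudaSetDevice(devices[i]));
      for (int j = 0; j < ndevices; ++j) {
        if (i == j) continue;
        int canAccess = 0;
        cudaCheck(cudaDeviceCanAccessPeer(&canAccess, devices[i], devices[j]));
        if (canAccess) {
          cudaError_t err = cudaDeviceEnablePeerAccess(devices[j], 0);
          if (err == cudaErrorPeerAccessAlreadyEnabled) cudaGetLastError();  // already enabled; clear the error
          else cudaCheck(err);
        }
      }
    }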

The documentation for this class was generated from the following files:

DeviceCUDA.h
DeviceCUDA.C