NAMD
Public Member Functions | List of all members
DeviceCUDA Class Reference

#include <DeviceCUDA.h>

Public Member Functions

 DeviceCUDA ()
 
 ~DeviceCUDA ()
 
void initialize ()
 
int getDeviceCount ()
 
int getNumDevice ()
 
bool device_shared_with_pe (int pe)
 
bool one_device_per_node ()
 
int getNoStreaming ()
 
int getNoMergeGrids ()
 
int getMergeGrids ()
 
void setMergeGrids (const int val)
 
bool getSharedGpu ()
 
int getNextPeSharingGpu ()
 
int getMasterPe ()
 
int getNumPesSharingDevice ()
 
int getPesSharingDevice (const int i)
 
int getGpuIsMine ()
 
void setGpuIsMine (const int val)
 
int getDeviceID ()
 
int getDeviceIDbyRank (int rank)
 
int getDeviceIDforPe (int pe)
 
int getMasterPeForDeviceID (int deviceID)
 
int getMaxNumThreads ()
 
int getMaxNumBlocks ()
 
void setupDevicePeerAccess ()
 
bool isGpuReservedPme ()
 
int getPmeDevice ()
 
int getDeviceIndex ()
 
int getPmeDeviceIndex ()
 
bool getIsPmeDevice ()
 
bool getIsMasterDevice ()
 
int getGlobalDevice () const
 
bool getIsGlobalDevice () const
 
const int * allDevices () const
 

Detailed Description

Definition at line 54 of file DeviceCUDA.h.

Constructor & Destructor Documentation

◆ DeviceCUDA()

DeviceCUDA::DeviceCUDA ( )

Definition at line 104 of file DeviceCUDA.C.

104 : deviceProps(NULL), devices(NULL) {}

◆ ~DeviceCUDA()

DeviceCUDA::~DeviceCUDA ( )

Definition at line 520 of file DeviceCUDA.C.

520  {
521  if (deviceProps != NULL) delete [] deviceProps;
522  if (devices != NULL) delete [] devices;
523  delete [] pesSharingDevice;
524 }

Member Function Documentation

◆ allDevices()

const int* DeviceCUDA::allDevices ( ) const
inline

Definition at line 173 of file DeviceCUDA.h.

173 {return devices;}

◆ device_shared_with_pe()

bool DeviceCUDA::device_shared_with_pe ( int  pe)

Definition at line 543 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

543  {
544  for ( int i=0; i<numPesSharingDevice; ++i ) {
545  if ( pesSharingDevice[i] == pe ) return true;
546  }
547  return false;
548 }

◆ getDeviceCount()

int DeviceCUDA::getDeviceCount ( )
inline

Definition at line 124 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

124 {return deviceCount;}

◆ getDeviceID()

int DeviceCUDA::getDeviceID ( )
inline

◆ getDeviceIDbyRank()

int DeviceCUDA::getDeviceIDbyRank ( int  rank)
inline

◆ getDeviceIDforPe()

int DeviceCUDA::getDeviceIDforPe ( int  pe)

Definition at line 529 of file DeviceCUDA.C.

References deviceIDList.

529  {
530  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
531 }
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:92

◆ getDeviceIndex()

int DeviceCUDA::getDeviceIndex ( )
inline

Definition at line 166 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::initialize().

166 { return deviceIndex; }

◆ getGlobalDevice()

int DeviceCUDA::getGlobalDevice ( ) const
inline

Definition at line 171 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaGlobalMaster(), and ComputeMgr::recvCudaGlobalMasterCreateMsg().

171 {return globalDevice;}

◆ getGpuIsMine()

int DeviceCUDA::getGpuIsMine ( )
inline

Definition at line 141 of file DeviceCUDA.h.

141 {return gpuIsMine;}

◆ getIsGlobalDevice()

bool DeviceCUDA::getIsGlobalDevice ( ) const
inline

◆ getIsMasterDevice()

bool DeviceCUDA::getIsMasterDevice ( )

Definition at line 646 of file DeviceCUDA.C.

Referenced by GlobalGPUMgr::initialize().

646  {
647  return (CkMyPe() == masterPe) && isMasterDevice;
648 }

◆ getIsPmeDevice()

bool DeviceCUDA::getIsPmeDevice ( )
inline

Definition at line 168 of file DeviceCUDA.h.

168 { return isPmeDevice; }

◆ getMasterPe()

int DeviceCUDA::getMasterPe ( )
inline

◆ getMasterPeForDeviceID()

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID)

Definition at line 536 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

536  {
537  return masterPeList[deviceID % deviceCount] - 1;
538 }
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:95

◆ getMaxNumBlocks()

int DeviceCUDA::getMaxNumBlocks ( )

Definition at line 570 of file DeviceCUDA.C.

References cudaCheck.

570  {
571  int dev;
572  cudaCheck(cudaGetDevice(&dev));
573  return deviceProps[dev].maxGridSize[0];
574 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ getMaxNumThreads()

int DeviceCUDA::getMaxNumThreads ( )

Definition at line 564 of file DeviceCUDA.C.

References cudaCheck.

564  {
565  int dev;
566  cudaCheck(cudaGetDevice(&dev));
567  return deviceProps[dev].maxThreadsPerBlock;
568 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:242

◆ getMergeGrids()

int DeviceCUDA::getMergeGrids ( )
inline

Definition at line 132 of file DeviceCUDA.h.

132 {return mergegrids;}

◆ getNextPeSharingGpu()

int DeviceCUDA::getNextPeSharingGpu ( )
inline

Definition at line 136 of file DeviceCUDA.h.

136 {return nextPeSharingGpu;}

◆ getNoMergeGrids()

int DeviceCUDA::getNoMergeGrids ( )
inline

Definition at line 131 of file DeviceCUDA.h.

131 {return nomergegrids;}

◆ getNoStreaming()

int DeviceCUDA::getNoStreaming ( )
inline

Definition at line 130 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaComputeNonbonded().

130 {return nostreaming;}

◆ getNumDevice()

int DeviceCUDA::getNumDevice ( )
inline

◆ getNumPesSharingDevice()

int DeviceCUDA::getNumPesSharingDevice ( )
inline

Definition at line 138 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

138 {return numPesSharingDevice;}

◆ getPesSharingDevice()

int DeviceCUDA::getPesSharingDevice ( const int  i)
inline

Definition at line 139 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

139 {return pesSharingDevice[i];}

◆ getPmeDevice()

int DeviceCUDA::getPmeDevice ( )
inline

Definition at line 165 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice().

165 { return pmeDevice; }

◆ getPmeDeviceIndex()

int DeviceCUDA::getPmeDeviceIndex ( )
inline

Definition at line 167 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice(), and GlobalGPUMgr::getIsPmeDevice().

167 { return pmeDeviceIndex; }

◆ getSharedGpu()

bool DeviceCUDA::getSharedGpu ( )
inline

Definition at line 135 of file DeviceCUDA.h.

135 {return sharedGpu;}

◆ initialize()

void DeviceCUDA::initialize ( void  )

Definition at line 109 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, for(), cuda_args_t::globaldevice, cuda_args_t::ignoresharing, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, cuda_args_t::pmedevice, cuda_args_t::pmePes, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

109  {
110  // Copy command-line arguments into class
111  this->devicelist = cuda_args.devicelist;
112  this->usedevicelist = cuda_args.usedevicelist;
113  this->devicesperreplica = cuda_args.devicesperreplica;
114  this->ignoresharing = cuda_args.ignoresharing;
115  this->mergegrids = cuda_args.mergegrids;
116  this->nomergegrids = cuda_args.nomergegrids;
117  this->nostreaming = cuda_args.nostreaming;
118 #ifdef NODEGROUP_FORCE_REGISTER
119  reservePme = 0;
120  isPmeDevice = 0;
121  isGlobalDevice = false;
122 #endif
123 
124  if (CkMyPe() == 0) register_user_events();
125 
126 #if defined(CUDA_VERSION)
127  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
128 #endif
129 
130  char host[128];
131  gethostname(host, 128); host[127] = 0;
132 
133  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
134  int myRankInPhysicalNode;
135  int numPesOnPhysicalNode;
136  int *pesOnPhysicalNode;
137  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
138  &pesOnPhysicalNode,&numPesOnPhysicalNode);
139 
140  {
141  int i;
142  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
143  if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
144  i = numPesOnPhysicalNode;
145  break;
146  }
147  if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
148  }
149  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
150  CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
151  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
152  CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
153  i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
154  }
155  myRankInPhysicalNode = 0;
156  numPesOnPhysicalNode = 1;
157  pesOnPhysicalNode = new int[1];
158  pesOnPhysicalNode[0] = CkMyPe();
159  } else {
160  myRankInPhysicalNode = i;
161  }
162  }
163  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
164 
165  deviceCount = 0;
166  cudaCheck(cudaGetDeviceCount(&deviceCount));
167  if ( deviceCount <= 0 ) {
168  cudaDie("No CUDA devices found.");
169  }
170 
171  // Store all device props
172  deviceProps = new cudaDeviceProp[deviceCount];
173  for ( int i=0; i<deviceCount; ++i ) {
174  cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
175  }
176 
177  ndevices = 0;
178  int nexclusive = 0;
179  if ( usedevicelist ) {
180  devices = new int[strlen(devicelist)];
181  int i = 0;
182  while ( devicelist[i] ) {
183  ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
184  while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
185  while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
186  }
187  } else {
188  if ( ! CkMyPe() ) {
189  CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
190  }
191  devices = new int[deviceCount];
192  for ( int i=0; i<deviceCount; ++i ) {
193  int dev = i % deviceCount;
194 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
195  cudaDeviceProp deviceProp;
196  int computeMode;
197  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
198  cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
199  if ( computeMode != cudaComputeModeProhibited
200  && (deviceProp.major >= 3)
201  && deviceProp.canMapHostMemory
202  && ( (deviceProp.multiProcessorCount > 2) ||
203  ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
204  ) {
205  devices[ndevices++] = dev;
206  }
207  if ( computeMode == cudaComputeModeExclusive ) {
208  ++nexclusive;
209  }
210 #else
211  devices[ndevices++] = dev;
212 #endif
213  }
214  }
215 
216  if ( ! ndevices ) {
217  cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
218  }
219 
220  if ( devicesperreplica > 0 ) {
221  if ( devicesperreplica > ndevices ) {
222  NAMD_die("More devices per partition requested than devices are available");
223  }
224  int *olddevices = devices;
225  devices = new int[devicesperreplica];
226  for ( int i=0; i<devicesperreplica; ++i ) {
227  int mypart = CmiMyPartition();
228  devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
229  }
230  ndevices = devicesperreplica;
231  delete [] olddevices;
232  }
233 
234  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
235  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
236 
237  // This option allows users to specify the number of PEs given to the PME device.
238  // This can be used to improve loadbalancing.
239  #ifdef NODEGROUP_FORCE_REGISTER
240  const int pmePes = (cuda_args.pmePes == NULL) ? -1 : cuda_args.pmePes;
241  #else
242  const int pmePes = -1;
243  #endif
244 
245  // catch multiple processes per device
246  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
247  char msg[1024];
248  sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
249  "Sharing devices between processes is inefficient. "
250  "Specify +ignoresharing (each process uses all visible devices) if "
251  "not all devices are visible to each process, otherwise "
252  "adjust number of processes to evenly divide number of devices, "
253  "specify subset of devices with +devices argument (e.g., +devices 0,2), "
254  "or multiply list shared devices (e.g., +devices 0,1,2,0).",
255  ndevices, numPesForDevice / CkMyNodeSize() );
256  NAMD_die(msg);
257  }
258 
259  {
260  // build list of devices actually used by this node
261  nodedevices = new int[ndevices];
262  nnodedevices = 0;
263  int pe = CkNodeFirst(CkMyNode());
264  int dr = -1;
265  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
266  int rank = ignoresharing ? i : CmiPhysicalRank(pe);
267  int peDeviceRank = rank * ndevices / numPesForDevice;
268  if ( peDeviceRank != dr ) {
269  dr = peDeviceRank;
270  nodedevices[nnodedevices++] = devices[dr];
271  }
272  }
273  }
274 
275  {
276  // check for devices used twice by this node
277  for ( int i=0; i<nnodedevices; ++i ) {
278  for ( int j=i+1; j<nnodedevices; ++j ) {
279  if ( nodedevices[i] == nodedevices[j] ) {
280  char msg[1024];
281  sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
282  NAMD_die(msg);
283  }
284  }
285  }
286  }
287 
288  sharedGpu = 0;
289  gpuIsMine = 1;
290  int firstPeSharingGpu = CkMyPe();
291  nextPeSharingGpu = CkMyPe();
292 
293  {
294  int dev;
295  if (pmePes != -1) {
296  int myDeviceRank;
297  if (myRankForDevice < pmePes) {
298  myDeviceRank = 0;
299  } else {
300  myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
301  }
302 
303  dev = devices[myDeviceRank];
304  masterPe = CkMyPe();
305  if (myRankForDevice >= pmePes) {
306  pesSharingDevice = new int[numPesForDevice];
307  masterPe = -1;
308  numPesSharingDevice = 0;
309  for ( int i = pmePes; i < numPesForDevice; ++i ) {
310  if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
311  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
312  pesSharingDevice[numPesSharingDevice++] = thisPe;
313  if ( masterPe < 1 ) masterPe = thisPe;
314  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
315  }
316  }
317  for ( int j = 0; j < ndevices; ++j ) {
318  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
319  }
320  } else {
321  #ifdef NODEGROUP_FORCE_REGISTER
322  pesSharingDevice = new int[pmePes];
323  #else
324  pesSharingDevice = NULL;
325  #endif
326  masterPe = -1;
327  numPesSharingDevice = 0;
328  for (int i = 0; i < pmePes; ++i) {
329  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
330  pesSharingDevice[numPesSharingDevice++] = thisPe;
331  if ( masterPe < 1 ) masterPe = thisPe;
332  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
333  }
334  }
335  if ( sharedGpu && masterPe == CkMyPe() ) {
336  if ( CmiPhysicalNodeID(masterPe) < 2 )
337  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
338  }
339  } else if ( numPesForDevice > 1 ) {
340  int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
341  dev = devices[myDeviceRank];
342  masterPe = CkMyPe();
343  {
344  pesSharingDevice = new int[numPesForDevice];
345  masterPe = -1;
346  numPesSharingDevice = 0;
347  for ( int i = 0; i < numPesForDevice; ++i ) {
348  if ( i * ndevices / numPesForDevice == myDeviceRank ) {
349  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
350  pesSharingDevice[numPesSharingDevice++] = thisPe;
351  if ( masterPe < 1 ) masterPe = thisPe;
352  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
353  }
354  }
355  for ( int j = 0; j < ndevices; ++j ) {
356  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
357  }
358  }
359  if ( sharedGpu && masterPe == CkMyPe() ) {
360  if ( CmiPhysicalNodeID(masterPe) < 2 )
361  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
362  }
363  } else { // in case phys node code is lying
364  dev = devices[CkMyPe() % ndevices];
365  masterPe = CkMyPe();
366  pesSharingDevice = new int[1];
367  pesSharingDevice[0] = CkMyPe();
368  numPesSharingDevice = 1;
369  }
370 
371  deviceID = dev;
372 
373  // Setting PME device for single-node scheme
374  // Sanity check in order to see if pmeDevice is contained in the device list
375  bool contained = false;
376  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
377  for(int i = 0; i < ndevices; i++){
378  if(!contained) {
379  contained = devices[i] == pmeDevice;
380  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
381  }
382  if(deviceID == devices[i]) deviceIndex = i;
383  }
384 
385  masterDevice = devices[0]; // head of device list responsible for printing stuff
386  isMasterDevice = deviceID == masterDevice;
387  if(!contained){
388  // Uses a particular device to do PME and reserves it (no other force terms on it)
389  reservePme = true;
390  pmeDeviceIndex = nnodedevices; // PME device index always at the tail of the list
391  isPmeDevice = isMasterDevice; // Master device launches work on the PME device as well
392  }else{
393  reservePme = false;
394  isPmeDevice = pmeDevice == deviceID;
395  }
396 
397  // Device for CudaGlobalMaster
398  globalDevice = (cuda_args.globaldevice < 0) ? devices[0]: cuda_args.globaldevice;
399  // Sanity check in order to see if globalDevice is contained in the device list
400  contained = false;
401  for (int i = 0; i < ndevices; ++i) {
402  if(!contained) {
403  contained = devices[i] == globalDevice;
404  }
405  }
406  if (!contained) {
407  NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
408  }
409  isGlobalDevice = globalDevice == deviceID;
410 
411  // Store device IDs to node-wide list
412  if (CkMyRank() >= MAX_NUM_RANKS)
413  NAMD_die("Maximum number of ranks (2048) per node exceeded");
414  deviceIDList[CkMyRank()] = deviceID;
415 
416  if ( masterPe != CkMyPe() ) {
417  if ( CmiPhysicalNodeID(masterPe) < 2 )
418  CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
419  CkMyPe(), myRankInPhysicalNode, masterPe);
420  // for PME only
421  cudaCheck(cudaSetDevice(dev));
422  return;
423  }
424 
425  // Store master PEs for every device ID to node-wide list
426  if (deviceID >= MAX_NUM_DEVICES)
427  NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
428  masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros
429 
430  // disable token-passing but don't submit local until remote finished
431  // if shared_gpu is true, otherwise submit all work immediately
432  firstPeSharingGpu = CkMyPe();
433  nextPeSharingGpu = CkMyPe();
434 
435  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
436 
437  if ( dev >= deviceCount ) {
438  char buf[256];
439  sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
440  CkMyPe(), dev, host, deviceCount);
441  NAMD_die(buf);
442  }
443 
444  cudaDeviceProp deviceProp;
445  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
446  if ( CmiPhysicalNodeID(masterPe) < 2 )
447  CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
448  CkMyPe(), myRankInPhysicalNode, dev, host,
449  deviceProp.name,
450  (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
451  deviceProp.major, deviceProp.minor,
452  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
453 
454  cudaCheck(cudaSetDevice(dev));
455 
456  } // just let CUDA pick a device for us
457 
458  {
459  // if only one device then already initialized in cuda_affinity_initialize()
460  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
461  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
462  cudaGetLastError();
463  } else {
464  cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
465  }
466 
467  int dev;
468  cudaCheck(cudaGetDevice(&dev));
469  deviceID = dev;
470  cudaDeviceProp deviceProp;
471  int computeMode;
472  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
473  cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
474  if ( computeMode == cudaComputeModeProhibited )
475  cudaDie("device in prohibited mode");
476  if ( deviceProp.major < 3 )
477  cudaDie("device not of compute capability 3.0 or higher");
478  if ( ! deviceProp.canMapHostMemory )
479  cudaDie("device cannot map host memory");
480 
481  // initialize the device on this thread
482  int *dummy;
483  cudaCheck(cudaMalloc(&dummy, 4));
484  }
485 
486 #if NODEGROUP_FORCE_REGISTER
487 
488  {
489  // Setting PME device for single-node scheme
490 
491  // Sanity check in order to see if pmeDevice is contained in the device list
492  bool contained = false;
493  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
494  for(int i = 0; i < ndevices; i++){
495  if(!contained) {
496  contained = devices[i] == pmeDevice;
497  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
498  }
499  if(deviceID == devices[i]) deviceIndex = i;
500  }
501 
502  if(!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
503  // Everything is OK, sets flags
504 
505  isPmeDevice = pmeDevice == deviceID;
506  masterDevice = devices[0]; // head of device list responsible for printing stuff
507  isMasterDevice = deviceID == masterDevice;
508 
509  if (pmeDeviceIndex != 0 && pmePes != -1) {
510  NAMD_die("PME device must be index 0 if pmePes is set");
511  }
512  }
513 
514 #endif
515 }
int devicesperreplica
Definition: DeviceCUDA.C:57
#define MAX_NUM_DEVICES
Definition: DeviceCUDA.C:94
int nomergegrids
Definition: DeviceCUDA.C:60
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:95
static __thread cuda_args_t cuda_args
Definition: DeviceCUDA.C:67
int globaldevice
Definition: DeviceCUDA.C:64
int usedevicelist
Definition: DeviceCUDA.C:56
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:92
int mergegrids
Definition: DeviceCUDA.C:59
int nostreaming
Definition: DeviceCUDA.C:61
int pmedevice
Definition: DeviceCUDA.C:62
void NAMD_die(const char *err_msg)
Definition: common.C:147
#define MAX_NUM_RANKS
Definition: DeviceCUDA.C:91
#define cudaCheck(stmt)
Definition: CudaUtils.h:242
int ignoresharing
Definition: DeviceCUDA.C:58
char * devicelist
Definition: DeviceCUDA.C:55
for(int i=0;i< n1;++i)

◆ isGpuReservedPme()

bool DeviceCUDA::isGpuReservedPme ( )
inline

Definition at line 164 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize().

164 { return reservePme; }

◆ one_device_per_node()

bool DeviceCUDA::one_device_per_node ( )

Definition at line 553 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

553  {
554  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
555  int numPesOnNodeSharingDevice = 0;
556  for ( int i=0; i<numPesSharingDevice; ++i ) {
557  if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
558  ++numPesOnNodeSharingDevice;
559  }
560  }
561  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
562 }

◆ setGpuIsMine()

void DeviceCUDA::setGpuIsMine ( const int  val)
inline

Definition at line 142 of file DeviceCUDA.h.

142 {gpuIsMine = val;}

◆ setMergeGrids()

void DeviceCUDA::setMergeGrids ( const int  val)
inline

Definition at line 133 of file DeviceCUDA.h.

133 {mergegrids = val;}

◆ setupDevicePeerAccess()

void DeviceCUDA::setupDevicePeerAccess ( )

The documentation for this class was generated from the following files: