NAMD
DeviceCUDA Class Reference

#include <DeviceCUDA.h>

Public Member Functions

 DeviceCUDA ()
 
 ~DeviceCUDA ()
 
void initialize ()
 
int getDeviceCount ()
 
int getNumDevice ()
 
bool device_shared_with_pe (int pe)
 
bool one_device_per_node ()
 
int getNoStreaming ()
 
int getNoMergeGrids ()
 
int getMergeGrids ()
 
void setMergeGrids (const int val)
 
bool getSharedGpu ()
 
int getNextPeSharingGpu ()
 
int getMasterPe ()
 
int getNumPesSharingDevice ()
 
int getPesSharingDevice (const int i)
 
int getGpuIsMine ()
 
void setGpuIsMine (const int val)
 
int getDeviceID ()
 
int getDeviceIDbyRank (int rank)
 
int getDeviceIDforPe (int pe)
 
int getMasterPeForDeviceID (int deviceID)
 
int getMaxNumThreads ()
 
int getMaxNumBlocks ()
 
void setupDevicePeerAccess ()
 
bool isGpuReservedPme ()
 
int getPmeDevice ()
 
int getDeviceIndex ()
 
int getPmeDeviceIndex ()
 
bool getIsPmeDevice ()
 
bool getIsMasterDevice ()
 
int getGlobalDevice () const
 
bool getIsGlobalDevice () const
 
const int * allDevices () const
 

Detailed Description

Definition at line 54 of file DeviceCUDA.h.
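
Typical use: after cuda_initialize() has run, a PE queries the class for the CUDA device it was bound to and for the PEs that share that device. A minimal hedged sketch follows; it assumes an already-initialized DeviceCUDA instance (NAMD keeps a per-thread global pointer to one; the instance is passed in explicitly here to keep the example self-contained).

#include "DeviceCUDA.h"
#include "charm++.h"

void reportDeviceBinding(DeviceCUDA *deviceCUDA) {
  int devID    = deviceCUDA->getDeviceID();     // CUDA device bound to this PE
  int devIndex = deviceCUDA->getDeviceIndex();  // position in the +devices list
  int masterPe = deviceCUDA->getMasterPe();     // PE that drives this device
  int nShared  = deviceCUDA->getNumPesSharingDevice();
  CkPrintf("Pe %d: device %d (index %d), master PE %d, %d PEs sharing\n",
           CkMyPe(), devID, devIndex, masterPe, nShared);
}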

Constructor & Destructor Documentation

◆ DeviceCUDA()

DeviceCUDA::DeviceCUDA ( )

Definition at line 102 of file DeviceCUDA.C.

102 : deviceProps(NULL), devices(NULL) {}

◆ ~DeviceCUDA()

DeviceCUDA::~DeviceCUDA ( )

Definition at line 514 of file DeviceCUDA.C.

514  {
515  if (deviceProps != NULL) delete [] deviceProps;
516  if (devices != NULL) delete [] devices;
517  delete [] pesSharingDevice;
518 }

Member Function Documentation

◆ allDevices()

const int* DeviceCUDA::allDevices ( ) const
inline

Definition at line 173 of file DeviceCUDA.h.

173 {return devices;}

◆ device_shared_with_pe()

bool DeviceCUDA::device_shared_with_pe ( int  pe)

Definition at line 537 of file DeviceCUDA.C.

Referenced by ComputeMgr::createComputes().

537  {
538  for ( int i=0; i<numPesSharingDevice; ++i ) {
539  if ( pesSharingDevice[i] == pe ) return true;
540  }
541  return false;
542 }
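
A caller such as ComputeMgr::createComputes() uses this query to decide whether another PE drives the same CUDA device as the calling PE. A minimal hedged sketch (the function name and the otherPe parameter are illustrative, not NAMD code):

#include "DeviceCUDA.h"

// Sketch: if otherPe shares this PE's CUDA device, return the device's
// master PE; otherwise return -1.
int masterPeIfShared(DeviceCUDA *deviceCUDA, int otherPe) {
  if (deviceCUDA->device_shared_with_pe(otherPe))
    return deviceCUDA->getMasterPe();
  return -1;
}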

◆ getDeviceCount()

int DeviceCUDA::getDeviceCount ( )
inline

Definition at line 124 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches(), and ComputeCUDAMgr::initialize().

124 {return deviceCount;}

◆ getDeviceID()

int DeviceCUDA::getDeviceID ( )
inline

◆ getDeviceIDbyRank()

int DeviceCUDA::getDeviceIDbyRank ( int  rank)
inline

◆ getDeviceIDforPe()

int DeviceCUDA::getDeviceIDforPe ( int  pe)

Definition at line 523 of file DeviceCUDA.C.

References deviceIDList.

523  {
524  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
525 }
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:90
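
The lookup keys on the PE's in-node rank, taken modulo CkMyNodeSize(), into the node-wide deviceIDList[] that initialize() fills one entry per rank. A worked example with assumed values:

// Assume CkMyNodeSize() == 4 and initialize() filled
//   deviceIDList = {0, 0, 1, 1}   // one entry per rank in this node
// For a PE whose rank within the node is 2:
//   getDeviceIDforPe(pe) == deviceIDList[2 % 4] == 1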

◆ getDeviceIndex()

int DeviceCUDA::getDeviceIndex ( )
inline

Definition at line 166 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::initialize().

166 { return deviceIndex; }

◆ getGlobalDevice()

int DeviceCUDA::getGlobalDevice ( ) const
inline

Definition at line 171 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaGlobalMaster(), and ComputeMgr::recvCudaGlobalMasterCreateMsg().

171 {return globalDevice;}

◆ getGpuIsMine()

int DeviceCUDA::getGpuIsMine ( )
inline

Definition at line 141 of file DeviceCUDA.h.

141 {return gpuIsMine;}

◆ getIsGlobalDevice()

bool DeviceCUDA::getIsGlobalDevice ( ) const
inline

◆ getIsMasterDevice()

bool DeviceCUDA::getIsMasterDevice ( )

◆ getIsPmeDevice()

bool DeviceCUDA::getIsPmeDevice ( )
inline

Definition at line 168 of file DeviceCUDA.h.

168 { return isPmeDevice; }

◆ getMasterPe()

int DeviceCUDA::getMasterPe ( )
inline

◆ getMasterPeForDeviceID()

int DeviceCUDA::getMasterPeForDeviceID ( int  deviceID)

Definition at line 530 of file DeviceCUDA.C.

References masterPeList.

Referenced by CudaComputeNonbonded::assignPatches().

530  {
531  return masterPeList[deviceID % deviceCount] - 1;
532 }
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:93
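
masterPeList[] is pre-initialized to zeros and initialize() stores masterPe + 1, so a zero entry means "no master PE recorded"; this getter undoes that offset and wraps the device ID by deviceCount. A worked example with assumed values:

// Assume deviceCount == 4 and that initialize() ran with PE 6 as the
// master PE of device 2, storing masterPeList[2] = 6 + 1 = 7.  Then:
//   getMasterPeForDeviceID(2) == masterPeList[2 % 4] - 1 == 6
//   getMasterPeForDeviceID(6) == masterPeList[6 % 4] - 1 == 6   // wraps to device 2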

◆ getMaxNumBlocks()

int DeviceCUDA::getMaxNumBlocks ( )

Definition at line 564 of file DeviceCUDA.C.

References cudaCheck.

564  {
565  int dev;
566  cudaCheck(cudaGetDevice(&dev));
567  return deviceProps[dev].maxGridSize[0];
568 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:233

◆ getMaxNumThreads()

int DeviceCUDA::getMaxNumThreads ( )

Definition at line 558 of file DeviceCUDA.C.

References cudaCheck.

558  {
559  int dev;
560  cudaCheck(cudaGetDevice(&dev));
561  return deviceProps[dev].maxThreadsPerBlock;
562 }
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
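
getMaxNumThreads() and getMaxNumBlocks() report the bound device's maxThreadsPerBlock and maxGridSize[0]; a caller can clamp a 1D launch configuration to these limits. A hedged sketch (the kernel and the helper function are hypothetical):

#include <cuda_runtime.h>
#include "DeviceCUDA.h"

// Sketch: clamp a 1D launch configuration to the current device's limits.
void launchClamped(DeviceCUDA *deviceCUDA, int n, cudaStream_t stream) {
  int maxThreads = deviceCUDA->getMaxNumThreads();  // maxThreadsPerBlock
  int maxBlocks  = deviceCUDA->getMaxNumBlocks();   // maxGridSize[0]

  int threads = (256 < maxThreads) ? 256 : maxThreads;
  int blocks  = (n + threads - 1) / threads;
  if (blocks > maxBlocks) blocks = maxBlocks;  // a grid-stride loop covers the rest

  // myKernel<<<blocks, threads, 0, stream>>>(n, ...);  // hypothetical kernel
  (void)stream;
}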

◆ getMergeGrids()

int DeviceCUDA::getMergeGrids ( )
inline

Definition at line 132 of file DeviceCUDA.h.

132 {return mergegrids;}

◆ getNextPeSharingGpu()

int DeviceCUDA::getNextPeSharingGpu ( )
inline

Definition at line 136 of file DeviceCUDA.h.

136 {return nextPeSharingGpu;}

◆ getNoMergeGrids()

int DeviceCUDA::getNoMergeGrids ( )
inline

Definition at line 131 of file DeviceCUDA.h.

131 {return nomergegrids;}

◆ getNoStreaming()

int DeviceCUDA::getNoStreaming ( )
inline

Definition at line 130 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaComputeNonbonded().

130 {return nostreaming;}

◆ getNumDevice()

int DeviceCUDA::getNumDevice ( )
inline

◆ getNumPesSharingDevice()

int DeviceCUDA::getNumPesSharingDevice ( )
inline

Definition at line 138 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

138 {return numPesSharingDevice;}

◆ getPesSharingDevice()

int DeviceCUDA::getPesSharingDevice ( const int  i)
inline

Definition at line 139 of file DeviceCUDA.h.

Referenced by CudaComputeNonbonded::assignPatches().

139 {return pesSharingDevice[i];}

◆ getPmeDevice()

int DeviceCUDA::getPmeDevice ( )
inline

Definition at line 165 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice().

165 { return pmeDevice; }

◆ getPmeDeviceIndex()

int DeviceCUDA::getPmeDeviceIndex ( )
inline

Definition at line 167 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::createCudaPmeOneDevice().

167 { return pmeDeviceIndex; }
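
Together with getPmeDevice(), getIsPmeDevice() and isGpuReservedPme(), this index tells a PE whether it is responsible for launching PME work and on which device. A hedged sketch (the function name is illustrative):

#include "DeviceCUDA.h"
#include "charm++.h"

// Sketch: report this PE's PME responsibilities.
void reportPmeRole(DeviceCUDA *deviceCUDA) {
  if (deviceCUDA->getIsPmeDevice()) {
    int pmeDev    = deviceCUDA->getPmeDevice();       // device that runs PME
    int pmeIndex  = deviceCUDA->getPmeDeviceIndex();  // its index in the device list
    bool reserved = deviceCUDA->isGpuReservedPme();   // reserved exclusively for PME?
    CkPrintf("Pe %d drives PME on device %d (index %d)%s\n",
             CkMyPe(), pmeDev, pmeIndex, reserved ? " [reserved]" : "");
  }
}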

◆ getSharedGpu()

bool DeviceCUDA::getSharedGpu ( )
inline

Definition at line 135 of file DeviceCUDA.h.

135 {return sharedGpu;}

◆ initialize()

void DeviceCUDA::initialize ( void  )

Definition at line 107 of file DeviceCUDA.C.

References cuda_args, cudaCheck, cudaDie(), deviceIDList, cuda_args_t::devicelist, cuda_args_t::devicesperreplica, cuda_args_t::globaldevice, cuda_args_t::ignoresharing, masterPeList, MAX_NUM_DEVICES, MAX_NUM_RANKS, cuda_args_t::mergegrids, NAMD_die(), cuda_args_t::nomergegrids, cuda_args_t::nostreaming, cuda_args_t::pmedevice, cuda_args_t::pmePes, and cuda_args_t::usedevicelist.

Referenced by cuda_initialize().

107  {
108  // Copy command-line arguments into class
109  this->devicelist = cuda_args.devicelist;
110  this->usedevicelist = cuda_args.usedevicelist;
111  this->devicesperreplica = cuda_args.devicesperreplica;
112  this->ignoresharing = cuda_args.ignoresharing;
113  this->mergegrids = cuda_args.mergegrids;
114  this->nomergegrids = cuda_args.nomergegrids;
115  this->nostreaming = cuda_args.nostreaming;
116 #ifdef NODEGROUP_FORCE_REGISTER
117  reservePme = 0;
118  isPmeDevice = 0;
119  isGlobalDevice = false;
120 #endif
121 
122  if (CkMyPe() == 0) register_user_events();
123 
124 #if defined(CUDA_VERSION)
125  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
126 #endif
127 
128  char host[128];
129  gethostname(host, 128); host[127] = 0;
130 
131  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
132  int myRankInPhysicalNode;
133  int numPesOnPhysicalNode;
134  int *pesOnPhysicalNode;
135  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
136  &pesOnPhysicalNode,&numPesOnPhysicalNode);
137 
138  {
139  int i;
140  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
141  if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
142  i = numPesOnPhysicalNode;
143  break;
144  }
145  if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
146  }
147  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
148  CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
149  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
150  CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
151  i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
152  }
153  myRankInPhysicalNode = 0;
154  numPesOnPhysicalNode = 1;
155  pesOnPhysicalNode = new int[1];
156  pesOnPhysicalNode[0] = CkMyPe();
157  } else {
158  myRankInPhysicalNode = i;
159  }
160  }
161  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);
162 
163  deviceCount = 0;
164  cudaCheck(cudaGetDeviceCount(&deviceCount));
165  if ( deviceCount <= 0 ) {
166  cudaDie("No CUDA devices found.");
167  }
168 
169  // Store all device props
170  deviceProps = new cudaDeviceProp[deviceCount];
171  for ( int i=0; i<deviceCount; ++i ) {
172  cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
173  }
174 
175  ndevices = 0;
176  int nexclusive = 0;
177  if ( usedevicelist ) {
178  devices = new int[strlen(devicelist)];
179  int i = 0;
180  while ( devicelist[i] ) {
181  ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
182  while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
183  while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
184  }
185  } else {
186  if ( ! CkMyPe() ) {
187  CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
188  }
189  devices = new int[deviceCount];
190  for ( int i=0; i<deviceCount; ++i ) {
191  int dev = i % deviceCount;
192 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
193  cudaDeviceProp deviceProp;
194  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
195  if ( deviceProp.computeMode != cudaComputeModeProhibited
196  && (deviceProp.major >= 3)
197  && deviceProp.canMapHostMemory
198  && ( (deviceProp.multiProcessorCount > 2) ||
199  ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
200  ) {
201  devices[ndevices++] = dev;
202  }
203  if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
204  ++nexclusive;
205  }
206 #else
207  devices[ndevices++] = dev;
208 #endif
209  }
210  }
211 
212  if ( ! ndevices ) {
213  cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
214  }
215 
216  if ( devicesperreplica > 0 ) {
217  if ( devicesperreplica > ndevices ) {
218  NAMD_die("More devices per partition requested than devices are available");
219  }
220  int *olddevices = devices;
221  devices = new int[devicesperreplica];
222  for ( int i=0; i<devicesperreplica; ++i ) {
223  int mypart = CmiMyPartition();
224  devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
225  }
226  ndevices = devicesperreplica;
227  delete [] olddevices;
228  }
229 
230  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
231  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
232 
233  // This option allows users to specify the number of PEs given to the PME device.
234  // This can be used to improve loadbalancing.
235  #ifdef NODEGROUP_FORCE_REGISTER
236  const int pmePes = (cuda_args.pmePes == NULL) ? -1 : cuda_args.pmePes;
237  #else
238  const int pmePes = -1;
239  #endif
240 
241  // catch multiple processes per device
242  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
243  char msg[1024];
244  sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
245  "Sharing devices between processes is inefficient. "
246  "Specify +ignoresharing (each process uses all visible devices) if "
247  "not all devices are visible to each process, otherwise "
248  "adjust number of processes to evenly divide number of devices, "
249  "specify subset of devices with +devices argument (e.g., +devices 0,2), "
250  "or multiply list shared devices (e.g., +devices 0,1,2,0).",
251  ndevices, numPesForDevice / CkMyNodeSize() );
252  NAMD_die(msg);
253  }
254 
255  {
256  // build list of devices actually used by this node
257  nodedevices = new int[ndevices];
258  nnodedevices = 0;
259  int pe = CkNodeFirst(CkMyNode());
260  int dr = -1;
261  for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
262  int rank = ignoresharing ? i : CmiPhysicalRank(pe);
263  int peDeviceRank = rank * ndevices / numPesForDevice;
264  if ( peDeviceRank != dr ) {
265  dr = peDeviceRank;
266  nodedevices[nnodedevices++] = devices[dr];
267  }
268  }
269  }
270 
271  {
272  // check for devices used twice by this node
273  for ( int i=0; i<nnodedevices; ++i ) {
274  for ( int j=i+1; j<nnodedevices; ++j ) {
275  if ( nodedevices[i] == nodedevices[j] ) {
276  char msg[1024];
277  sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
278  NAMD_die(msg);
279  }
280  }
281  }
282  }
283 
284  sharedGpu = 0;
285  gpuIsMine = 1;
286  int firstPeSharingGpu = CkMyPe();
287  nextPeSharingGpu = CkMyPe();
288 
289  {
290  int dev;
291  if (pmePes != -1) {
292  int myDeviceRank;
293  if (myRankForDevice < pmePes) {
294  myDeviceRank = 0;
295  } else {
296  myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
297  }
298 
299  dev = devices[myDeviceRank];
300  masterPe = CkMyPe();
301  if (myRankForDevice >= pmePes) {
302  pesSharingDevice = new int[numPesForDevice];
303  masterPe = -1;
304  numPesSharingDevice = 0;
305  for ( int i = pmePes; i < numPesForDevice; ++i ) {
306  if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
307  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
308  pesSharingDevice[numPesSharingDevice++] = thisPe;
309  if ( masterPe < 1 ) masterPe = thisPe;
310  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
311  }
312  }
313  for ( int j = 0; j < ndevices; ++j ) {
314  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
315  }
316  } else {
317  #ifdef NODEGROUP_FORCE_REGISTER
318  pesSharingDevice = new int[pmePes];
319  #else
320  pesSharingDevice = NULL;
321  #endif
322  masterPe = -1;
323  numPesSharingDevice = 0;
324  for (int i = 0; i < pmePes; ++i) {
325  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
326  pesSharingDevice[numPesSharingDevice++] = thisPe;
327  if ( masterPe < 1 ) masterPe = thisPe;
328  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
329  }
330  }
331  if ( sharedGpu && masterPe == CkMyPe() ) {
332  if ( CmiPhysicalNodeID(masterPe) < 2 )
333  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
334  }
335  } else if ( numPesForDevice > 1 ) {
336  int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
337  dev = devices[myDeviceRank];
338  masterPe = CkMyPe();
339  {
340  pesSharingDevice = new int[numPesForDevice];
341  masterPe = -1;
342  numPesSharingDevice = 0;
343  for ( int i = 0; i < numPesForDevice; ++i ) {
344  if ( i * ndevices / numPesForDevice == myDeviceRank ) {
345  int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
346  pesSharingDevice[numPesSharingDevice++] = thisPe;
347  if ( masterPe < 1 ) masterPe = thisPe;
348  if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
349  }
350  }
351  for ( int j = 0; j < ndevices; ++j ) {
352  if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
353  }
354  }
355  if ( sharedGpu && masterPe == CkMyPe() ) {
356  if ( CmiPhysicalNodeID(masterPe) < 2 )
357  CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
358  }
359  } else { // in case phys node code is lying
360  dev = devices[CkMyPe() % ndevices];
361  masterPe = CkMyPe();
362  pesSharingDevice = new int[1];
363  pesSharingDevice[0] = CkMyPe();
364  numPesSharingDevice = 1;
365  }
366 
367  deviceID = dev;
368 
369  // Setting PME device for single-node scheme
370  // Sanity check in order to see if pmeDevice is contained in the device list
371  bool contained = false;
372  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
373  for(int i = 0; i < ndevices; i++){
374  if(!contained) {
375  contained = devices[i] == pmeDevice;
376  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
377  }
378  if(deviceID == devices[i]) deviceIndex = i;
379  }
380 
381  masterDevice = devices[0]; // head of device list responsible for printing stuff
382  isMasterDevice = deviceID == masterDevice;
383  if(!contained){
384  // Uses a particular device to do PME and reserves it (no other force terms on it)
385  reservePme = true;
386  pmeDeviceIndex = nnodedevices; // PME device index always at the tail of the list
387  isPmeDevice = isMasterDevice; // Master device launches work on the PME device as well
388  }else{
389  reservePme = false;
390  isPmeDevice = pmeDevice == deviceID;
391  }
392 
393  // Device for CudaGlobalMaster
394  globalDevice = (cuda_args.globaldevice < 0) ? devices[0]: cuda_args.globaldevice;
395  // Sanity check in order to see if globalDevice is contained in the device list
396  contained = false;
397  for (int i = 0; i < ndevices; ++i) {
398  if(!contained) {
399  contained = devices[i] == globalDevice;
400  }
401  }
402  if (!contained) {
403  NAMD_die("The selected GPU device for global forces is in the available devices list.\n");
404  }
405  isGlobalDevice = globalDevice == deviceID;
406 
407  // Store device IDs to node-wide list
408  if (CkMyRank() >= MAX_NUM_RANKS)
409  NAMD_die("Maximum number of ranks (2048) per node exceeded");
410  deviceIDList[CkMyRank()] = deviceID;
411 
412  if ( masterPe != CkMyPe() ) {
413  if ( CmiPhysicalNodeID(masterPe) < 2 )
414  CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
415  CkMyPe(), myRankInPhysicalNode, masterPe);
416  // for PME only
417  cudaCheck(cudaSetDevice(dev));
418  return;
419  }
420 
421  // Store master PEs for every device ID to node-wide list
422  if (deviceID >= MAX_NUM_DEVICES)
423  NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
424  masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros
425 
426  // disable token-passing but don't submit local until remote finished
427  // if shared_gpu is true, otherwise submit all work immediately
428  firstPeSharingGpu = CkMyPe();
429  nextPeSharingGpu = CkMyPe();
430 
431  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
432 
433  if ( dev >= deviceCount ) {
434  char buf[256];
435  sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
436  CkMyPe(), dev, host, deviceCount);
437  NAMD_die(buf);
438  }
439 
440  cudaDeviceProp deviceProp;
441  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
442  if ( CmiPhysicalNodeID(masterPe) < 2 )
443  CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
444  CkMyPe(), myRankInPhysicalNode, dev, host,
445  deviceProp.name,
446  (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
447  deviceProp.major, deviceProp.minor,
448  deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
449 
450  cudaCheck(cudaSetDevice(dev));
451 
452  } // just let CUDA pick a device for us
453 
454  {
455  // if only one device then already initialized in cuda_affinity_initialize()
456  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
457  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
458  cudaGetLastError();
459  } else {
460  cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
461  }
462 
463  int dev;
464  cudaCheck(cudaGetDevice(&dev));
465  deviceID = dev;
466  cudaDeviceProp deviceProp;
467  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
468  if ( deviceProp.computeMode == cudaComputeModeProhibited )
469  cudaDie("device in prohibited mode");
470  if ( deviceProp.major < 3 )
471  cudaDie("device not of compute capability 3.0 or higher");
472  if ( ! deviceProp.canMapHostMemory )
473  cudaDie("device cannot map host memory");
474 
475  // initialize the device on this thread
476  int *dummy;
477  cudaCheck(cudaMalloc(&dummy, 4));
478  }
479 
480 #if NODEGROUP_FORCE_REGISTER
481 
482  {
483  // Setting PME device for single-node scheme
484 
485  // Sanity check in order to see if pmeDevice is contained in the device list
486  bool contained = false;
487  pmeDevice = (cuda_args.pmedevice == NULL) ? devices[0]: cuda_args.pmedevice;
488  for(int i = 0; i < ndevices; i++){
489  if(!contained) {
490  contained = devices[i] == pmeDevice;
491  pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
492  }
493  if(deviceID == devices[i]) deviceIndex = i;
494  }
495 
496  if(!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
497  // Everything is OK, sets flags
498 
499  isPmeDevice = pmeDevice == deviceID;
500  masterDevice = devices[0]; // head of device list responsible for printing stuff
501  isMasterDevice = deviceID == masterDevice;
502 
503  if (pmeDeviceIndex != 0 && pmePes != -1) {
504  NAMD_die("PME device must be index 0 if pmePes is set");
505  }
506  }
507 
508 #endif
509 }
int devicesperreplica
Definition: DeviceCUDA.C:55
#define MAX_NUM_DEVICES
Definition: DeviceCUDA.C:92
int nomergegrids
Definition: DeviceCUDA.C:58
void cudaDie(const char *msg, cudaError_t err)
Definition: CudaUtils.C:9
int masterPeList[MAX_NUM_DEVICES]
Definition: DeviceCUDA.C:93
static __thread cuda_args_t cuda_args
Definition: DeviceCUDA.C:65
int globaldevice
Definition: DeviceCUDA.C:62
int usedevicelist
Definition: DeviceCUDA.C:54
int deviceIDList[MAX_NUM_RANKS]
Definition: DeviceCUDA.C:90
int mergegrids
Definition: DeviceCUDA.C:57
int nostreaming
Definition: DeviceCUDA.C:59
int pmedevice
Definition: DeviceCUDA.C:60
void NAMD_die(const char *err_msg)
Definition: common.C:147
#define MAX_NUM_RANKS
Definition: DeviceCUDA.C:89
#define cudaCheck(stmt)
Definition: CudaUtils.h:233
int ignoresharing
Definition: DeviceCUDA.C:56
char * devicelist
Definition: DeviceCUDA.C:53
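
The core of the mapping above is the integer arithmetic rank * ndevices / numPesForDevice, where rank is the PE's rank on its physical node (or within its process when +ignoresharing is given); PEs that land on the same slot share a device and elect a master PE according to WorkDistrib::pe_sortop_diffuse() ordering. A standalone sketch of that arithmetic with assumed example values:

#include <cstdio>

// Standalone illustration of the rank-to-device arithmetic used in
// initialize(); the counts below are assumed example values.
int main() {
  const int numPesForDevice = 8;  // PEs covered by the mapping
  const int ndevices        = 2;  // entries in the +devices list
  for (int rank = 0; rank < numPesForDevice; ++rank) {
    int deviceRank = rank * ndevices / numPesForDevice;  // same formula as initialize()
    std::printf("rank %d -> device slot %d\n", rank, deviceRank);
  }
  // Ranks 0-3 share slot 0 and ranks 4-7 share slot 1; each group elects a
  // master PE (ordered by WorkDistrib::pe_sortop_diffuse()).
  return 0;
}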

◆ isGpuReservedPme()

bool DeviceCUDA::isGpuReservedPme ( )
inline

Definition at line 164 of file DeviceCUDA.h.

Referenced by ComputeCUDAMgr::initialize().

164 { return reservePme; }

◆ one_device_per_node()

bool DeviceCUDA::one_device_per_node ( )

Definition at line 547 of file DeviceCUDA.C.

Referenced by ComputePmeMgr::initialize().

547  {
548  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
549  int numPesOnNodeSharingDevice = 0;
550  for ( int i=0; i<numPesSharingDevice; ++i ) {
551  if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
552  ++numPesOnNodeSharingDevice;
553  }
554  }
555  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
556 }

◆ setGpuIsMine()

void DeviceCUDA::setGpuIsMine ( const int  val)
inline

Definition at line 142 of file DeviceCUDA.h.

142 {gpuIsMine = val;}

◆ setMergeGrids()

void DeviceCUDA::setMergeGrids ( const int  val)
inline

Definition at line 133 of file DeviceCUDA.h.

133 {mergegrids = val;}

◆ setupDevicePeerAccess()

void DeviceCUDA::setupDevicePeerAccess ( )

The documentation for this class was generated from the following files:
DeviceCUDA.h
DeviceCUDA.C