NAMD
DeviceCUDA.C
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif
#include <cstdlib>  // for std::atexit

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"
#include "Node.h"
#include "SimParameters.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA *deviceCUDA;

void cuda_finalize();

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
  if (CkMyRank() == 0) {
    std::atexit(cuda_finalize);
  }
}

// kill all service threads
void cuda_finalize() {
#if defined(NAMD_CUDA) || (defined(NAMD_HIP) && ((HIP_VERSION_MAJOR < 4) || ((HIP_VERSION_MAJOR == 4) && (HIP_VERSION_MINOR < 5))))
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
#else
  // On HIP with ROCm 4.5 and later, additional hipFree calls appear to be
  // issued after hipDeviceReset, so the device reset is skipped for now.
#endif
}

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command-line arguments.
// These argument settings are used by the DeviceCUDA class.
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
  int pmedevice;
  int pmePes;
  int globaldevice;
};

static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
      "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
#ifdef NODEGROUP_FORCE_REGISTER
  CmiGetArgInt(argv, "+pmedevice", &cuda_args.pmedevice);
  CmiGetArgInt(argv, "+pmepes", &cuda_args.pmePes);
  cuda_args.globaldevice = -1;
  CmiGetArgInt(argv, "+globaldevice", &cuda_args.globaldevice);
#endif
}
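// Illustrative launch lines (binary name and counts are hypothetical; the
// flags are the ones parsed above):
//   namd3 +p8 +devices 0,1,2,3 sim.conf          # four GPUs, two PEs each
//   namd3 +p8 +devices 0,1,2,0 sim.conf          # device 0 listed twice (shared)
//   namd3 +p8 +devices 0,1 +pmedevice 1 +pmepes 2 sim.conf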
// -------------------------------------------------------------------------------------------------

// Node-wide list of device IDs for every rank
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];

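// Note on encoding (see getMasterPeForDeviceID() below): masterPeList stores
// masterPe + 1, so zero-initialized entries read as "no master PE" and the
// getter's subtraction of 1 yields -1 for a device ID that was never bound.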
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class creator
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;
#ifdef NODEGROUP_FORCE_REGISTER
  reservePme = 0;
  isPmeDevice = 0;
  isGlobalDevice = false;
#endif

  if (CkMyPe() == 0) register_user_events();

#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif

  char host[128];
  gethostname(host, 128); host[127] = 0;

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
      &pesOnPhysicalNode,&numPesOnPhysicalNode);

  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
            i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
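    // Worked example (illustrative): for devicelist "0,2,1,2" each sscanf
    // call below reads one integer, so ndevices ends up 4 and
    // devices[] = {0, 2, 1, 2}. Allocating strlen(devicelist) ints is a safe
    // upper bound, since every device number occupies at least one character.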
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
          && (deviceProp.major >= 3)
          && deviceProp.canMapHostMemory
          && ( (deviceProp.multiProcessorCount > 2) ||
              ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
          ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
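  // Illustrative numbers: with ndevices = 4 and devicesperreplica = 2, the
  // modulo above gives partition 0 the first two list entries, partition 1
  // the next two, and partition 2 wraps around to the first two again.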

  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // This option allows users to specify the number of PEs given to the PME device.
  // This can be used to improve load balancing.
#ifdef NODEGROUP_FORCE_REGISTER
  const int pmePes = (cuda_args.pmePes == 0) ? -1 : cuda_args.pmePes;
#else
  const int pmePes = -1;
#endif

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
        "Sharing devices between processes is inefficient. "
        "Specify +ignoresharing (each process uses all visible devices) if "
        "not all devices are visible to each process, otherwise "
        "adjust number of processes to evenly divide number of devices, "
        "specify subset of devices with +devices argument (e.g., +devices 0,2), "
        "or list shared devices multiple times (e.g., +devices 0,1,2,0).",
        ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
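  // Illustrative numbers: 3 usable devices with 2 processes per physical node
  // gives numPesForDevice / CkMyNodeSize() == 2, and 3 % 2 != 0, so the run
  // aborts with the message above.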

  {
    // build list of devices actually used by this node
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
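  // Illustrative mapping: with numPesForDevice = 8 and ndevices = 2, the
  // block assignment rank * ndevices / numPesForDevice sends physical-node
  // ranks 0-3 to devices[0] and ranks 4-7 to devices[1].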

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

  {
    int dev;
    if (pmePes != -1) {
      int myDeviceRank;
      if (myRankForDevice < pmePes) {
        myDeviceRank = 0;
      } else {
        myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
      }
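      // Illustrative numbers: pmePes = 2, numPesForDevice = 8, ndevices = 3.
      // Ranks 0-1 land on device rank 0 (the PME device); ranks 2-7 map to
      // 1 + (0..5)*2/6 = 1,1,1,2,2,2, splitting them across devices 1 and 2.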

      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      if (myRankForDevice >= pmePes) {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = pmePes; i < numPesForDevice; ++i ) {
          if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      } else {
#ifdef NODEGROUP_FORCE_REGISTER
        pesSharingDevice = new int[pmePes];
#else
        pesSharingDevice = NULL;
#endif
        masterPe = -1;
        numPesSharingDevice = 0;
        for (int i = 0; i < pmePes; ++i) {
          int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
          pesSharingDevice[numPesSharingDevice++] = thisPe;
          if ( masterPe < 1 ) masterPe = thisPe;
          if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else { // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

    // Setting PME device for single-node scheme
    // Sanity check to see if pmeDevice is contained in the device list
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for(int i = 0; i < ndevices; i++){
      if(!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }

    masterDevice = devices[0]; // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;
    if(!contained){
      // Uses a particular device to do PME and reserves it (no other force terms on it)
      reservePme = true;
      pmeDeviceIndex = nnodedevices; // PME device index always at the tail of the list
      isPmeDevice = isMasterDevice; // Master device launches work on the PME device as well
    }else{
      reservePme = false;
      isPmeDevice = pmeDevice == deviceID;
    }
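    // Illustrative scenario: with "+devices 0,1 +pmedevice 2" the PME device
    // is not in the list, so reservePme becomes true and device 2 runs PME
    // only; with "+devices 0,1 +pmedevice 1" PME shares device 1 as usual.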

    // Device for CudaGlobalMaster
    globalDevice = (cuda_args.globaldevice < 0) ? devices[0] : cuda_args.globaldevice;
    // Sanity check to see if globalDevice is contained in the device list
    contained = false;
    for (int i = 0; i < ndevices; ++i) {
      if(!contained) {
        contained = devices[i] == globalDevice;
      }
    }
    if (!contained) {
      NAMD_die("The selected GPU device for global forces is not in the available device list.\n");
    }
    isGlobalDevice = globalDevice == deviceID;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
            CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
          CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
          CkMyPe(), myRankInPhysicalNode, dev, host,
          deviceProp.name,
          (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
          deviceProp.major, deviceProp.minor,
          deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  } // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }

#ifdef NODEGROUP_FORCE_REGISTER

  {
    // Setting PME device for single-node scheme

    // Sanity check to see if pmeDevice is contained in the device list
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for(int i = 0; i < ndevices; i++){
      if(!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }

    if(!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
    // Everything is OK, set flags

    isPmeDevice = pmeDevice == deviceID;
    masterDevice = devices[0]; // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;

    if (pmeDeviceIndex != 0 && pmePes != -1) {
      NAMD_die("PME device must be index 0 if pmePes is set");
    }
  }

#endif
}

//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Return device ID for pe. Assumes all nodes are the same
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns master PE for the device ID, or -1 if device not found
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if PE "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}

#ifdef NODEGROUP_FORCE_REGISTER
void DeviceCUDA::setupDevicePeerAccess() {
  int canAccessPeer;
  for(int i = 0; i < ndevices; i++){
    if (devices[i] == deviceID) continue;
    canAccessPeer = 0;
    // GPUs involved in the simulation need to be fully interconnected, otherwise we bail out
    cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, devices[i]));
    if( !canAccessPeer){
      char *err = new char[128];
      sprintf(err, "Failed setting up device peer access - Devices %d and %d are not paired.\n",
          deviceID, devices[i]);
      NAMD_die(err);
    }else{
#if 1
      cudaError_t error = cudaDeviceEnablePeerAccess(devices[i], 0); // pairs devices
      if(error == cudaErrorPeerAccessAlreadyEnabled ) {
        cudaCheck(cudaDeviceDisablePeerAccess(devices[i]));
        cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));
        cudaGetLastError(); // Clear the cudaErrorPeerAccessAlreadyEnabled error
      }
      else cudaCheck(error);

      int p2pAtomics;
      error = cudaDeviceGetP2PAttribute(&p2pAtomics, cudaDevP2PAttrNativeAtomicSupported, deviceID, devices[i]);
      cudaCheck(error);
      if (p2pAtomics != 1) {
        SimParameters* sim_params = Node::Object()->simParameters;
        if (sim_params->useDeviceMigration) {
          // Failing instead of turning off device migration in case the CPU migration step
          // has already been skipped. DMC doesn't think this is necessary, but better to be safe.
          NAMD_die("Device Migration is not supported on systems without P2P atomics.");
        }
      }
#else
      cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0)); // pairs devices
#endif
    }
  }
  if(reservePme){
    // binds this particular device to the pmeDevice
    canAccessPeer = 0;
    cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, pmeDevice));
    if( !canAccessPeer){
      char *err = new char[128];
      sprintf(err, "Failed setting up device peer access - Devices %d and %d are not paired.\n",
          deviceID, pmeDevice);
      NAMD_die(err);
    }else{
      cudaCheck(cudaDeviceEnablePeerAccess(pmeDevice, 0));
    }
  }
}
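// Illustrative: the loop above makes this device enable access to every other
// entry in devices[], so with devices[] = {0, 1, 2} each master PE running it
// yields a full peer mesh (0<->1, 0<->2, 1<->2); any missing pairing aborts.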

#ifdef NAMD_NCCL_ALLREDUCE
// Should be called by the device on rank 0 of ComputeCUDAMgr
void DeviceCUDA::setupNcclUniqueId(){
  ncclGetUniqueId( &ncclId);
}

void DeviceCUDA::setupNcclComm(){
  // Initialize communicators
  ncclCommInitRank(&ncclComm, CkNumPes(), ncclId, CkMyPe());
}
#endif /* NAMD_NCCL_ALLREDUCE */
#endif /* NODEGROUP_FORCE_REGISTER */

//
// Returns true if called by the masterPe that owns the master device
//
bool DeviceCUDA::getIsMasterDevice() {
  return (CkMyPe() == masterPe) && isMasterDevice;
}

/*
BASE
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);
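// For example (per the BASE comment above), REGISTER_DEVICE_EVENTS(3) expands
// to two registrations: "CUDA device 3 remote" at CUDA_EVENT_ID_BASE + 6 and
// "CUDA device 3 local" at CUDA_EVENT_ID_BASE + 7.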

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}

#endif // defined(NAMD_CUDA) || defined(NAMD_HIP)