NAMD
DeviceCUDA.C
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"
#include "Node.h"
#include "SimParameters.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA *deviceCUDA;

void cuda_finalize();

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
  if (CkMyRank() == 0) {
    std::atexit(cuda_finalize);
  }
}

// kill all service threads
void cuda_finalize() {
#if defined(NAMD_CUDA) || (defined(NAMD_HIP) && (HIP_VERSION_MAJOR < 4 || (HIP_VERSION_MAJOR == 4 && HIP_VERSION_MINOR < 5)))
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
#else
  // for HIP on ROCm versions newer than 4.5 there seem to be
  // additional hipFree calls issued after hipDeviceReset, so the reset is skipped for now
#endif
}

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command-line arguments
// These argument settings are used by the DeviceCUDA class
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
  int pmedevice;
  int pmePes;
  int globaldevice;
};

static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
    "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
#ifdef NODEGROUP_FORCE_REGISTER
  CmiGetArgInt(argv, "+pmedevice", &cuda_args.pmedevice);
  CmiGetArgInt(argv, "+pmepes", &cuda_args.pmePes);
  cuda_args.globaldevice = -1;
  CmiGetArgInt(argv, "+globaldevice", &cuda_args.globaldevice);
#endif
}
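// Illustration (hypothetical command line, not from this file): restricting a
// run to GPUs 0 and 2 could look like
//   namd3 +p8 +devices 0,2 sim.conf
// while listing a device twice, e.g. "+devices 0,1,2,0", deliberately lets two
// processes share device 0 (see the sharing diagnostics in initialize() below).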
// -------------------------------------------------------------------------------------------------

// Node-wide list of device IDs for every rank
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];

// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class creator
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;
#ifdef NODEGROUP_FORCE_REGISTER
  reservePme = 0;
  isPmeDevice = 0;
  isGlobalDevice = false;
#endif

  if (CkMyPe() == 0) register_user_events();

#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif

  char host[128];
  gethostname(host, 128); host[127] = 0;

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
    &pesOnPhysicalNode,&numPesOnPhysicalNode);

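  // Sanity-check that CmiGetPesOnPhysicalNode() returned a sorted PE list that
  // contains this PE at its physical rank; if not, fall back to treating this
  // PE as a physical node of its own.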
  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
          i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

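  // Build the devices array: either parse the +devices list with sscanf, where
  // each run of digits becomes one entry (so "0,2,1,2" yields devices =
  // {0,2,1,2}, ndevices = 4; any non-digit separator works and repeated IDs
  // are allowed to enable sharing), or enumerate all usable devices.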
  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i,"%d",devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      int computeMode;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
      if ( computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) ) // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }


  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
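  // The striping above gives each replica a contiguous, wrapping slice of the
  // device list: with ndevices = 4 and devicesperreplica = 2, partition 0 gets
  // entries {0,1}, partition 1 gets {2,3}, and partition 2 wraps back to {0,1}.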

  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // This option allows users to specify the number of PEs given to the PME device.
  // This can be used to improve load balancing.
#ifdef NODEGROUP_FORCE_REGISTER
  const int pmePes = (cuda_args.pmePes == 0) ? -1 : cuda_args.pmePes;
#else
  const int pmePes = -1;
#endif

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg,"Number of devices (%d) is not a multiple of number of processes (%d). "
      "Sharing devices between processes is inefficient. "
      "Specify +ignoresharing (each process uses all visible devices) if "
      "not all devices are visible to each process, otherwise "
      "adjust number of processes to evenly divide number of devices, "
      "specify subset of devices with +devices argument (e.g., +devices 0,2), "
      "or multiply list shared devices (e.g., +devices 0,1,2,0).",
      ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
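  // In the check above, numPesForDevice / CkMyNodeSize() is the number of
  // processes sharing the node's devices: e.g. 3 GPUs for 4 processes gives
  // 3 % 4 != 0 and aborts, while 4 GPUs for 2 processes (4 % 2 == 0) is fine.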

  {
    // build list of devices actually used by this node
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
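  // The mapping used above, rank * ndevices / numPesForDevice, deals devices
  // out in equal contiguous blocks of PEs: with 8 PEs and 2 devices, ranks 0-3
  // map to device index 0 and ranks 4-7 to device index 1.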

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg,"Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

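  // When +pmepes is given, the first pmePes ranks are all bound to devices[0]
  // (the PME device) and the remaining ranks are spread over the other
  // ndevices-1 devices; otherwise ranks are spread evenly over all devices.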
  {
    int dev;
    if (pmePes != -1) {
      int myDeviceRank;
      if (myRankForDevice < pmePes) {
        myDeviceRank = 0;
      } else {
        myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
      }

      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      if (myRankForDevice >= pmePes) {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = pmePes; i < numPesForDevice; ++i ) {
          if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      } else {
#ifdef NODEGROUP_FORCE_REGISTER
        pesSharingDevice = new int[pmePes];
#else
        pesSharingDevice = NULL;
#endif
        masterPe = -1;
        numPesSharingDevice = 0;
        for (int i = 0; i < pmePes; ++i) {
          int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
          pesSharingDevice[numPesSharingDevice++] = thisPe;
          if ( masterPe < 1 ) masterPe = thisPe;
          if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else { // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

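    // Among the PEs sharing a device, the master PE is the one that sorts
    // first under WorkDistrib::pe_sortop_diffuse(); non-master PEs simply bind
    // to the master's device and return early (see below).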
    // Setting PME device for single-node scheme
    // Sanity check in order to see if pmeDevice is contained in the device list
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for(int i = 0; i < ndevices; i++){
      if(!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }

    masterDevice = devices[0]; // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;
    if(!contained){
      // Uses a particular device to do PME and reserves it (no other force terms on it)
      reservePme = true;
      pmeDeviceIndex = nnodedevices; // PME device index always at the tail of the list
      isPmeDevice = isMasterDevice; // Master device launches work on the PME device as well
    }else{
      reservePme = false;
      isPmeDevice = pmeDevice == deviceID;
    }

    // Device for CudaGlobalMaster
    globalDevice = (cuda_args.globaldevice < 0) ? devices[0] : cuda_args.globaldevice;
    // Sanity check in order to see if globalDevice is contained in the device list
    contained = false;
    for (int i = 0; i < ndevices; ++i) {
      if(!contained) {
        contained = devices[i] == globalDevice;
      }
    }
    if (!contained) {
      NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
    }
    isGlobalDevice = globalDevice == deviceID;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
          CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1; // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf,"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
        CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
        CkMyPe(), myRankInPhysicalNode, dev, host,
        deviceProp.name,
        (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
        deviceProp.major, deviceProp.minor,
        deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  } // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    int computeMode;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
    if ( computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }
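  // The 4-byte cudaMalloc above exists only to force creation of the CUDA
  // context on this thread; the pointer is intentionally never used or freed.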
#if NODEGROUP_FORCE_REGISTER

  {
    // Setting PME device for single-node scheme

    // Sanity check in order to see if pmeDevice is contained in the device list
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for(int i = 0; i < ndevices; i++){
      if(!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1; // saves index for pmeDevice
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }

    if(!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
    // Everything is OK, set flags

    isPmeDevice = pmeDevice == deviceID;
    masterDevice = devices[0]; // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;

    if (pmeDeviceIndex != 0 && pmePes != -1) {
      NAMD_die("PME device must be index 0 if pmePes is set");
    }
  }

#endif
}


//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Return device ID for pe. Assumes all nodes are the same
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns master PE for the device ID, or -1 if device not found
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if process "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}
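// A typical use of the two getters above is to clamp a launch grid computed
// from the problem size, e.g. (hypothetical caller, for illustration only):
//   int nthreads = deviceCUDA->getMaxNumThreads();
//   int nblocks  = std::min((natoms + nthreads - 1)/nthreads,
//                           deviceCUDA->getMaxNumBlocks());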

#ifdef NODEGROUP_FORCE_REGISTER
void DeviceCUDA::setupDevicePeerAccess() {
  int canAccessPeer;
  for(int i = 0; i < ndevices; i++){
    if (devices[i] == deviceID) continue;
    canAccessPeer = 0;
    // GPUs involved in the simulation need to be fully interconnected, otherwise we bail out
    cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, devices[i]));
    if( !canAccessPeer){
      char *err = new char[128];
      sprintf(err, "Failed setting up device peer access - Devices %d and %d are not paired.\n",
        deviceID, devices[i]);
      NAMD_die(err);
    }else{
#if 1
      cudaError_t error = cudaDeviceEnablePeerAccess(devices[i], 0); // pairs devices
      if(error == cudaErrorPeerAccessAlreadyEnabled ) {
        cudaCheck(cudaDeviceDisablePeerAccess(devices[i]));
        cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));
        cudaGetLastError(); // Clear the cudaErrorPeerAccessAlreadyEnabled error
      }
      else cudaCheck(error);

      int p2pAtomics;
      error = cudaDeviceGetP2PAttribute(&p2pAtomics, cudaDevP2PAttrNativeAtomicSupported, deviceID, devices[i]);
      cudaCheck(error);
      if (p2pAtomics != 1) {
        SimParameters* sim_params = Node::Object()->simParameters;
        if (sim_params->useDeviceMigration) {
          // Failing instead of turning off device migration, in case the CPU migration step
          // has already been skipped. DMC doesn't think this is necessary, but better to be safe.
          NAMD_die("Device Migration is not supported on systems without P2P atomics.");
        }
      }
#else
      cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0)); // pairs devices
#endif
    }
  }
  if(reservePme){
    // binds this particular device to the pmeDevice
    canAccessPeer = 0;
    cudaCheck(cudaDeviceCanAccessPeer( &canAccessPeer, deviceID, pmeDevice));
    if( !canAccessPeer){
      char *err = new char[128];
      sprintf(err, "Failed setting up device peer access - Devices %d and %d are not paired.\n",
        deviceID, pmeDevice);
      NAMD_die(err);
    }else{
      cudaCheck(cudaDeviceEnablePeerAccess(pmeDevice, 0));
    }
  }
}

#ifdef NAMD_NCCL_ALLREDUCE
// Should be called on rank 0 only, by ComputeCUDAMgr
void DeviceCUDA::setupNcclUniqueId(){
  ncclGetUniqueId( &ncclId);
}

void DeviceCUDA::setupNcclComm(){
  // Initialize communicators
  ncclCommInitRank(&ncclComm, CkNumPes(), ncclId, CkMyPe());
}
#endif /* NAMD_NCCL_ALLREDUCE */
#endif /* NODEGROUP_FORCE_REGISTER */

//
// Returns true if called by the masterPe that owns the master device
//
bool DeviceCUDA::getIsMasterDevice() {
  return (CkMyPe() == masterPe) && isMasterDevice;
}

/*
BASE
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}

#endif // NAMD_CUDA