NAMD
DeviceCUDA.C
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif

#include <cstdlib>  // for std::atexit

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"
#include "Node.h"
#include "SimParameters.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA* deviceCUDA;

void cuda_finalize();

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
  deviceCUDA->initialize();
  std::atexit(cuda_finalize);
}

// kill all service threads
void cuda_finalize() {
#if defined(NAMD_CUDA) || (defined(NAMD_HIP) && ((HIP_VERSION_MAJOR < 4) || ((HIP_VERSION_MAJOR == 4) && (HIP_VERSION_MINOR < 5))))
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
#else
  // On HIP with ROCm 4.5 and later, additional hipFree calls appear to be
  // issued after hipDeviceReset, so the reset is skipped for now.
#endif
}
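// Note: cudaDeviceReset() destroys each device's primary context, which is
// what actually stops the driver's service threads referenced in the comment
// above; the handler is registered via std::atexit() in cuda_initialize(),
// so it runs at normal program exit.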

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command-line arguments.
// These argument settings are used by the DeviceCUDA class.
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
  int pmedevice;
  int pmePes;
  int globaldevice;
};

static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
      "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
#ifdef NODEGROUP_FORCE_REGISTER
  CmiGetArgInt(argv, "+pmedevice", &cuda_args.pmedevice);
  CmiGetArgInt(argv, "+pmepes", &cuda_args.pmePes);
  cuda_args.globaldevice = -1;
  CmiGetArgInt(argv, "+globaldevice", &cuda_args.globaldevice);
#endif
}
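// Example (hypothetical command line, for illustration only):
//
//   charmrun +p8 namd3 +devices 0,2 sim.conf
//
// parses "0,2" into cuda_args.devicelist, and the PEs of each physical node
// are later divided between CUDA devices 0 and 2. Flags such as
// +ignoresharing and +nostreaming are simple presence tests via
// CmiGetArgFlag() and take no value.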
// -------------------------------------------------------------------------------------------------

// Node-wide list of device IDs for every rank
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];

// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class constructor
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;
#ifdef NODEGROUP_FORCE_REGISTER
  reservePme = 0;
  isPmeDevice = 0;
  isGlobalDevice = false;
#endif

  if (CkMyPe() == 0) register_user_events();

#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif

  char host[128];
  gethostname(host, 128); host[127] = 0;

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode, &numPesOnPhysicalNode);

  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
                 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i, "%d", devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0) && (CkNumNodes()==1)) )  // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }
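  // Worked example of the devicelist scan above: "+devices 0,2,1,2" yields
  // devices[] = {0, 2, 1, 2} and ndevices = 4 (a device may be listed more
  // than once to share it between processes). new int[strlen(devicelist)] is
  // a safe upper bound since every parsed number consumes at least one
  // character, and any non-digit characters act as separators.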

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i + devicesperreplica*mypart) % ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
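  // Worked example: with ndevices = 4 and +devicesperreplica 2, replica
  // (partition) 0 keeps devices[0..1] and replica 1 keeps devices[2..3]; the
  // index arithmetic (i + devicesperreplica*mypart) % ndevices wraps around,
  // so replica 2 would reuse devices[0..1] again.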

  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // This option allows users to specify the number of PEs given to the PME device.
  // This can be used to improve load balancing.
#ifdef NODEGROUP_FORCE_REGISTER
  const int pmePes = (cuda_args.pmePes == 0) ? -1 : cuda_args.pmePes;
#else
  const int pmePes = -1;
#endif

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg, "Number of devices (%d) is not a multiple of number of processes (%d). "
            "Sharing devices between processes is inefficient. "
            "Specify +ignoresharing (each process uses all visible devices) if "
            "not all devices are visible to each process, otherwise "
            "adjust number of processes to evenly divide number of devices, "
            "specify subset of devices with +devices argument (e.g., +devices 0,2), "
            "or list shared devices multiple times (e.g., +devices 0,1,2,0).",
            ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
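  // Worked example: numPesForDevice / CkMyNodeSize() is the number of
  // processes sharing the node's devices. With 3 devices and 2 processes per
  // physical node, 3 % 2 == 1 and the check above aborts; with 4 devices and
  // 2 processes it passes, and the rank mapping below gives each process a
  // disjoint pair of devices.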

  {
    // build list of devices actually used by this node
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
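  // The mapping above assigns PE rank r to device rank
  // r * ndevices / numPesForDevice (integer division), so consecutive ranks
  // share a device. E.g., with 8 PEs and 2 devices, ranks 0-3 map to device
  // rank 0 and ranks 4-7 to device rank 1; nodedevices records each distinct
  // device the first time it appears.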

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg, "Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

  {
    int dev;
    if (pmePes != -1) {
      int myDeviceRank;
      if (myRankForDevice < pmePes) {
        myDeviceRank = 0;
      } else {
        myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
      }

      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      if (myRankForDevice >= pmePes) {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = pmePes; i < numPesForDevice; ++i ) {
          if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe, masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      } else {
#ifdef NODEGROUP_FORCE_REGISTER
        pesSharingDevice = new int[pmePes];
#else
        pesSharingDevice = NULL;
#endif
        masterPe = -1;
        numPesSharingDevice = 0;
        for (int i = 0; i < pmePes; ++i) {
          int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
          pesSharingDevice[numPesSharingDevice++] = thisPe;
          if ( masterPe < 1 ) masterPe = thisPe;
          if ( WorkDistrib::pe_sortop_diffuse()(thisPe, masterPe) ) masterPe = thisPe;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe, masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else {  // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }
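    // In all three branches above, masterPe ends up as the first PE (in
    // WorkDistrib::pe_sortop_diffuse() order) among those sharing the device,
    // and pesSharingDevice lists every such PE. When pmePes is set, the first
    // pmePes ranks are pinned to device rank 0 (the PME device) and the
    // remaining ranks are spread over the other ndevices-1 devices.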

    deviceID = dev;

    // Setting PME device for the single-node scheme.
    // Sanity check to see whether pmeDevice is contained in the device list.
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for (int i = 0; i < ndevices; i++) {
      if (!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1;  // saves index for pmeDevice
      }
      if (deviceID == devices[i]) deviceIndex = i;
    }

    masterDevice = devices[0];  // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;
    if (!contained) {
      // Uses a particular device to do PME and reserves it (no other force terms on it)
      reservePme = true;
      pmeDeviceIndex = nnodedevices;  // PME device index always at the tail of the list
      isPmeDevice = isMasterDevice;   // Master device launches work on the PME device as well
    } else {
      reservePme = false;
      isPmeDevice = pmeDevice == deviceID;
    }

    // Device for CudaGlobalMaster
    globalDevice = (cuda_args.globaldevice < 0) ? devices[0] : cuda_args.globaldevice;
    // Sanity check to see whether globalDevice is contained in the device list
    contained = false;
    for (int i = 0; i < ndevices; ++i) {
      if (!contained) {
        contained = devices[i] == globalDevice;
      }
    }
    if (!contained) {
      NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
    }
    isGlobalDevice = globalDevice == deviceID;
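    // Example: with "+devices 0,1 +pmedevice 2", pmeDevice = 2 is not in the
    // device list, so reservePme becomes true and device 2 runs PME only,
    // with the master device launching work on it; with "+pmedevice 1" the
    // device is shared and reservePme stays false.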

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
                 CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if sharedGpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf, "Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
              CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
               CkMyPe(), myRankInPhysicalNode, dev, host,
               deviceProp.name,
               (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
               deviceProp.major, deviceProp.minor,
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  }  // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }
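  // cudaSetDeviceFlags() may fail with cudaErrorSetOnActiveProcess when the
  // device's context was already created (e.g., in
  // cuda_affinity_initialize()); cudaGetLastError() is called only to clear
  // that benign error. The 4-byte cudaMalloc() forces lazy context creation
  // on this thread so later calls don't pay that cost.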

#if NODEGROUP_FORCE_REGISTER

  {
    // Setting PME device for the single-node scheme.

    // Sanity check to see whether pmeDevice is contained in the device list.
    bool contained = false;
    pmeDevice = (cuda_args.pmedevice == 0) ? devices[0] : cuda_args.pmedevice;
    for (int i = 0; i < ndevices; i++) {
      if (!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1;  // saves index for pmeDevice
      }
      if (deviceID == devices[i]) deviceIndex = i;
    }

    if (!contained && CkMyPe() == 0) cudaDie("device specified for PME is not contained in +devices!");
    // Everything is OK, set flags

    isPmeDevice = pmeDevice == deviceID;
    masterDevice = devices[0];  // head of device list responsible for printing stuff
    isMasterDevice = deviceID == masterDevice;

    if (pmeDeviceIndex != 0 && pmePes != -1) {
      NAMD_die("PME device must be index 0 if pmePes is set");
    }
  }

#endif
}

//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Returns the device ID for a PE. Assumes all nodes are identical.
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns the master PE for a device ID, or -1 if the device is not found
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if PE "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}
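// Note: maxGridSize[0] is the grid limit in the x-dimension only
// (2^31 - 1 on devices of compute capability >= 3.0), which is the relevant
// bound for callers launching 1-D grids.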

#ifdef NODEGROUP_FORCE_REGISTER
void DeviceCUDA::setupDevicePeerAccess() {
  int canAccessPeer;
  for (int i = 0; i < ndevices; i++) {
    if (devices[i] == deviceID) continue;
    canAccessPeer = 0;
    // GPUs involved in the simulation need to be fully interconnected, otherwise we bail out
    cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, devices[i]));
    if (!canAccessPeer) {
      char err[128];
      sprintf(err, "Failed setting up device peer access - devices %d and %d are not paired.\n",
              deviceID, devices[i]);
      NAMD_die(err);
    } else {
#if 1
      cudaError_t error = cudaDeviceEnablePeerAccess(devices[i], 0);  // pairs devices
      if (error == cudaErrorPeerAccessAlreadyEnabled) {
        cudaCheck(cudaDeviceDisablePeerAccess(devices[i]));
        cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));
        cudaGetLastError();  // clear the cudaErrorPeerAccessAlreadyEnabled error
      }
      else cudaCheck(error);

      int p2pAtomics;
      error = cudaDeviceGetP2PAttribute(&p2pAtomics, cudaDevP2PAttrNativeAtomicSupported, deviceID, devices[i]);
      cudaCheck(error);
      if (p2pAtomics != 1) {
        SimParameters* sim_params = Node::Object()->simParameters;
        if (sim_params->useDeviceMigration) {
          // Failing instead of turning off device migration in case the CPU migration step
          // has already been skipped. DMC doesn't think this is necessary, but better to be safe.
          NAMD_die("Device Migration is not supported on systems without P2P atomics.");
        }
      }
#else
      cudaCheck(cudaDeviceEnablePeerAccess(devices[i], 0));  // pairs devices
#endif
    }
  }
  if (reservePme) {
    // pairs this device with the reserved pmeDevice
    canAccessPeer = 0;
    cudaCheck(cudaDeviceCanAccessPeer(&canAccessPeer, deviceID, pmeDevice));
    if (!canAccessPeer) {
      char err[128];
      sprintf(err, "Failed setting up device peer access - devices %d and %d are not paired.\n",
              deviceID, pmeDevice);
      NAMD_die(err);
    } else {
      cudaCheck(cudaDeviceEnablePeerAccess(pmeDevice, 0));
    }
  }
}
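// Note: cudaDeviceEnablePeerAccess() is directional, enabling access from the
// current device to the given peer only. Full bidirectional connectivity
// results because every rank runs this routine on its own device and enables
// access to all of the others.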

#ifdef NAMD_NCCL_ALLREDUCE
// Should be called by the device on rank 0 of ComputeCUDAMgr
void DeviceCUDA::setupNcclUniqueId() {
  ncclGetUniqueId(&ncclId);
}

void DeviceCUDA::setupNcclComm() {
  // Initialize communicators
  ncclCommInitRank(&ncclComm, CkNumPes(), ncclId, CkMyPe());
}
#endif

//
// Returns true if called by the masterPe that owns the master device
//
bool DeviceCUDA::getIsMasterDevice() {
  return (CkMyPe() == masterPe) && isMasterDevice;
}

#endif

/*
BASE
2 types (remote & local)
16 pes per node
3 phases (1, 2, 3)
*/

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}
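// The #DEV in the macro above stringizes the device number into the event
// name, while CUDA_EVENT_ID_BASE + 2*DEV and + 2*DEV + 1 give each device an
// even/odd event-ID pair for its remote and local work, matching the
// "2 types (remote & local), 16 pes per node" scheme noted above.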

#endif  // defined(NAMD_CUDA) || defined(NAMD_HIP)