NAMD
DeviceCUDA.C

#include "common.h"
#include "charm++.h"
#include "DeviceCUDA.h"
#include "WorkDistrib.h"
#include "CudaUtils.h"

#if defined(NAMD_CUDA) || defined(NAMD_HIP)
#ifdef NAMD_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#endif
#ifdef NAMD_HIP
#include <hip/hip_runtime.h>
#endif
#ifdef WIN32
#define __thread __declspec(thread)
#endif

// Global storage for CUDA devices
__thread DeviceCUDA *deviceCUDA;

void cuda_initialize() {
  deviceCUDA = new DeviceCUDA();
}

// kill all service threads
void cuda_finalize() {
  int ndevs = 0;
  cudaGetDeviceCount(&ndevs);
  for ( int dev=0; dev < ndevs; ++dev ) {
    cudaSetDevice(dev);
    cudaDeviceReset();
  }
}

// -------------------------------------------------------------------------------------------------
// Called from BackEnd.C by all processes to read command line arguments
// These argument settings are used by the DeviceCUDA class
// -------------------------------------------------------------------------------------------------
struct cuda_args_t {
  char *devicelist;
  int usedevicelist;
  int devicesperreplica;
  int ignoresharing;
  int mergegrids;
  int nomergegrids;
  int nostreaming;
};

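// cuda_args is thread-local (__thread), so each PE thread in an SMP build
// sees its own copy of the parsed argument set rather than sharing one
// mutable global across threads.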
static __thread cuda_args_t cuda_args;

void cuda_getargs(char **argv) {
  cuda_args.devicelist = 0;
  cuda_args.usedevicelist = CmiGetArgStringDesc(argv, "+devices", &cuda_args.devicelist,
      "comma-delimited list of CUDA device numbers such as 0,2,1,2");
  cuda_args.devicesperreplica = 0;
  CmiGetArgInt(argv, "+devicesperreplica", &cuda_args.devicesperreplica);
  if ( cuda_args.devicesperreplica < 0 ) NAMD_die("Devices per replica must be positive\n");
  cuda_args.ignoresharing = CmiGetArgFlag(argv, "+ignoresharing");
  cuda_args.mergegrids = CmiGetArgFlag(argv, "+mergegrids");
  cuda_args.nomergegrids = CmiGetArgFlag(argv, "+nomergegrids");
  if ( cuda_args.mergegrids && cuda_args.nomergegrids ) NAMD_die("Do not specify both +mergegrids and +nomergegrids");
  cuda_args.nostreaming = CmiGetArgFlag(argv, "+nostreaming");
}
// -------------------------------------------------------------------------------------------------

// Node-wide list of device IDs for every rank
#define MAX_NUM_RANKS 2048
int deviceIDList[MAX_NUM_RANKS];
// Node-wide list of master PEs for every device ID
#define MAX_NUM_DEVICES 256
int masterPeList[MAX_NUM_DEVICES];

// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------
// -------------------------------------------------------------------------------------------------

//
// Class constructor
//
DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {}

//
// Initialize device
//
void DeviceCUDA::initialize() {
  // Copy command-line arguments into class
  this->devicelist = cuda_args.devicelist;
  this->usedevicelist = cuda_args.usedevicelist;
  this->devicesperreplica = cuda_args.devicesperreplica;
  this->ignoresharing = cuda_args.ignoresharing;
  this->mergegrids = cuda_args.mergegrids;
  this->nomergegrids = cuda_args.nomergegrids;
  this->nostreaming = cuda_args.nostreaming;

  if (CkMyPe() == 0) register_user_events();
#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif
  char host[128];
  gethostname(host, 128);  host[127] = 0;

  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode, &numPesOnPhysicalNode);

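  // Sanity-check CmiGetPesOnPhysicalNode: the loop below expects the PE list
  // to be sorted in ascending order and to contain this PE at its physical
  // rank.  If either expectation fails, fall back to treating this PE as a
  // one-PE physical node.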
  {
    int i;
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
        i = numPesOnPhysicalNode;
        break;
      }
      if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
    }
    if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
      CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
      for ( i=0; i < numPesOnPhysicalNode; ++i ) {
        CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
                 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
      }
      myRankInPhysicalNode = 0;
      numPesOnPhysicalNode = 1;
      pesOnPhysicalNode = new int[1];
      pesOnPhysicalNode[0] = CkMyPe();
    } else {
      myRankInPhysicalNode = i;
    }
  }
  // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode);

  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }

  // Store all device props
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }

  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
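    // strlen(devicelist) over-allocates safely: each parsed device number
    // consumes at least one character of the list.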
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i, "%d", devices+ndevices);
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
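      // Keep only devices that are usable: not in prohibited compute mode,
      // compute capability >= 3.0, able to map host memory, and with more
      // than 2 multiprocessors (a weak card is still accepted as a last
      // resort on a single-node run when no device has been picked yet).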
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) )  // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }

  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }

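  // Each replica partition takes a contiguous, wrapping slice of the device
  // list: e.g. with 4 devices and +devicesperreplica 2, partition 0 uses
  // list entries 0-1 and partition 1 uses entries 2-3.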
  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }

  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;

  // catch multiple processes per device
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg, "Number of devices (%d) is not a multiple of number of processes (%d). "
        "Sharing devices between processes is inefficient. "
        "Specify +ignoresharing (each process uses all visible devices) if "
        "not all devices are visible to each process, otherwise "
        "adjust number of processes to evenly divide number of devices, "
        "specify subset of devices with +devices argument (e.g., +devices 0,2), "
        "or list shared devices multiple times (e.g., +devices 0,1,2,0).",
        ndevices, numPesForDevice / CkMyNodeSize());
    NAMD_die(msg);
  }

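  // PEs map onto devices by integer arithmetic: the PE at rank r uses
  // devices[r * ndevices / numPesForDevice], so consecutive ranks share a
  // device whenever there are more PEs than devices.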
  {
    // build list of devices actually used by this node
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }

  {
    // check for devices used twice by this node
    for ( int i=0; i<nnodedevices; ++i ) {
      for ( int j=i+1; j<nnodedevices; ++j ) {
        if ( nodedevices[i] == nodedevices[j] ) {
          char msg[1024];
          sprintf(msg, "Device %d bound twice by same process.", nodedevices[i]);
          NAMD_die(msg);
        }
      }
    }
  }

  sharedGpu = 0;
  gpuIsMine = 1;
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

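  // Pick the device for this PE and determine who owns it: all PEs that map
  // to the same device record each other in pesSharingDevice, and the first
  // of them in WorkDistrib's diffuse sort order becomes the masterPe for
  // the device.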
  {
    int dev;
    if ( numPesForDevice > 1 ) {
      int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
      dev = devices[myDeviceRank];
      masterPe = CkMyPe();
      {
        pesSharingDevice = new int[numPesForDevice];
        masterPe = -1;
        numPesSharingDevice = 0;
        for ( int i = 0; i < numPesForDevice; ++i ) {
          if ( i * ndevices / numPesForDevice == myDeviceRank ) {
            int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
            pesSharingDevice[numPesSharingDevice++] = thisPe;
            if ( masterPe < 1 ) masterPe = thisPe;
            if ( WorkDistrib::pe_sortop_diffuse()(thisPe,masterPe) ) masterPe = thisPe;
          }
        }
        for ( int j = 0; j < ndevices; ++j ) {
          if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
        }
      }
      if ( sharedGpu && masterPe == CkMyPe() ) {
        if ( CmiPhysicalNodeID(masterPe) < 2 )
          CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
      }
    } else {  // in case phys node code is lying
      dev = devices[CkMyPe() % ndevices];
      masterPe = CkMyPe();
      pesSharingDevice = new int[1];
      pesSharingDevice[0] = CkMyPe();
      numPesSharingDevice = 1;
    }

    deviceID = dev;

    // Store device IDs to node-wide list
    if (CkMyRank() >= MAX_NUM_RANKS)
      NAMD_die("Maximum number of ranks (2048) per node exceeded");
    deviceIDList[CkMyRank()] = deviceID;

    if ( masterPe != CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
                 CkMyPe(), myRankInPhysicalNode, masterPe);
      // for PME only
      cudaCheck(cudaSetDevice(dev));
      return;
    }

    // Store master PEs for every device ID to node-wide list
    if (deviceID >= MAX_NUM_DEVICES)
      NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
    masterPeList[deviceID] = masterPe + 1;  // array is pre-initialized to zeros

    // disable token-passing but don't submit local until remote finished
    // if shared_gpu is true, otherwise submit all work immediately
    firstPeSharingGpu = CkMyPe();
    nextPeSharingGpu = CkMyPe();

    gpuIsMine = ( firstPeSharingGpu == CkMyPe() );

    if ( dev >= deviceCount ) {
      char buf[256];
      sprintf(buf, "Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
              CkMyPe(), dev, host, deviceCount);
      NAMD_die(buf);
    }

    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
               CkMyPe(), myRankInPhysicalNode, dev, host,
               deviceProp.name,
               (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
               deviceProp.major, deviceProp.minor,
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

    cudaCheck(cudaSetDevice(dev));

  }  // just let CUDA pick a device for us

  {
    // if only one device then already initialized in cuda_affinity_initialize()
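    // cudaSetDeviceFlags(cudaDeviceMapHost) must run before the runtime
    // creates a context on this device; if a context is already active the
    // call returns cudaErrorSetOnActiveProcess, which is harmless here, so
    // that error is fetched and discarded rather than treated as fatal.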
    cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
    if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
      cudaGetLastError();
    } else {
      cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
    }

    int dev;
    cudaCheck(cudaGetDevice(&dev));
    deviceID = dev;
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");

    // initialize the device on this thread
    int *dummy;
    cudaCheck(cudaMalloc(&dummy, 4));
  }
}

//
// Class destructor
//
DeviceCUDA::~DeviceCUDA() {
  if (deviceProps != NULL) delete [] deviceProps;
  if (devices != NULL) delete [] devices;
  delete [] pesSharingDevice;
}

//
// Return device ID for pe. Assumes all nodes are the same
//
int DeviceCUDA::getDeviceIDforPe(int pe) {
  return deviceIDList[CkRankOf(pe) % CkMyNodeSize()];
}

//
// Returns master PE for the device ID, or -1 if device not found
//
int DeviceCUDA::getMasterPeForDeviceID(int deviceID) {
  return masterPeList[deviceID % deviceCount] - 1;
}

//
// Returns true if process "pe" shares this device
//
bool DeviceCUDA::device_shared_with_pe(int pe) {
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( pesSharingDevice[i] == pe ) return true;
  }
  return false;
}

//
// Returns true if there is a single device per node
//
bool DeviceCUDA::one_device_per_node() {
  if ( numPesSharingDevice != CkMyNodeSize() ) return false;
  int numPesOnNodeSharingDevice = 0;
  for ( int i=0; i<numPesSharingDevice; ++i ) {
    if ( CkNodeOf(pesSharingDevice[i]) == CkMyNode() ) {
      ++numPesOnNodeSharingDevice;
    }
  }
  return ( numPesOnNodeSharingDevice == CkMyNodeSize() );
}

int DeviceCUDA::getMaxNumThreads() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxThreadsPerBlock;
}

int DeviceCUDA::getMaxNumBlocks() {
  int dev;
  cudaCheck(cudaGetDevice(&dev));
  return deviceProps[dev].maxGridSize[0];
}
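
// Illustrative use of the two queries above when sizing a kernel launch
// (numTileLists is a hypothetical caller-side work count, not part of this
// file):
//   int nthread = std::min(256, deviceCUDA->getMaxNumThreads());
//   int nblock  = std::min(numTileLists, deviceCUDA->getMaxNumBlocks());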
424 
425 /*
426 BASE
427 2 types (remote & local)
428 16 pes per node
429 3 phases (1, 2, 3)
430 */
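
// The note above sketches the user-event ID layout: starting at
// CUDA_EVENT_ID_BASE, each device gets two IDs (2*DEV for remote work,
// 2*DEV+1 for local work), as registered by REGISTER_DEVICE_EVENTS below.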

void DeviceCUDA::register_user_events() {

  traceRegisterUserEvent("CUDA PME spreadCharge", CUDA_PME_SPREADCHARGE_EVENT);
  traceRegisterUserEvent("CUDA PME gatherForce", CUDA_PME_GATHERFORCE_EVENT);

  traceRegisterUserEvent("CUDA bonded", CUDA_BONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA debug", CUDA_DEBUG_EVENT);
  traceRegisterUserEvent("CUDA nonbonded", CUDA_NONBONDED_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 1 kernel", CUDA_GBIS1_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 2 kernel", CUDA_GBIS2_KERNEL_EVENT);
  traceRegisterUserEvent("CUDA GBIS Phase 3 kernel", CUDA_GBIS3_KERNEL_EVENT);

  traceRegisterUserEvent("CUDA poll remote", CUDA_EVENT_ID_POLL_REMOTE);
  traceRegisterUserEvent("CUDA poll local", CUDA_EVENT_ID_POLL_LOCAL);

#define REGISTER_DEVICE_EVENTS(DEV) \
  traceRegisterUserEvent("CUDA device " #DEV " remote", CUDA_EVENT_ID_BASE + 2 * DEV); \
  traceRegisterUserEvent("CUDA device " #DEV " local", CUDA_EVENT_ID_BASE + 2 * DEV + 1);

  REGISTER_DEVICE_EVENTS(0)
  REGISTER_DEVICE_EVENTS(1)
  REGISTER_DEVICE_EVENTS(2)
  REGISTER_DEVICE_EVENTS(3)
  REGISTER_DEVICE_EVENTS(4)
  REGISTER_DEVICE_EVENTS(5)
  REGISTER_DEVICE_EVENTS(6)
  REGISTER_DEVICE_EVENTS(7)
  REGISTER_DEVICE_EVENTS(8)
  REGISTER_DEVICE_EVENTS(9)
  REGISTER_DEVICE_EVENTS(10)
  REGISTER_DEVICE_EVENTS(11)
  REGISTER_DEVICE_EVENTS(12)
  REGISTER_DEVICE_EVENTS(13)
  REGISTER_DEVICE_EVENTS(14)
  REGISTER_DEVICE_EVENTS(15)

}

#endif  // defined(NAMD_CUDA) || defined(NAMD_HIP)