// NOTE(review): this chunk is a garbled extract of NAMD's CUDA device-setup
// host code; the original file's line numbers are fused into the text
// (e.g. "118", "121") and several braces/#endif lines were lost in
// extraction. Comments below annotate intent without altering the text.
118 #ifdef NODEGROUP_FORCE_REGISTER 121 isGlobalDevice =
false;
// Register Projections/tracing user events once, from PE 0 only.
124 if (CkMyPe() == 0) register_user_events();
// Report the CUDA toolkit version this binary was built against.
126 #if defined(CUDA_VERSION) 127 if (CkMyPe() == 0) CkPrintf(
"Info: Built with CUDA version %d\n", CUDA_VERSION);
// Record this host's name, force NUL termination at the buffer end.
131 gethostname(host, 128); host[127] = 0;
// Ask Charm++ for this PE's physical node and the PEs co-located on it.
133 int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
134 int myRankInPhysicalNode;
135 int numPesOnPhysicalNode;
136 int *pesOnPhysicalNode;
137 CmiGetPesOnPhysicalNode(myPhysicalNodeID,
138 &pesOnPhysicalNode,&numPesOnPhysicalNode);
// Scan the PE list for our own PE (our rank within the physical node),
// simultaneously verifying the list is strictly ascending.
142 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
143 if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
// Unsorted list: push i past the end so the error branch below fires.
144 i = numPesOnPhysicalNode;
147 if ( pesOnPhysicalNode[i] == CkMyPe() )
break;
// Sanity check: our position must agree with CmiPhysicalRank(); if not,
// dump the whole list and fall back to a one-PE pseudo-node.
149 if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
150 CkPrintf(
"Bad result from CmiGetPesOnPhysicalNode!\n");
151 for ( i=0; i < numPesOnPhysicalNode; ++i ) {
152 CkPrintf(
"pe %d physnode rank %d of %d is %d\n", CkMyPe(),
153 i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
// Fallback: treat this PE as if it were alone on its physical node.
155 myRankInPhysicalNode = 0;
156 numPesOnPhysicalNode = 1;
157 pesOnPhysicalNode =
new int[1];
158 pesOnPhysicalNode[0] = CkMyPe();
// Normal path: the loop index where we found ourselves is our rank.
160 myRankInPhysicalNode = i;
// Enumerate visible CUDA devices; abort the run if none are present.
166 cudaCheck(cudaGetDeviceCount(&deviceCount));
167 if ( deviceCount <= 0 ) {
168 cudaDie(
"No CUDA devices found.");
// Cache every device's properties for the usability checks below.
172 deviceProps =
new cudaDeviceProp[deviceCount];
173 for (
int i=0; i<deviceCount; ++i ) {
174 cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
// Build the usable-device list: either parse the user's "+devices i,j,k"
// argument, or scan all visible devices and keep the usable ones.
179 if ( usedevicelist ) {
// strlen(devicelist) over-allocates but always suffices, since each
// parsed device id consumes at least one character of the string.
180 devices =
new int[strlen(devicelist)];
182 while ( devicelist[i] ) {
183 ndevices += sscanf(devicelist+i,
"%d",devices+ndevices);
// Advance past the digits just consumed, then past any separators.
184 while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
185 while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
189 CkPrintf(
"Did not find +devices i,j,k,... argument, using all\n");
191 devices =
new int[deviceCount];
192 for (
int i=0; i<deviceCount; ++i ) {
193 int dev = i % deviceCount;
194 #if CUDA_VERSION >= 2020 || defined(NAMD_HIP) 195 cudaDeviceProp deviceProp;
197 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
198 cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
// Accept a device only if it is not in prohibited compute mode, has
// compute capability >= 3.0, can map host memory, and is either big
// enough (> 2 SMs) or the only candidate on a single-node run.
199 if ( computeMode != cudaComputeModeProhibited
200 && (deviceProp.major >= 3)
201 && deviceProp.canMapHostMemory
202 && ( (deviceProp.multiProcessorCount > 2) ||
203 ((ndevices==0)&&(CkNumNodes()==1)) )
205 devices[ndevices++] = dev;
// NOTE(review): exclusive-mode devices appear to be appended here too,
// but the enclosing context was lost in this extract — confirm upstream.
207 if ( computeMode == cudaComputeModeExclusive ) {
211 devices[ndevices++] = dev;
// No device survived the filter: fail with a descriptive message.
217 cudaDie(
"all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
// With +devicesperreplica, give each partition (replica) its own rotated
// slice of the device list.
220 if ( devicesperreplica > 0 ) {
221 if ( devicesperreplica > ndevices ) {
222 NAMD_die(
"More devices per partition requested than devices are available");
224 int *olddevices = devices;
225 devices =
new int[devicesperreplica];
226 for (
int i=0; i<devicesperreplica; ++i ) {
227 int mypart = CmiMyPartition();
// Offset by partition index so distinct replicas use distinct devices.
228 devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
230 ndevices = devicesperreplica;
231 delete [] olddevices;
// Choose the PE grouping used for device sharing: Charm++ in-process ranks
// with +ignoresharing, physical-node ranks otherwise.
234 int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
235 int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
239 #ifdef NODEGROUP_FORCE_REGISTER 242 const int pmePes = -1;
// Require device count to divide evenly among the processes sharing them;
// otherwise build a detailed remediation message.
246 if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
248 sprintf(msg,
"Number of devices (%d) is not a multiple of number of processes (%d). " 249 "Sharing devices between processes is inefficient. " 250 "Specify +ignoresharing (each process uses all visible devices) if " 251 "not all devices are visible to each process, otherwise " 252 "adjust number of processes to evenly divide number of devices, " 253 "specify subset of devices with +devices argument (e.g., +devices 0,2), " 254 "or multiply list shared devices (e.g., +devices 0,1,2,0).",
255 ndevices, numPesForDevice / CkMyNodeSize() );
// Collect the distinct devices bound by PEs of this Charm++ node.
261 nodedevices =
new int[ndevices];
263 int pe = CkNodeFirst(CkMyNode());
265 for (
int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
266 int rank = ignoresharing ? i : CmiPhysicalRank(pe);
// Block-distribute devices over ranks; record each device the first time
// its rank-block is encountered (dr tracks the previous device rank).
267 int peDeviceRank = rank * ndevices / numPesForDevice;
268 if ( peDeviceRank != dr ) {
270 nodedevices[nnodedevices++] = devices[dr];
// A device may not be bound twice by the same process — O(n^2) scan is
// fine here since nnodedevices is small.
277 for (
int i=0; i<nnodedevices; ++i ) {
278 for (
int j=i+1; j<nnodedevices; ++j ) {
279 if ( nodedevices[i] == nodedevices[j] ) {
281 sprintf(msg,
"Device %d bound twice by same process.", nodedevices[i]);
290 int firstPeSharingGpu = CkMyPe();
291 nextPeSharingGpu = CkMyPe();
// Assign this PE a device. With dedicated PME PEs (pmePes >= 0) the first
// pmePes ranks are handled separately and the remaining ranks are
// block-distributed over devices[1..]; masterPe becomes the lowest PE
// bound to each device.
297 if (myRankForDevice < pmePes) {
300 myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
303 dev = devices[myDeviceRank];
305 if (myRankForDevice >= pmePes) {
306 pesSharingDevice =
new int[numPesForDevice];
308 numPesSharingDevice = 0;
// Gather every PE whose rank maps to the same device rank as ours.
309 for (
int i = pmePes; i < numPesForDevice; ++i ) {
310 if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
311 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
312 pesSharingDevice[numPesSharingDevice++] = thisPe;
313 if ( masterPe < 1 ) masterPe = thisPe;
// The GPU is "shared" if another device rank maps to the same device id
// (possible when +devices lists a device more than once).
317 for (
int j = 0; j < ndevices; ++j ) {
318 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
321 #ifdef NODEGROUP_FORCE_REGISTER 322 pesSharingDevice =
new int[pmePes];
324 pesSharingDevice = NULL;
327 numPesSharingDevice = 0;
328 for (
int i = 0; i < pmePes; ++i) {
329 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
330 pesSharingDevice[numPesSharingDevice++] = thisPe;
331 if ( masterPe < 1 ) masterPe = thisPe;
// Only announce sharing for the first couple of physical nodes to keep
// startup output bounded on large runs.
335 if ( sharedGpu && masterPe == CkMyPe() ) {
336 if ( CmiPhysicalNodeID(masterPe) < 2 )
337 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
// No dedicated PME PEs: plain block distribution of PEs over devices.
339 }
else if ( numPesForDevice > 1 ) {
340 int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
341 dev = devices[myDeviceRank];
344 pesSharingDevice =
new int[numPesForDevice];
346 numPesSharingDevice = 0;
347 for (
int i = 0; i < numPesForDevice; ++i ) {
348 if ( i * ndevices / numPesForDevice == myDeviceRank ) {
349 int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
350 pesSharingDevice[numPesSharingDevice++] = thisPe;
351 if ( masterPe < 1 ) masterPe = thisPe;
355 for (
int j = 0; j < ndevices; ++j ) {
356 if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
359 if ( sharedGpu && masterPe == CkMyPe() ) {
360 if ( CmiPhysicalNodeID(masterPe) < 2 )
361 CkPrintf(
"Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
// Final fallback: one PE per sharing group — round-robin by PE number.
364 dev = devices[CkMyPe() % ndevices];
366 pesSharingDevice =
new int[1];
367 pesSharingDevice[0] = CkMyPe();
368 numPesSharingDevice = 1;
// Resolve indices/flags for the PME device and the global-forces device
// within the +devices list.
375 bool contained =
false;
377 for(
int i = 0; i < ndevices; i++){
// NOTE(review): "contained" and "pmeDeviceIndex" are overwritten on every
// iteration, so only the comparison against the LAST device sticks.
// Likely the intent was to latch on a match — confirm against upstream.
379 contained = devices[i] == pmeDevice;
380 pmeDeviceIndex = (contained) ? i : -1;
382 if(deviceID == devices[i]) deviceIndex = i;
385 masterDevice = devices[0];
386 isMasterDevice = deviceID == masterDevice;
// PME device not found in the list: treat it as an extra device slot.
390 pmeDeviceIndex = nnodedevices;
391 isPmeDevice = isMasterDevice;
394 isPmeDevice = pmeDevice == deviceID;
401 for (
int i = 0; i < ndevices; ++i) {
403 contained = devices[i] == globalDevice;
// NOTE(review): this message sits on an error path yet reads as the
// positive case ("is in the available devices list"); wording or the
// guarding condition looks inverted — verify before relying on it.
407 NAMD_die(
"The selected GPU device for global forces is in the available devices list.\n");
409 isGlobalDevice = globalDevice == deviceID;
416 if ( masterPe != CkMyPe() ) {
417 if ( CmiPhysicalNodeID(masterPe) < 2 )
418 CkPrintf(
"Pe %d physical rank %d will use CUDA device of pe %d\n",
419 CkMyPe(), myRankInPhysicalNode, masterPe);
427 NAMD_die(
"Maximum number of CUDA devices (256) per node exceeded");
432 firstPeSharingGpu = CkMyPe();
433 nextPeSharingGpu = CkMyPe();
435 gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
437 if ( dev >= deviceCount ) {
439 sprintf(buf,
"Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
440 CkMyPe(), dev, host, deviceCount);
444 cudaDeviceProp deviceProp;
445 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
446 if ( CmiPhysicalNodeID(masterPe) < 2 )
447 CkPrintf(
"Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
448 CkMyPe(), myRankInPhysicalNode, dev, host,
450 (
unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
451 deviceProp.major, deviceProp.minor,
452 deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
460 cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
461 if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
464 cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
470 cudaDeviceProp deviceProp;
472 cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
473 cudaCheck(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
474 if ( computeMode == cudaComputeModeProhibited )
475 cudaDie(
"device in prohibited mode");
476 if ( deviceProp.major < 3 )
477 cudaDie(
"device not of compute capability 3.0 or higher");
478 if ( ! deviceProp.canMapHostMemory )
479 cudaDie(
"device cannot map host memory");
// Nodegroup build: verify the PME device is one of the +devices entries
// and record master/PME device identity for this PE.
486 #if NODEGROUP_FORCE_REGISTER 492 bool contained =
false;
494 for(
int i = 0; i < ndevices; i++){
// NOTE(review): as in the earlier loop, "contained" is overwritten each
// iteration, so only the last device is effectively compared — confirm
// against upstream NAMD.
496 contained = devices[i] == pmeDevice;
497 pmeDeviceIndex = (contained) ? i : -1;
499 if(deviceID == devices[i]) deviceIndex = i;
502 if(!contained && CkMyPe() == 0)
cudaDie(
"device specified for PME is not contained in +devices!");
505 isPmeDevice = pmeDevice == deviceID;
506 masterDevice = devices[0];
507 isMasterDevice = deviceID == masterDevice;
// PME decomposition requires the PME device to be devices[0].
509 if (pmeDeviceIndex != 0 && pmePes != -1) {
510 NAMD_die(
"PME device must be index 0 if pmePes is set");
// NOTE(review): trailing fragment — these look like declarations/signatures
// pulled from elsewhere in the file (terminating semicolons lost in
// extraction); kept verbatim.
void cudaDie(const char *msg, cudaError_t err)
int masterPeList[MAX_NUM_DEVICES]
static __thread cuda_args_t cuda_args
int deviceIDList[MAX_NUM_RANKS]
void NAMD_die(const char *err_msg)