#ifdef NODEGROUP_FORCE_REGISTER
  isGlobalDevice = false;
#endif
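// In GPU-resident builds (NODEGROUP_FORCE_REGISTER) the global-forces device
// flag starts out cleared here; it is recomputed once devices have been
// assigned below (see isGlobalDevice further down).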
  if (CkMyPe() == 0) register_user_events();
#if defined(CUDA_VERSION)
  if (CkMyPe() == 0) CkPrintf("Info: Built with CUDA version %d\n", CUDA_VERSION);
#endif
  char host[128];
  gethostname(host, 128);  host[127] = 0;
  int myPhysicalNodeID = CmiPhysicalNodeID(CkMyPe());
  int myRankInPhysicalNode;
  int numPesOnPhysicalNode;
  int *pesOnPhysicalNode;
  CmiGetPesOnPhysicalNode(myPhysicalNodeID,
                          &pesOnPhysicalNode, &numPesOnPhysicalNode);
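  // Sanity-check the physical-node PE list returned above: it should be
  // sorted in increasing order and should contain this PE at the position
  // reported by CmiPhysicalRank().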
  int i;
  for ( i=0; i < numPesOnPhysicalNode; ++i ) {
    if ( i && (pesOnPhysicalNode[i] <= pesOnPhysicalNode[i-1]) ) {
      i = numPesOnPhysicalNode;
      break;
    }
    if ( pesOnPhysicalNode[i] == CkMyPe() ) break;
  }
  if ( i == numPesOnPhysicalNode || i != CmiPhysicalRank(CkMyPe()) ) {
    CkPrintf("Bad result from CmiGetPesOnPhysicalNode!\n");
    for ( i=0; i < numPesOnPhysicalNode; ++i ) {
      CkPrintf("pe %d physnode rank %d of %d is %d\n", CkMyPe(),
               i, numPesOnPhysicalNode, pesOnPhysicalNode[i]);
    }
    // Fall back to treating this PE as its own physical node.
    myRankInPhysicalNode = 0;
    numPesOnPhysicalNode = 1;
    pesOnPhysicalNode = new int[1];
    pesOnPhysicalNode[0] = CkMyPe();
  } else {
    myRankInPhysicalNode = i;
  }
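  // Enumerate the CUDA devices visible to this process; everything below
  // selects among them and maps PEs onto them.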
  deviceCount = 0;
  cudaCheck(cudaGetDeviceCount(&deviceCount));
  if ( deviceCount <= 0 ) {
    cudaDie("No CUDA devices found.");
  }
  deviceProps = new cudaDeviceProp[deviceCount];
  for ( int i=0; i<deviceCount; ++i ) {
    cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i));
  }
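  // Build the candidate device list: honor an explicit +devices i,j,k,...
  // argument if one was given, otherwise auto-select every usable device.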
  ndevices = 0;
  int nexclusive = 0;
  if ( usedevicelist ) {
    devices = new int[strlen(devicelist)];
    int i = 0;
    while ( devicelist[i] ) {
      ndevices += sscanf(devicelist+i, "%d", devices+ndevices);
      // Skip past the digits just consumed and any separators that follow.
      while ( devicelist[i] && isdigit(devicelist[i]) ) ++i;
      while ( devicelist[i] && ! isdigit(devicelist[i]) ) ++i;
    }
  } else {
    if ( ! CkMyPe() ) {
      CkPrintf("Did not find +devices i,j,k,... argument, using all\n");
    }
    devices = new int[deviceCount];
    for ( int i=0; i<deviceCount; ++i ) {
      int dev = i % deviceCount;
#if CUDA_VERSION >= 2020 || defined(NAMD_HIP)
      cudaDeviceProp deviceProp;
      cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
      if ( deviceProp.computeMode != cudaComputeModeProhibited
           && (deviceProp.major >= 3)
           && deviceProp.canMapHostMemory
           && ( (deviceProp.multiProcessorCount > 2) ||
                ((ndevices==0)&&(CkNumNodes()==1)) )  // exclude weak cards
         ) {
        devices[ndevices++] = dev;
      }
      if ( deviceProp.computeMode == cudaComputeModeExclusive ) {
        ++nexclusive;
      }
#else
      devices[ndevices++] = dev;
#endif
    }
  }
  if ( ! ndevices ) {
    cudaDie("all devices are in prohibited mode, of compute capability < 3.0, unable to map host memory, too small, or otherwise unusable");
  }
  if ( devicesperreplica > 0 ) {
    if ( devicesperreplica > ndevices ) {
      NAMD_die("More devices per partition requested than devices are available");
    }
    // Give each replica partition its own rotated slice of the device list.
    int *olddevices = devices;
    devices = new int[devicesperreplica];
    for ( int i=0; i<devicesperreplica; ++i ) {
      int mypart = CmiMyPartition();
      devices[i] = olddevices[(i+devicesperreplica*mypart)%ndevices];
    }
    ndevices = devicesperreplica;
    delete [] olddevices;
  }
  int myRankForDevice = ignoresharing ? CkMyRank() : myRankInPhysicalNode;
  int numPesForDevice = ignoresharing ? CkMyNodeSize() : numPesOnPhysicalNode;
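  // With +ignoresharing, PEs are ranked within the logical (SMP) node
  // rather than the physical node, for launches where each process sees
  // only its own devices (cf. the error message below).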
#ifdef NODEGROUP_FORCE_REGISTER
  const int pmePes = /* number of PEs dedicated to PME; elided in this excerpt */;
#else
  const int pmePes = -1;
#endif
  if ( ndevices % ( numPesForDevice / CkMyNodeSize() ) ) {
    char msg[1024];
    sprintf(msg, "Number of devices (%d) is not a multiple of number of processes (%d). "
        "Sharing devices between processes is inefficient. "
        "Specify +ignoresharing (each process uses all visible devices) if "
        "not all devices are visible to each process, otherwise "
        "adjust number of processes to evenly divide number of devices, "
        "specify subset of devices with +devices argument (e.g., +devices 0,2), "
        "or multiply list shared devices (e.g., +devices 0,1,2,0).",
        ndevices, numPesForDevice / CkMyNodeSize() );
    NAMD_die(msg);
  }
  {
    // Record, in order, each distinct device used by a PE of this node.
    nodedevices = new int[ndevices];
    nnodedevices = 0;
    int pe = CkNodeFirst(CkMyNode());
    int dr = -1;
    for ( int i=0; i<CkMyNodeSize(); ++i, ++pe ) {
      int rank = ignoresharing ? i : CmiPhysicalRank(pe);
      int peDeviceRank = rank * ndevices / numPesForDevice;
      if ( peDeviceRank != dr ) {
        dr = peDeviceRank;
        nodedevices[nnodedevices++] = devices[dr];
      }
    }
  }
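  // Illustration of the block mapping above (not from the original source):
  // with numPesForDevice = 6 and ndevices = 2, peDeviceRank = rank * 2 / 6
  // assigns ranks 0-2 to devices[0] and ranks 3-5 to devices[1], so
  // consecutive PEs share a device and nodedevices lists each device once.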
  // A device may be shared between processes, but never bound twice within
  // the same process.
  for ( int i=0; i<nnodedevices; ++i ) {
    for ( int j=i+1; j<nnodedevices; ++j ) {
      if ( nodedevices[i] == nodedevices[j] ) {
        char msg[1024];
        sprintf(msg, "Device %d bound twice by same process.", nodedevices[i]);
        NAMD_die(msg);
      }
    }
  }
  int firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();
  int dev;
  if ( pmePes > 0 ) {
    // PME PEs (ranks 0 .. pmePes-1) all map to devices[0]; the remaining
    // PEs are block-distributed over devices[1 .. ndevices-1].
    int myDeviceRank;
    if (myRankForDevice < pmePes) {
      myDeviceRank = 0;
    } else {
      myDeviceRank = 1 + (myRankForDevice-pmePes) * (ndevices-1) / (numPesForDevice-pmePes);
    }
    dev = devices[myDeviceRank];
    if (myRankForDevice >= pmePes) {
      pesSharingDevice = new int[numPesForDevice];
      masterPe = -1;
      numPesSharingDevice = 0;
      for ( int i = pmePes; i < numPesForDevice; ++i ) {
        if ( 1 + (i-pmePes) * (ndevices-1) / (numPesForDevice-pmePes) == myDeviceRank ) {
          int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
          pesSharingDevice[numPesSharingDevice++] = thisPe;
          if ( masterPe < 1 ) masterPe = thisPe;
        }
      }
      for ( int j = 0; j < ndevices; ++j ) {
        if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
      }
    } else {
#ifdef NODEGROUP_FORCE_REGISTER
      pesSharingDevice = new int[pmePes];
#else
      pesSharingDevice = NULL;
#endif
      masterPe = -1;
      numPesSharingDevice = 0;
      for ( int i = 0; i < pmePes; ++i) {
        int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( masterPe < 1 ) masterPe = thisPe;
      }
    }
    if ( sharedGpu && masterPe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
    }
  } else if ( numPesForDevice > 1 ) {
    // No dedicated PME PEs: block-distribute all PEs over all devices.
    int myDeviceRank = myRankForDevice * ndevices / numPesForDevice;
    dev = devices[myDeviceRank];
    pesSharingDevice = new int[numPesForDevice];
    masterPe = -1;
    numPesSharingDevice = 0;
    for ( int i = 0; i < numPesForDevice; ++i ) {
      if ( i * ndevices / numPesForDevice == myDeviceRank ) {
        int thisPe = ignoresharing ? (CkNodeFirst(CkMyNode())+i) : pesOnPhysicalNode[i];
        pesSharingDevice[numPesSharingDevice++] = thisPe;
        if ( masterPe < 1 ) masterPe = thisPe;
      }
    }
    for ( int j = 0; j < ndevices; ++j ) {
      if ( devices[j] == dev && j != myDeviceRank ) sharedGpu = 1;
    }
    if ( sharedGpu && masterPe == CkMyPe() ) {
      if ( CmiPhysicalNodeID(masterPe) < 2 )
        CkPrintf("Pe %d sharing CUDA device %d\n", CkMyPe(), dev);
    }
  } else {  // in case the physical node query is wrong
    dev = devices[CkMyPe() % ndevices];
    masterPe = CkMyPe();
    pesSharingDevice = new int[1];
    pesSharingDevice[0] = CkMyPe();
    numPesSharingDevice = 1;
  }
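  // Illustration of the PME-aware mapping above (not from the original
  // source): with pmePes = 2, numPesForDevice = 8, and ndevices = 4, ranks
  // 0-1 (the PME PEs) map to devices[0], and ranks 2-7 are spread over
  // devices[1..3] by 1 + (rank-2)*3/6: ranks 2-3 -> devices[1],
  // ranks 4-5 -> devices[2], ranks 6-7 -> devices[3].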
#ifdef NODEGROUP_FORCE_REGISTER
  // Locate the PME device and this PE's device in the devices list.
  bool contained = false;
  for( int i = 0; i < ndevices; i++){
    if (!contained) {
      contained = devices[i] == pmeDevice;
      pmeDeviceIndex = (contained) ? i : -1;
    }
    if(deviceID == devices[i]) deviceIndex = i;
  }
  masterDevice = devices[0];
  isMasterDevice = deviceID == masterDevice;
  if (!contained) {
    // pmeDevice is not among the selected devices.
    pmeDeviceIndex = nnodedevices;
    isPmeDevice = isMasterDevice;
  } else {
    isPmeDevice = pmeDevice == deviceID;
  }

  // Verify that the device requested for global forces is usable.
  contained = false;
  for ( int i = 0; i < ndevices; ++i) {
    if (!contained) contained = devices[i] == globalDevice;
  }
  if (!contained) {
    NAMD_die("The selected GPU device for global forces is not in the available devices list.\n");
  }
  isGlobalDevice = globalDevice == deviceID;
#endif
  // Store this rank's device ID in the node-wide list.
  if ( CkMyRank() >= MAX_NUM_RANKS )
    NAMD_die("Maximum number of ranks (2048) per node exceeded");
  deviceIDList[CkMyRank()] = dev;
  if ( masterPe != CkMyPe() ) {
    // Non-master PEs still bind the device (needed for PME), then return.
    if ( CmiPhysicalNodeID(masterPe) < 2 )
      CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n",
               CkMyPe(), myRankInPhysicalNode, masterPe);
    cudaCheck(cudaSetDevice(dev));
    return;
  }
  // Record the master PE for this device in the node-wide list.
  if ( dev >= MAX_NUM_DEVICES )
    NAMD_die("Maximum number of CUDA devices (256) per node exceeded");
  masterPeList[dev] = masterPe + 1;  // array is pre-initialized to zeros
  firstPeSharingGpu = CkMyPe();
  nextPeSharingGpu = CkMyPe();

  gpuIsMine = ( firstPeSharingGpu == CkMyPe() );
  if ( dev >= deviceCount ) {
    char buf[256];
    sprintf(buf, "Pe %d unable to bind to CUDA device %d on %s because only %d devices are present",
            CkMyPe(), dev, host, deviceCount);
    NAMD_die(buf);
  }
  cudaDeviceProp deviceProp;
  cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
  if ( CmiPhysicalNodeID(masterPe) < 2 )
    CkPrintf("Pe %d physical rank %d binding to CUDA device %d on %s: '%s' Mem: %luMB Rev: %d.%d PCI: %x:%x:%x\n",
             CkMyPe(), myRankInPhysicalNode, dev, host,
             deviceProp.name,
             (unsigned long) (deviceProp.totalGlobalMem / (1024*1024)),
             deviceProp.major, deviceProp.minor,
             deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
  cudaCheck(cudaSetDevice(dev));

  // cudaSetDeviceFlags fails with cudaErrorSetOnActiveProcess if the device
  // has already been initialized; in that case the error is simply cleared.
  cudaError_t cudaSetDeviceFlags_cudaDeviceMapHost = cudaSetDeviceFlags(cudaDeviceMapHost);
  if ( cudaSetDeviceFlags_cudaDeviceMapHost == cudaErrorSetOnActiveProcess ) {
    cudaGetLastError();
  } else {
    cudaCheck(cudaSetDeviceFlags_cudaDeviceMapHost);
  }
  {
    cudaDeviceProp deviceProp;
    cudaCheck(cudaGetDeviceProperties(&deviceProp, dev));
    if ( deviceProp.computeMode == cudaComputeModeProhibited )
      cudaDie("device in prohibited mode");
    if ( deviceProp.major < 3 )
      cudaDie("device not of compute capability 3.0 or higher");
    if ( ! deviceProp.canMapHostMemory )
      cudaDie("device cannot map host memory");
  }
#if NODEGROUP_FORCE_REGISTER
  {
    // Re-derive the PME-device bookkeeping for the device actually bound.
    bool contained = false;
    for( int i = 0; i < ndevices; i++){
      if (!contained) {
        contained = devices[i] == pmeDevice;
        pmeDeviceIndex = (contained) ? i : -1;
      }
      if(deviceID == devices[i]) deviceIndex = i;
    }
    if(!contained && CkMyPe() == 0)
      cudaDie("device specified for PME is not contained in +devices!");
    isPmeDevice = pmeDevice == deviceID;
    masterDevice = devices[0];
    isMasterDevice = deviceID == masterDevice;

    if (pmeDeviceIndex != 0 && pmePes != -1) {
      NAMD_die("PME device must be index 0 if pmePes is set");
    }
  }
#endif
// Referenced declarations (defined elsewhere in the source):
//   void cudaDie(const char *msg, cudaError_t err);
//   void NAMD_die(const char *err_msg);
//   int masterPeList[MAX_NUM_DEVICES];
//   int deviceIDList[MAX_NUM_RANKS];
//   static __thread cuda_args_t cuda_args;