version 1.3 | version 1.4 |
---|
| |
} | } |
// ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| |
| // Node-wide list of device IDs for every rank |
| #define MAX_NUM_RANKS 2048 |
| int deviceIDList[MAX_NUM_RANKS]; |
| // Node-wide list of master PEs for every device ID |
| #define MAX_NUM_DEVICES 256 |
| int masterPeList[MAX_NUM_DEVICES]; |
| |
// ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
// ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
// ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| |
// | // |
// Class creator | // Class creator |
// | // |
DeviceCUDA::DeviceCUDA() {} | DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {} |
| |
// | // |
// Initialize device | // Initialize device |
| |
} | } |
// CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode); | // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode); |
| |
int deviceCount = 0; | deviceCount = 0; |
cudaCheck(cudaGetDeviceCount(&deviceCount)); | cudaCheck(cudaGetDeviceCount(&deviceCount)); |
if ( deviceCount <= 0 ) { | if ( deviceCount <= 0 ) { |
cudaDie("No CUDA devices found."); | cudaDie("No CUDA devices found."); |
} | } |
| |
int *devices; | // Store all device props |
int ndevices = 0; | deviceProps = new cudaDeviceProp[deviceCount]; |
| for ( int i=0; i<deviceCount; ++i ) { |
| cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i)); |
| } |
| |
| ndevices = 0; |
int nexclusive = 0; | int nexclusive = 0; |
if ( usedevicelist ) { | if ( usedevicelist ) { |
devices = new int[strlen(devicelist)]; | devices = new int[strlen(devicelist)]; |
| |
numPesSharingDevice = 1; | numPesSharingDevice = 1; |
} | } |
| |
| deviceID = dev; |
| |
| // Store device IDs to node-wide list |
| if (CkMyRank() >= MAX_NUM_RANKS) |
| NAMD_die("Maximum number of ranks (2048) per node exceeded"); |
| deviceIDList[CkMyRank()] = deviceID; |
| |
if ( masterPe != CkMyPe() ) { | if ( masterPe != CkMyPe() ) { |
if ( CmiPhysicalNodeID(masterPe) < 2 ) | if ( CmiPhysicalNodeID(masterPe) < 2 ) |
CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n", | CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n", |
| |
return; | return; |
} | } |
| |
| // Store master PEs for every device ID to node-wide list |
| if (CkMyRank() >= MAX_NUM_DEVICES) |
| NAMD_die("Maximum number of CUDA devices (256) per node exceeded"); |
| masterPeList[deviceID] = masterPe; |
| // Set masterPe values to -1 for devices that do not exist. |
| // Only master Pe with deviceID == devices[0] does the writing |
| if (deviceID == devices[0]) { |
| // For device IDs 0...deviceCount-1, check if it is in the devices[0...deviceCount-1] |
| for (int i=0;i < deviceCount;i++) { |
| bool deviceOK = false; |
| for (int j=0;j < deviceCount;j++) { |
| if (devices[j] == i) deviceOK = true; |
| } |
| if (!deviceOK) masterPeList[i] = -1; |
| } |
| // Device IDs deviceCount ... MAX_NUM_DEVICES-1 are not possible, just set them to -1 |
| for (int i=deviceCount;i < MAX_NUM_DEVICES;i++) { |
| masterPeList[i] = -1; |
| } |
| } |
| |
// disable token-passing but don't submit local until remote finished | // disable token-passing but don't submit local until remote finished |
// if shared_gpu is true, otherwise submit all work immediately | // if shared_gpu is true, otherwise submit all work immediately |
firstPeSharingGpu = CkMyPe(); | firstPeSharingGpu = CkMyPe(); |
| |
| |
} // just let CUDA pick a device for us | } // just let CUDA pick a device for us |
| |
| { |
cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost)); | cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost)); |
| |
int dev; | int dev; |
cudaCheck(cudaGetDevice(&dev)); | cudaCheck(cudaGetDevice(&dev)); |
| deviceID = dev; |
cudaDeviceProp deviceProp; | cudaDeviceProp deviceProp; |
cudaCheck(cudaGetDeviceProperties(&deviceProp, dev)); | cudaCheck(cudaGetDeviceProperties(&deviceProp, dev)); |
if ( deviceProp.computeMode == cudaComputeModeProhibited ) | if ( deviceProp.computeMode == cudaComputeModeProhibited ) |
| |
cudaDie("device not of compute capability 1.1 or higher"); | cudaDie("device not of compute capability 1.1 or higher"); |
if ( ! deviceProp.canMapHostMemory ) | if ( ! deviceProp.canMapHostMemory ) |
cudaDie("device cannot map host memory"); | cudaDie("device cannot map host memory"); |
| #ifndef DISABLE_CUDA_TEXTURE_OBJECTS |
| if (deviceProp.major < 3) |
| cudaDie("CUDA texture objects require compute capability 3.0 or higher.\nUse DISABLE_CUDA_TEXTURE_OBJECTS to disable texture objects."); |
| #endif |
| extern int read_CUDA_ARCH(); |
| cuda_arch = read_CUDA_ARCH(); |
| } |
} | } |
| |
// | // |
// Class destructor | // Class destructor |
// | // |
DeviceCUDA::~DeviceCUDA() { | DeviceCUDA::~DeviceCUDA() { |
| if (deviceProps != NULL) delete [] deviceProps; |
| if (devices != NULL) delete [] devices; |
delete [] pesSharingDevice; | delete [] pesSharingDevice; |
} | } |
| |
// | // |
| // Return device ID for pe. Assumes all nodes are the same |
| // |
| int DeviceCUDA::getDeviceIDforPe(int pe) { |
| return deviceIDList[CkRankOf(pe) % CkMyNodeSize()]; |
| } |
| |
| // |
| // Returns master PE for the device ID, or -1 if device not found |
| // |
| int DeviceCUDA::getMasterPeForDeviceID(int deviceID) { |
| return masterPeList[deviceID % deviceCount]; |
| } |
| |
| // |
// Returns true if process "pe" shares this device | // Returns true if process "pe" shares this device |
// | // |
bool DeviceCUDA::device_shared_with_pe(int pe) { | bool DeviceCUDA::device_shared_with_pe(int pe) { |
| |
return ( numPesOnNodeSharingDevice == CkMyNodeSize() ); | return ( numPesOnNodeSharingDevice == CkMyNodeSize() ); |
} | } |
| |
| int DeviceCUDA::getMaxNumThreads() { |
| int dev; |
| cudaCheck(cudaGetDevice(&dev)); |
| return deviceProps[dev].maxThreadsPerBlock; |
| } |
| |
| int DeviceCUDA::getMaxNumBlocks() { |
| int dev; |
| cudaCheck(cudaGetDevice(&dev)); |
| return deviceProps[dev].maxGridSize[0]; |
| } |
| |
/* | /* |
BASE | BASE |
2 types (remote & local) | 2 types (remote & local) |