| version 1.3 | version 1.4 |
|---|---|
| | |
| } | } |
| // ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| | |
| | // Node-wide list of device IDs for every rank |
| | #define MAX_NUM_RANKS 2048 |
| | int deviceIDList[MAX_NUM_RANKS]; |
| | // Node-wide list of master PEs for every device ID |
| | #define MAX_NUM_DEVICES 256 |
| | int masterPeList[MAX_NUM_DEVICES]; |
| | |
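The two tables above are node-wide: `deviceIDList` maps each rank on the node to the CUDA device it was assigned, and `masterPeList` maps each device ID back to the PE that owns it. Below is a minimal standalone sketch of that relationship (plain C++, no Charm++; the simulated rank and device counts are illustrative, not NAMD's):

```cpp
// Standalone sketch of the two node-wide tables: each rank records its
// device ID, and each device ID records the "master" PE that owns it.
// Names mirror the diff; the round-robin assignment is illustrative only.
#include <cstdio>

#define MAX_NUM_RANKS 2048
#define MAX_NUM_DEVICES 256

static int deviceIDList[MAX_NUM_RANKS];   // rank -> device ID
static int masterPeList[MAX_NUM_DEVICES]; // device ID -> master PE (-1 if unused)

int main() {
  const int numRanks = 4, numDevices = 2;
  for (int i = 0; i < MAX_NUM_DEVICES; i++) masterPeList[i] = -1;
  // Round-robin ranks onto devices; the first rank on a device becomes master.
  for (int rank = 0; rank < numRanks; rank++) {
    int dev = rank % numDevices;
    deviceIDList[rank] = dev;
    if (masterPeList[dev] == -1) masterPeList[dev] = rank;
  }
  for (int rank = 0; rank < numRanks; rank++)
    printf("rank %d -> device %d (master PE %d)\n",
           rank, deviceIDList[rank], masterPeList[deviceIDList[rank]]);
  return 0;
}
```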
| // ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| // ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| // ------------------------------------------------------------------------------------------------- | // ------------------------------------------------------------------------------------------------- |
| | |
| // | // |
| // Class creator | // Class creator |
| // | // |
| DeviceCUDA::DeviceCUDA() {} | DeviceCUDA::DeviceCUDA() : deviceProps(NULL), devices(NULL) {} |
| | |
| // | // |
| // Initialize device | // Initialize device |
| ... | ... |
| } | } |
| // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode); | // CkPrintf("Pe %d ranks %d in physical node\n",CkMyPe(),myRankInPhysicalNode); |
| | |
| int deviceCount = 0; | deviceCount = 0; |
| cudaCheck(cudaGetDeviceCount(&deviceCount)); | cudaCheck(cudaGetDeviceCount(&deviceCount)); |
| if ( deviceCount <= 0 ) { | if ( deviceCount <= 0 ) { |
| cudaDie("No CUDA devices found."); | cudaDie("No CUDA devices found."); |
| } | } |
| | |
| int *devices; | // Store all device props |
| int ndevices = 0; | deviceProps = new cudaDeviceProp[deviceCount]; |
| | for ( int i=0; i<deviceCount; ++i ) { |
| | cudaCheck(cudaGetDeviceProperties(&deviceProps[i], i)); |
| | } |
| | |
| | ndevices = 0; |
| int nexclusive = 0; | int nexclusive = 0; |
| if ( usedevicelist ) { | if ( usedevicelist ) { |
| devices = new int[strlen(devicelist)]; | devices = new int[strlen(devicelist)]; |
| ... | ... |
| numPesSharingDevice = 1; | numPesSharingDevice = 1; |
| } | } |
| | |
| | deviceID = dev; |
| | |
| | // Store device IDs to node-wide list |
| | if (CkMyRank() >= MAX_NUM_RANKS) |
| | NAMD_die("Maximum number of ranks (2048) per node exceeded"); |
| | deviceIDList[CkMyRank()] = deviceID; |
| | |
| if ( masterPe != CkMyPe() ) { | if ( masterPe != CkMyPe() ) { |
| if ( CmiPhysicalNodeID(masterPe) < 2 ) | if ( CmiPhysicalNodeID(masterPe) < 2 ) |
| CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n", | CkPrintf("Pe %d physical rank %d will use CUDA device of pe %d\n", |
| ... | ... |
| return; | return; |
| } | } |
| | |
| | // Store master PEs for every device ID to node-wide list |
| | if (deviceID >= MAX_NUM_DEVICES) |
| | NAMD_die("Maximum number of CUDA devices (256) per node exceeded"); |
| | masterPeList[deviceID] = masterPe; |
| | // Set masterPe values to -1 for devices that do not exist. |
| | // Only master Pe with deviceID == devices[0] does the writing |
| | if (deviceID == devices[0]) { |
| | // For device IDs 0...deviceCount-1, check whether each appears in devices[0...ndevices-1] |
| | for (int i=0;i < deviceCount;i++) { |
| | bool deviceOK = false; |
| | for (int j=0;j < ndevices;j++) { |
| | if (devices[j] == i) deviceOK = true; |
| | } |
| | if (!deviceOK) masterPeList[i] = -1; |
| | } |
| | // Device IDs deviceCount ... MAX_NUM_DEVICES-1 are not possible, just set them to -1 |
| | for (int i=deviceCount;i < MAX_NUM_DEVICES;i++) { |
| | masterPeList[i] = -1; |
| | } |
| | } |
| | |
| // disable token-passing but don't submit local until remote finished | // disable token-passing but don't submit local until remote finished |
| // if shared_gpu is true, otherwise submit all work immediately | // if shared_gpu is true, otherwise submit all work immediately |
| firstPeSharingGpu = CkMyPe(); | firstPeSharingGpu = CkMyPe(); |
| ... | ... |
| | |
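The comment above describes the submission policy for a shared device: with `shared_gpu` set, local work is held back until remote work completes; otherwise everything is submitted immediately. A hedged sketch of that gating logic follows (names such as `trySubmitLocal` and the flags are hypothetical, not NAMD's API):

```cpp
// Sketch of the work-submission policy described in the diff's comment,
// assuming a boolean shared_gpu and the usual remote-before-local ordering.
#include <cstdio>

bool shared_gpu = true;   // set when several PEs share one device
bool remoteDone = false;  // set when remote work has completed

void submitLocal() { printf("local work submitted\n"); }

void trySubmitLocal() {
  // With a shared GPU, hold local work until remote work finishes so the
  // other PEs' remote results are not delayed; otherwise submit at once.
  if (!shared_gpu || remoteDone) submitLocal();
}

int main() {
  trySubmitLocal();  // shared GPU, remote pending: nothing happens
  remoteDone = true;
  trySubmitLocal();  // remote finished: local work goes in
  return 0;
}
```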
| } // just let CUDA pick a device for us | } // just let CUDA pick a device for us |
| | |
| | { |
| cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost)); | cudaCheck(cudaSetDeviceFlags(cudaDeviceMapHost)); |
| | |
| int dev; | int dev; |
| cudaCheck(cudaGetDevice(&dev)); | cudaCheck(cudaGetDevice(&dev)); |
| | deviceID = dev; |
| cudaDeviceProp deviceProp; | cudaDeviceProp deviceProp; |
| cudaCheck(cudaGetDeviceProperties(&deviceProp, dev)); | cudaCheck(cudaGetDeviceProperties(&deviceProp, dev)); |
| if ( deviceProp.computeMode == cudaComputeModeProhibited ) | if ( deviceProp.computeMode == cudaComputeModeProhibited ) |
| ... | ... |
| cudaDie("device not of compute capability 1.1 or higher"); | cudaDie("device not of compute capability 1.1 or higher"); |
| if ( ! deviceProp.canMapHostMemory ) | if ( ! deviceProp.canMapHostMemory ) |
| cudaDie("device cannot map host memory"); | cudaDie("device cannot map host memory"); |
| | #ifndef DISABLE_CUDA_TEXTURE_OBJECTS |
| | if (deviceProp.major < 3) |
| | cudaDie("CUDA texture objects require compute capability 3.0 or higher.\nUse DISABLE_CUDA_TEXTURE_OBJECTS to disable texture objects."); |
| | #endif |
| | extern int read_CUDA_ARCH(); |
| | cuda_arch = read_CUDA_ARCH(); |
| | } |
| } | } |
| | |
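Version 1.4's initializer also gates on compute capability when texture objects are enabled. A standalone sketch of the same checks against the CUDA runtime (compile with nvcc; the simplified error handling stands in for NAMD's `cudaCheck`/`cudaDie` wrappers):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void check(cudaError_t err, const char *msg) {
  if (err != cudaSuccess) {
    fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
    exit(1);
  }
}

int main() {
  int dev = 0;
  check(cudaGetDevice(&dev), "cudaGetDevice");
  cudaDeviceProp prop;
  check(cudaGetDeviceProperties(&prop, dev), "cudaGetDeviceProperties");
  if (prop.computeMode == cudaComputeModeProhibited) {
    fprintf(stderr, "device in prohibited compute mode\n");
    return 1;
  }
  if (!prop.canMapHostMemory) {
    fprintf(stderr, "device cannot map host memory\n");
    return 1;
  }
  // Texture objects (used unless DISABLE_CUDA_TEXTURE_OBJECTS is defined)
  // require compute capability 3.0 or higher.
  if (prop.major < 3) {
    fprintf(stderr, "compute %d.%d < 3.0: texture objects unavailable\n",
            prop.major, prop.minor);
    return 1;
  }
  printf("device %d OK (compute %d.%d)\n", dev, prop.major, prop.minor);
  return 0;
}
```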
| // | // |
| // Class destructor | // Class destructor |
| // | // |
| DeviceCUDA::~DeviceCUDA() { | DeviceCUDA::~DeviceCUDA() { |
| | if (deviceProps != NULL) delete [] deviceProps; |
| | if (devices != NULL) delete [] devices; |
| delete [] pesSharingDevice; | delete [] pesSharingDevice; |
| } | } |
| | |
| // | // |
| | // Returns the device ID for pe. Assumes all nodes have the same device configuration |
| | // |
| | int DeviceCUDA::getDeviceIDforPe(int pe) { |
| | return deviceIDList[CkRankOf(pe) % CkMyNodeSize()]; |
| | } |
| | |
| | // |
| | // Returns master PE for the device ID, or -1 if device not found |
| | // |
| | int DeviceCUDA::getMasterPeForDeviceID(int deviceID) { |
| | return masterPeList[deviceID % deviceCount]; |
| | } |
| | |
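The two new accessors compose: to find which PE owns the device some other PE is using, look up the device first, then its master. A standalone sketch reusing the node-wide tables from the first example (plain C++; `masterForRank` is a hypothetical helper, and real NAMD indexes `deviceIDList` by `CkRankOf(pe) % CkMyNodeSize()`):

```cpp
#include <cstdio>

#define MAX_NUM_RANKS 2048
#define MAX_NUM_DEVICES 256

static int deviceIDList[MAX_NUM_RANKS];   // rank -> device ID
static int masterPeList[MAX_NUM_DEVICES]; // device ID -> master PE, -1 if absent

// Two-step lookup mirroring getDeviceIDforPe + getMasterPeForDeviceID.
int masterForRank(int rank) {
  int dev = deviceIDList[rank];
  return masterPeList[dev]; // -1 means the device does not exist
}

int main() {
  deviceIDList[0] = 0; deviceIDList[1] = 0; deviceIDList[2] = 1;
  masterPeList[0] = 0; masterPeList[1] = 2;
  printf("rank 1's device is owned by PE %d\n", masterForRank(1)); // PE 0
  return 0;
}
```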
| | // |
| // Returns true if process "pe" shares this device | // Returns true if process "pe" shares this device |
| // | // |
| bool DeviceCUDA::device_shared_with_pe(int pe) { | bool DeviceCUDA::device_shared_with_pe(int pe) { |
| ... | ... |
| return ( numPesOnNodeSharingDevice == CkMyNodeSize() ); | return ( numPesOnNodeSharingDevice == CkMyNodeSize() ); |
| } | } |
| | |
| | int DeviceCUDA::getMaxNumThreads() { |
| | int dev; |
| | cudaCheck(cudaGetDevice(&dev)); |
| | return deviceProps[dev].maxThreadsPerBlock; |
| | } |
| | |
| | int DeviceCUDA::getMaxNumBlocks() { |
| | int dev; |
| | cudaCheck(cudaGetDevice(&dev)); |
| | return deviceProps[dev].maxGridSize[0]; |
| | } |
| | |
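`getMaxNumThreads()` and `getMaxNumBlocks()` expose the cached `deviceProps` limits so kernel launches can be sized without re-querying the driver each time. A sketch of that use (CUDA C++; the `fill` kernel and the sizing logic are illustrative, and error checking is omitted for brevity):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill(float *x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] = 1.0f;
}

int main() {
  int dev = 0;
  cudaGetDevice(&dev);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, dev);  // NAMD caches this once per device

  const int n = 1 << 20;
  int threads = prop.maxThreadsPerBlock < 256 ? prop.maxThreadsPerBlock : 256;
  int blocks = (n + threads - 1) / threads;
  if (blocks > prop.maxGridSize[0]) blocks = prop.maxGridSize[0]; // clamp like getMaxNumBlocks()

  float *x;
  cudaMalloc(&x, n * sizeof(float));
  fill<<<blocks, threads>>>(x, n);
  cudaDeviceSynchronize();
  printf("launched %d blocks of %d threads\n", blocks, threads);
  cudaFree(x);
  return 0;
}
```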
| /* | /* |
| BASE | BASE |
| 2 types (remote & local) | 2 types (remote & local) |