Skip to content

Commit

Permalink
Cmds offer-template and test failing when no GPU on non dummy runtime. (
Browse files Browse the repository at this point in the history
…#92)

Co-authored-by: Przemysław Walski <[email protected]>
  • Loading branch information
pwalski and pwalski authored Apr 5, 2024
1 parent 4c51b54 commit d568370
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 54 deletions.
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions gpu-detection/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@ edition = "2021"
readme = "README.md"

[dependencies]
anyhow = "1.0"
nvml-wrapper = "0.10"
serde = "1.0"
log = "0.4.21"
thiserror = "1.0.58"
libloading = "0.8.3"
73 changes: 39 additions & 34 deletions gpu-detection/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use anyhow::{bail, Context};
use model::{Clocks, Cuda, Gpu, Memory};
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::{enum_wrappers::device::Clock, Device, Nvml};
Expand All @@ -8,9 +7,13 @@ pub mod model;

#[derive(Error, Debug)]
pub enum GpuDetectionError {
#[error("a libloading error occurred: {0}")]
#[error("libloading error occurred: {0}")]
LibloadingError(#[from] libloading::Error),
#[error("an unknown error occurred: {0}")]
#[error("Failed to access GPU error: {0}")]
GpuAccessError(String),
#[error("Failed to access GPU info error: {0}")]
GpuInfoAccessError(String),
#[error("NVML error occurred: {0}")]
Unknown(String),
}

Expand All @@ -19,48 +22,53 @@ pub struct GpuDetection {
}

impl GpuDetection {
pub fn init() -> anyhow::Result<Self, GpuDetectionError> {
pub fn init() -> Result<Self, GpuDetectionError> {
let nvml = match Nvml::init() {
Ok(nvlm) => nvlm,
Err(NvmlError::LibloadingError(e)) => {
log::error!("GpuDetection library loading failed: {}", e);
return Err(GpuDetectionError::LibloadingError(e));
}
Err(e) => {
log::error!("GpuDetection init failed: {}", e);
return Err(GpuDetectionError::Unknown(e.to_string()));
return Err(GpuDetectionError::LibloadingError(e))
}
Err(e) => return Err(GpuDetectionError::Unknown(e.to_string())),
};
Ok(Self { nvml })
}

/// `uuid` of GPU device. If not provided first available GPU device will be used.
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> anyhow::Result<Gpu> {
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> Result<Gpu, GpuDetectionError> {
if let Some(uuid) = uuid {
let dev = self.nvml.device_by_uuid(uuid.as_ref()).with_context(|| {
format!("Failed to get GPU device with UUID: {}.", uuid.as_ref())
let dev = self.nvml.device_by_uuid(uuid.as_ref()).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device with UUID: {}. Err {}",
uuid.as_ref(),
err
))
})?;
return self.device_info(dev);
return self
.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()));
};

let gpu_count = self
.nvml
.device_count()
.context("Unable to get count of CUDA devices.")?;
let gpu_count = self.nvml.device_count().map_err(|err| {
GpuDetectionError::Unknown(format!("Failed to get device count. Err {}", err))
})?;

if gpu_count == 0 {
bail!("No supported GPU device available.")
return Err(GpuDetectionError::GpuAccessError("No GPU available".into()));
}

let dev = self
.nvml
.device_by_index(0)
.context("Failed to get GPU device.")?;
let index = 0;
let dev = self.nvml.device_by_index(index).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device under index: {}. Err {}",
index, err
))
})?;

self.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()))
}

fn device_info(&self, dev: Device) -> anyhow::Result<Gpu> {
fn device_info(&self, dev: Device) -> Result<Gpu, NvmlError> {
let model = dev.name()?;
let version = self.cuda_version()?;
let cuda = cuda(&dev, version)?;
Expand All @@ -74,18 +82,15 @@ impl GpuDetection {
})
}

fn cuda_version(&self) -> anyhow::Result<String> {
let version = self
.nvml
.sys_cuda_driver_version()
.context("Unable to get driver version")?;
fn cuda_version(&self) -> Result<String, NvmlError> {
let version = self.nvml.sys_cuda_driver_version()?;
let version_major = nvml_wrapper::cuda_driver_version_major(version);
let version_minor = nvml_wrapper::cuda_driver_version_minor(version);
Ok(format!("{}.{}", version_major, version_minor))
}
}

fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
fn cuda(dev: &Device, version: String) -> Result<Cuda, NvmlError> {
let enabled = true;
let cores = dev.num_cores()?;
let compute_capability = compute_capability(dev)?;
Expand All @@ -97,12 +102,12 @@ fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
})
}

fn compute_capability(dev: &Device) -> anyhow::Result<String> {
fn compute_capability(dev: &Device) -> Result<String, NvmlError> {
let capability = dev.cuda_compute_capability()?;
Ok(format!("{}.{}", capability.major, capability.minor))
}

fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
fn clocks(dev: &Device) -> Result<Clocks, NvmlError> {
let graphics_mhz = dev.max_clock_info(Clock::Graphics)?;
let memory_mhz = dev.max_clock_info(Clock::Memory)?;
let sm_mhz = dev.max_clock_info(Clock::SM)?;
Expand All @@ -115,7 +120,7 @@ fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
})
}

fn memory(dev: &Device) -> anyhow::Result<Memory> {
fn memory(dev: &Device) -> Result<Memory, NvmlError> {
let total_bytes = dev.memory_info()?.total;
let total_gib = bytes_to_gib(total_bytes);
Ok(Memory {
Expand All @@ -126,7 +131,7 @@ fn memory(dev: &Device) -> anyhow::Result<Memory> {

/// Unused because of lack of `memTransferRatemax` property.
#[allow(dead_code)]
fn bandwidth_gib(dev: &Device) -> anyhow::Result<u32> {
fn bandwidth_gib(dev: &Device) -> Result<u32, NvmlError> {
let memory_bus_width = dev.memory_bus_width()?;
let supported_memory_clocks = dev.supported_memory_clocks()?;
let max_memory_clock = supported_memory_clocks.iter().cloned().fold(0, u32::max);
Expand Down
14 changes: 4 additions & 10 deletions src/offer_template.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
use crate::process::RuntimeConfig;

use gpu_detection::model::Gpu;
use gpu_detection::GpuDetectionError;
use gpu_detection::GpuDetection;
use ya_agreement_utils::OfferTemplate;

pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Option<Gpu>> {
match gpu_detection::GpuDetection::init() {
Ok(gpu_detection) => {
let gpu = gpu_detection.detect(config.gpu_uuid())?;
Ok(Some(gpu))
}
Err(GpuDetectionError::LibloadingError(_)) => Ok(None),
Err(e) => Err(e.into()),
}
pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Gpu> {
let gpu_detection = GpuDetection::init()?;
Ok(gpu_detection.detect(config.gpu_uuid())?)
}

pub(crate) fn template<CONFIG: RuntimeConfig>(_config: &CONFIG) -> anyhow::Result<OfferTemplate> {
Expand Down
10 changes: 4 additions & 6 deletions src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,10 @@ pub(crate) trait Runtime: Sized {

fn offer_template(config: &Self::CONFIG) -> anyhow::Result<OfferTemplate> {
let mut template = offer_template::template(config)?;
if let Some(gpu) = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?
{
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
}
let gpu = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?;
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
Ok(template)
}
}
Expand Down

0 comments on commit d568370

Please sign in to comment.