Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Commands `offer-template` and `test` failing when there is no GPU on a non-dummy runtime. #92

Merged
merged 1 commit into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions gpu-detection/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@ edition = "2021"
readme = "README.md"

[dependencies]
anyhow = "1.0"
nvml-wrapper = "0.10"
serde = "1.0"
log = "0.4.21"
thiserror = "1.0.58"
libloading = "0.8.3"
73 changes: 39 additions & 34 deletions gpu-detection/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use anyhow::{bail, Context};
use model::{Clocks, Cuda, Gpu, Memory};
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::{enum_wrappers::device::Clock, Device, Nvml};
Expand All @@ -8,9 +7,13 @@ pub mod model;

#[derive(Error, Debug)]
pub enum GpuDetectionError {
#[error("a libloading error occurred: {0}")]
#[error("libloading error occurred: {0}")]
LibloadingError(#[from] libloading::Error),
#[error("an unknown error occurred: {0}")]
#[error("Failed to access GPU error: {0}")]
GpuAccessError(String),
#[error("Failed to access GPU info error: {0}")]
GpuInfoAccessError(String),
#[error("NVML error occurred: {0}")]
Unknown(String),
}

Expand All @@ -19,48 +22,53 @@ pub struct GpuDetection {
}

impl GpuDetection {
pub fn init() -> anyhow::Result<Self, GpuDetectionError> {
pub fn init() -> Result<Self, GpuDetectionError> {
let nvml = match Nvml::init() {
Ok(nvlm) => nvlm,
Err(NvmlError::LibloadingError(e)) => {
log::error!("GpuDetection library loading failed: {}", e);
return Err(GpuDetectionError::LibloadingError(e));
}
Err(e) => {
log::error!("GpuDetection init failed: {}", e);
return Err(GpuDetectionError::Unknown(e.to_string()));
return Err(GpuDetectionError::LibloadingError(e))
}
Err(e) => return Err(GpuDetectionError::Unknown(e.to_string())),
};
Ok(Self { nvml })
}

/// `uuid` of GPU device. If not provided first available GPU device will be used.
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> anyhow::Result<Gpu> {
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> Result<Gpu, GpuDetectionError> {
if let Some(uuid) = uuid {
let dev = self.nvml.device_by_uuid(uuid.as_ref()).with_context(|| {
format!("Failed to get GPU device with UUID: {}.", uuid.as_ref())
let dev = self.nvml.device_by_uuid(uuid.as_ref()).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device with UUID: {}. Err {}",
uuid.as_ref(),
err
))
})?;
return self.device_info(dev);
return self
.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()));
};

let gpu_count = self
.nvml
.device_count()
.context("Unable to get count of CUDA devices.")?;
let gpu_count = self.nvml.device_count().map_err(|err| {
GpuDetectionError::Unknown(format!("Failed to get device count. Err {}", err))
})?;

if gpu_count == 0 {
bail!("No supported GPU device available.")
return Err(GpuDetectionError::GpuAccessError("No GPU available".into()));
}

let dev = self
.nvml
.device_by_index(0)
.context("Failed to get GPU device.")?;
let index = 0;
let dev = self.nvml.device_by_index(index).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device under index: {}. Err {}",
index, err
))
})?;

self.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()))
}

fn device_info(&self, dev: Device) -> anyhow::Result<Gpu> {
fn device_info(&self, dev: Device) -> Result<Gpu, NvmlError> {
let model = dev.name()?;
let version = self.cuda_version()?;
let cuda = cuda(&dev, version)?;
Expand All @@ -74,18 +82,15 @@ impl GpuDetection {
})
}

fn cuda_version(&self) -> anyhow::Result<String> {
let version = self
.nvml
.sys_cuda_driver_version()
.context("Unable to get driver version")?;
fn cuda_version(&self) -> Result<String, NvmlError> {
let version = self.nvml.sys_cuda_driver_version()?;
let version_major = nvml_wrapper::cuda_driver_version_major(version);
let version_minor = nvml_wrapper::cuda_driver_version_minor(version);
Ok(format!("{}.{}", version_major, version_minor))
}
}

fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
fn cuda(dev: &Device, version: String) -> Result<Cuda, NvmlError> {
let enabled = true;
let cores = dev.num_cores()?;
let compute_capability = compute_capability(dev)?;
Expand All @@ -97,12 +102,12 @@ fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
})
}

fn compute_capability(dev: &Device) -> anyhow::Result<String> {
fn compute_capability(dev: &Device) -> Result<String, NvmlError> {
let capability = dev.cuda_compute_capability()?;
Ok(format!("{}.{}", capability.major, capability.minor))
}

fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
fn clocks(dev: &Device) -> Result<Clocks, NvmlError> {
let graphics_mhz = dev.max_clock_info(Clock::Graphics)?;
let memory_mhz = dev.max_clock_info(Clock::Memory)?;
let sm_mhz = dev.max_clock_info(Clock::SM)?;
Expand All @@ -115,7 +120,7 @@ fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
})
}

fn memory(dev: &Device) -> anyhow::Result<Memory> {
fn memory(dev: &Device) -> Result<Memory, NvmlError> {
let total_bytes = dev.memory_info()?.total;
let total_gib = bytes_to_gib(total_bytes);
Ok(Memory {
Expand All @@ -126,7 +131,7 @@ fn memory(dev: &Device) -> anyhow::Result<Memory> {

/// Unused because of lack of `memTransferRatemax` property.
#[allow(dead_code)]
fn bandwidth_gib(dev: &Device) -> anyhow::Result<u32> {
fn bandwidth_gib(dev: &Device) -> Result<u32, NvmlError> {
let memory_bus_width = dev.memory_bus_width()?;
let supported_memory_clocks = dev.supported_memory_clocks()?;
let max_memory_clock = supported_memory_clocks.iter().cloned().fold(0, u32::max);
Expand Down
14 changes: 4 additions & 10 deletions src/offer_template.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
use crate::process::RuntimeConfig;

use gpu_detection::model::Gpu;
use gpu_detection::GpuDetectionError;
use gpu_detection::GpuDetection;
use ya_agreement_utils::OfferTemplate;

pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Option<Gpu>> {
match gpu_detection::GpuDetection::init() {
Ok(gpu_detection) => {
let gpu = gpu_detection.detect(config.gpu_uuid())?;
Ok(Some(gpu))
}
Err(GpuDetectionError::LibloadingError(_)) => Ok(None),
Err(e) => Err(e.into()),
}
pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Gpu> {
let gpu_detection = GpuDetection::init()?;
Ok(gpu_detection.detect(config.gpu_uuid())?)
}

pub(crate) fn template<CONFIG: RuntimeConfig>(_config: &CONFIG) -> anyhow::Result<OfferTemplate> {
Expand Down
10 changes: 4 additions & 6 deletions src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,10 @@ pub(crate) trait Runtime: Sized {

fn offer_template(config: &Self::CONFIG) -> anyhow::Result<OfferTemplate> {
let mut template = offer_template::template(config)?;
if let Some(gpu) = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?
{
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
}
let gpu = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?;
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
Ok(template)
}
}
Expand Down