Skip to content

Commit

Permalink
Cmds offer-template and test failing when no GPU on non dummy runtime. (
Browse files Browse the repository at this point in the history
…#92)

Co-authored-by: Przemysław Walski <[email protected]>
  • Loading branch information
pwalski and pwalski authored Apr 5, 2024
1 parent 4c51b54 commit d568370
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 54 deletions.
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions gpu-detection/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@ edition = "2021"
readme = "README.md"

[dependencies]
anyhow = "1.0"
nvml-wrapper = "0.10"
serde = "1.0"
log = "0.4.21"
thiserror = "1.0.58"
libloading = "0.8.3"
73 changes: 39 additions & 34 deletions gpu-detection/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use anyhow::{bail, Context};
use model::{Clocks, Cuda, Gpu, Memory};
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::{enum_wrappers::device::Clock, Device, Nvml};
Expand All @@ -8,9 +7,13 @@ pub mod model;

#[derive(Error, Debug)]
pub enum GpuDetectionError {
#[error("a libloading error occurred: {0}")]
#[error("libloading error occurred: {0}")]
LibloadingError(#[from] libloading::Error),
#[error("an unknown error occurred: {0}")]
#[error("Failed to access GPU error: {0}")]
GpuAccessError(String),
#[error("Failed to access GPU info error: {0}")]
GpuInfoAccessError(String),
#[error("NVML error occurred: {0}")]
Unknown(String),
}

Expand All @@ -19,48 +22,53 @@ pub struct GpuDetection {
}

impl GpuDetection {
pub fn init() -> anyhow::Result<Self, GpuDetectionError> {
pub fn init() -> Result<Self, GpuDetectionError> {
let nvml = match Nvml::init() {
Ok(nvlm) => nvlm,
Err(NvmlError::LibloadingError(e)) => {
log::error!("GpuDetection library loading failed: {}", e);
return Err(GpuDetectionError::LibloadingError(e));
}
Err(e) => {
log::error!("GpuDetection init failed: {}", e);
return Err(GpuDetectionError::Unknown(e.to_string()));
return Err(GpuDetectionError::LibloadingError(e))
}
Err(e) => return Err(GpuDetectionError::Unknown(e.to_string())),
};
Ok(Self { nvml })
}

/// `uuid` of GPU device. If not provided first available GPU device will be used.
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> anyhow::Result<Gpu> {
pub fn detect<S: AsRef<str>>(&self, uuid: Option<S>) -> Result<Gpu, GpuDetectionError> {
if let Some(uuid) = uuid {
let dev = self.nvml.device_by_uuid(uuid.as_ref()).with_context(|| {
format!("Failed to get GPU device with UUID: {}.", uuid.as_ref())
let dev = self.nvml.device_by_uuid(uuid.as_ref()).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device with UUID: {}. Err {}",
uuid.as_ref(),
err
))
})?;
return self.device_info(dev);
return self
.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()));
};

let gpu_count = self
.nvml
.device_count()
.context("Unable to get count of CUDA devices.")?;
let gpu_count = self.nvml.device_count().map_err(|err| {
GpuDetectionError::Unknown(format!("Failed to get device count. Err {}", err))
})?;

if gpu_count == 0 {
bail!("No supported GPU device available.")
return Err(GpuDetectionError::GpuAccessError("No GPU available".into()));
}

let dev = self
.nvml
.device_by_index(0)
.context("Failed to get GPU device.")?;
let index = 0;
let dev = self.nvml.device_by_index(index).map_err(|err| {
GpuDetectionError::GpuAccessError(format!(
"Failed to get GPU device under index: {}. Err {}",
index, err
))
})?;

self.device_info(dev)
.map_err(|err| GpuDetectionError::GpuInfoAccessError(err.to_string()))
}

fn device_info(&self, dev: Device) -> anyhow::Result<Gpu> {
fn device_info(&self, dev: Device) -> Result<Gpu, NvmlError> {
let model = dev.name()?;
let version = self.cuda_version()?;
let cuda = cuda(&dev, version)?;
Expand All @@ -74,18 +82,15 @@ impl GpuDetection {
})
}

fn cuda_version(&self) -> anyhow::Result<String> {
let version = self
.nvml
.sys_cuda_driver_version()
.context("Unable to get driver version")?;
fn cuda_version(&self) -> Result<String, NvmlError> {
let version = self.nvml.sys_cuda_driver_version()?;
let version_major = nvml_wrapper::cuda_driver_version_major(version);
let version_minor = nvml_wrapper::cuda_driver_version_minor(version);
Ok(format!("{}.{}", version_major, version_minor))
}
}

fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
fn cuda(dev: &Device, version: String) -> Result<Cuda, NvmlError> {
let enabled = true;
let cores = dev.num_cores()?;
let compute_capability = compute_capability(dev)?;
Expand All @@ -97,12 +102,12 @@ fn cuda(dev: &Device, version: String) -> anyhow::Result<Cuda> {
})
}

fn compute_capability(dev: &Device) -> anyhow::Result<String> {
fn compute_capability(dev: &Device) -> Result<String, NvmlError> {
let capability = dev.cuda_compute_capability()?;
Ok(format!("{}.{}", capability.major, capability.minor))
}

fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
fn clocks(dev: &Device) -> Result<Clocks, NvmlError> {
let graphics_mhz = dev.max_clock_info(Clock::Graphics)?;
let memory_mhz = dev.max_clock_info(Clock::Memory)?;
let sm_mhz = dev.max_clock_info(Clock::SM)?;
Expand All @@ -115,7 +120,7 @@ fn clocks(dev: &Device) -> anyhow::Result<Clocks> {
})
}

fn memory(dev: &Device) -> anyhow::Result<Memory> {
fn memory(dev: &Device) -> Result<Memory, NvmlError> {
let total_bytes = dev.memory_info()?.total;
let total_gib = bytes_to_gib(total_bytes);
Ok(Memory {
Expand All @@ -126,7 +131,7 @@ fn memory(dev: &Device) -> anyhow::Result<Memory> {

/// Unused because of lack of `memTransferRatemax` property.
#[allow(dead_code)]
fn bandwidth_gib(dev: &Device) -> anyhow::Result<u32> {
fn bandwidth_gib(dev: &Device) -> Result<u32, NvmlError> {
let memory_bus_width = dev.memory_bus_width()?;
let supported_memory_clocks = dev.supported_memory_clocks()?;
let max_memory_clock = supported_memory_clocks.iter().cloned().fold(0, u32::max);
Expand Down
14 changes: 4 additions & 10 deletions src/offer_template.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,12 @@
use crate::process::RuntimeConfig;

use gpu_detection::model::Gpu;
use gpu_detection::GpuDetectionError;
use gpu_detection::GpuDetection;
use ya_agreement_utils::OfferTemplate;

pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Option<Gpu>> {
match gpu_detection::GpuDetection::init() {
Ok(gpu_detection) => {
let gpu = gpu_detection.detect(config.gpu_uuid())?;
Ok(Some(gpu))
}
Err(GpuDetectionError::LibloadingError(_)) => Ok(None),
Err(e) => Err(e.into()),
}
pub(crate) fn gpu_detection<CONFIG: RuntimeConfig>(config: &CONFIG) -> anyhow::Result<Gpu> {
let gpu_detection = GpuDetection::init()?;
Ok(gpu_detection.detect(config.gpu_uuid())?)
}

pub(crate) fn template<CONFIG: RuntimeConfig>(_config: &CONFIG) -> anyhow::Result<OfferTemplate> {
Expand Down
10 changes: 4 additions & 6 deletions src/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,10 @@ pub(crate) trait Runtime: Sized {

fn offer_template(config: &Self::CONFIG) -> anyhow::Result<OfferTemplate> {
let mut template = offer_template::template(config)?;
if let Some(gpu) = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?
{
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
}
let gpu = gpu_detection(config)
.context("Generating offer template failed. Unable to detect GPU.")?;
let gpu = serde_json::value::to_value(gpu)?;
template.set_property("golem.!exp.gap-35.v1.inf.gpu", gpu);
Ok(template)
}
}
Expand Down

0 comments on commit d568370

Please sign in to comment.