From 26b3821d6fb0ef0b194ff7d91ac71b0df101a5d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Pecka?= Date: Tue, 16 Jan 2024 17:23:31 +0100 Subject: [PATCH] hardware: raise alarms when FspYh read failures happen When a PDU or PSU read failure happens, we should also raise an alarm. The read failures are detected inside the FspYh class, which does not have access to sysrepo. Therefore, during data polling, we should also pick up any alarms that data readers want to be published. Change-Id: I42b1709d59a960b7f9ef65a65b47a0ef9b4a93d8 --- src/ietf-hardware/FspYh.cpp | 22 +++++++++++++++++++++- src/ietf-hardware/FspYh.h | 3 +++ src/ietf-hardware/IETFHardware.cpp | 11 ++++++----- src/ietf-hardware/IETFHardware.h | 11 +++++++++++ src/ietf-hardware/sysrepo/Sysrepo.cpp | 18 +++++++++++++++++- tests/hardware_fspyh.cpp | 17 ++++++++++++++--- tests/hardware_ietf-hardware.cpp | 22 ++++++++++++++++------ tests/sysrepo_ietf-hardware.cpp | 14 +++++++++++++- 8 files changed, 101 insertions(+), 17 deletions(-) diff --git a/src/ietf-hardware/FspYh.cpp b/src/ietf-hardware/FspYh.cpp index 135a82a8..bb3c9afe 100644 --- a/src/ietf-hardware/FspYh.cpp +++ b/src/ietf-hardware/FspYh.cpp @@ -11,6 +11,11 @@ #include "utils/UniqueResource.h" #include "utils/log.h" +namespace { +const auto ALARM_SENSOR_MISSING = "velia-alarms:sensor-missing-alarm"; +const auto ALARM_SENSOR_MISSING_SEVERITY = "critical"; +} + namespace velia::ietf_hardware { TransientI2C::TransientI2C(const uint8_t bus, const uint8_t address, const std::string& driverName) @@ -115,13 +120,16 @@ FspYh::~FspYh() SensorPollData FspYh::readValues() { + auto componentXPath = "/ietf-hardware:hardware/component[name='"s + m_namePrefix + "']"; + std::unique_lock lock(m_mtx); SensorPollData res; res.data = m_staticData; if (m_properties.empty()) { - res.data["/ietf-hardware:hardware/component[name='" + m_namePrefix + "']/state/oper-state"] = "disabled"; + res.data[componentXPath + "/state/oper-state"] = "disabled"; + 
res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, ALARM_SENSOR_MISSING_SEVERITY, missingAlarmDescription()}); return res; } @@ -137,6 +145,7 @@ SensorPollData FspYh::readValues() res.data = m_staticData; res.data["/ietf-hardware:hardware/component[name='" + m_namePrefix + "']/state/oper-state"] = "disabled"; res.thresholds.clear(); + res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, ALARM_SENSOR_MISSING_SEVERITY, missingAlarmDescription()}); lock.unlock(); m_cond.notify_all(); @@ -145,6 +154,7 @@ SensorPollData FspYh::readValues() } } + res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, "cleared", missingAlarmDescription()}); return res; } @@ -228,6 +238,11 @@ void FspYhPsu::createPower() })); } +std::string FspYhPsu::missingAlarmDescription() const +{ + return "PSU is unplugged."; +} + void FspYhPdu::createPower() { m_hwmon = std::make_shared(m_hwmonDir); @@ -317,4 +332,9 @@ void FspYhPdu::createPower() registerReader(SysfsValue(m_namePrefix + ":current-3V3", m_namePrefix, m_hwmon, 3)); registerReader(SysfsValue(m_namePrefix + ":power-3V3", m_namePrefix, m_hwmon, 3)); } + +std::string FspYhPdu::missingAlarmDescription() const +{ + return "I2C read failure for PDU. 
Could not get hardware sensor details."; +} } diff --git a/src/ietf-hardware/FspYh.h b/src/ietf-hardware/FspYh.h index bcdc5966..50b38344 100644 --- a/src/ietf-hardware/FspYh.h +++ b/src/ietf-hardware/FspYh.h @@ -55,16 +55,19 @@ struct FspYh { std::vector> m_properties; virtual void createPower() = 0; + virtual std::string missingAlarmDescription() const = 0; }; struct FspYhPsu : public FspYh { using FspYh::FspYh; void createPower() override; + std::string missingAlarmDescription() const override; }; struct FspYhPdu : public FspYh { using FspYh::FspYh; void createPower() override; + std::string missingAlarmDescription() const override; }; } diff --git a/src/ietf-hardware/IETFHardware.cpp b/src/ietf-hardware/IETFHardware.cpp index 9a81084f..103b45d0 100644 --- a/src/ietf-hardware/IETFHardware.cpp +++ b/src/ietf-hardware/IETFHardware.cpp @@ -78,6 +78,7 @@ void SensorPollData::merge(SensorPollData&& other) { data.merge(other.data); thresholds.merge(other.thresholds); + sideLoadedAlarms.merge(other.sideLoadedAlarms); } IETFHardware::IETFHardware() @@ -131,7 +132,7 @@ HardwareInfo IETFHardware::process() pollData.data[ietfHardwareStatePrefix + "/last-change"] = velia::utils::yangTimeFormat(std::chrono::system_clock::now()); - return {pollData.data, alarms, activeSensors}; + return {pollData.data, alarms, activeSensors, pollData.sideLoadedAlarms}; } void IETFHardware::registerDataReader(const IETFHardware::DataReader& callable) @@ -165,7 +166,7 @@ StaticData::StaticData(std::string componentName, std::optional par dataTree); } -SensorPollData StaticData::operator()() const { return {m_staticData, {}}; } +SensorPollData StaticData::operator()() const { return {m_staticData, {}, {}}; } Fans::Fans(std::string componentName, std::optional parent, std::shared_ptr hwmon, unsigned fanChannelsCount, Thresholds thresholds) : DataReader(std::move(componentName), std::move(parent)) @@ -218,7 +219,7 @@ SensorPollData Fans::operator()() const 
thr.emplace(xpathForComponent(m_componentName + ":fan" + std::to_string(i) + ":rpm") + "sensor-data/value", m_thresholds); } - return {data, thr}; + return {data, thr, {}}; } std::string getSysfsFilename(const SensorType type, int sysfsChannelNr) @@ -297,7 +298,7 @@ SensorPollData SysfsValue::operator()() const int64_t sensorValue = m_hwmon->attribute(m_sysfsFile); addSensorValue(m_log, res, m_componentName, sensorValue); - return {res, ThresholdsBySensorPath{{xpathForComponent(m_componentName) + "sensor-data/value", m_thresholds}}}; + return {res, ThresholdsBySensorPath{{xpathForComponent(m_componentName) + "sensor-data/value", m_thresholds}}, {}}; } template struct SysfsValue; @@ -350,7 +351,7 @@ SensorPollData EMMC::operator()() const auto emmcAttrs = m_emmc->attributes(); addSensorValue(m_log, data, m_componentName + ":lifetime", emmcAttrs.at("life_time")); - return {data, ThresholdsBySensorPath{{xpathForComponent(m_componentName + ":lifetime") + "sensor-data/value", m_thresholds}}}; + return {data, ThresholdsBySensorPath{{xpathForComponent(m_componentName + ":lifetime") + "sensor-data/value", m_thresholds}}, {}}; } } } diff --git a/src/ietf-hardware/IETFHardware.h b/src/ietf-hardware/IETFHardware.h index d256855f..4a7b4ff7 100644 --- a/src/ietf-hardware/IETFHardware.h +++ b/src/ietf-hardware/IETFHardware.h @@ -24,15 +24,26 @@ namespace velia::ietf_hardware { using DataTree = std::map; using ThresholdsBySensorPath = std::map>; +struct SideLoadedAlarm { + std::string alarmTypeId; + std::string resource; + std::string severity; + std::string text; + + auto operator<=>(const SideLoadedAlarm&) const = default; +}; + struct HardwareInfo { DataTree dataTree; std::map updatedTresholdCrossing; std::set activeSensors; + std::set sideLoadedAlarms; }; struct SensorPollData { DataTree data; ThresholdsBySensorPath thresholds; + std::set sideLoadedAlarms; void merge(SensorPollData&& other); }; diff --git a/src/ietf-hardware/sysrepo/Sysrepo.cpp 
b/src/ietf-hardware/sysrepo/Sysrepo.cpp index 790f7985..efe714c8 100644 --- a/src/ietf-hardware/sysrepo/Sysrepo.cpp +++ b/src/ietf-hardware/sysrepo/Sysrepo.cpp @@ -93,11 +93,12 @@ Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr hwSta DataTree prevValues; std::set seenSensors; std::map thresholdsStates; + std::set> activeSideLoadedAlarms; while (!m_quit) { m_log->trace("IetfHardware poll"); - auto [hwStateValues, thresholds, activeSensors] = m_hwState->process(); + auto [hwStateValues, thresholds, activeSensors, sideLoadedAlarms] = m_hwState->process(); std::set deletedComponents; for (const auto& sensorXPath : activeSensors) { @@ -126,6 +127,21 @@ Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr hwSta utils::valuesPush(hwStateValues, {}, discards, m_session, ::sysrepo::Datastore::Operational); + /* Publish sideloaded alarms */ + for (const auto& [alarm, resource, severity, text] : sideLoadedAlarms) { + // Sideloaded alarms are not registered using the code above, let's register those too + utils::addResourceToAlarmInventoryEntry(m_session, ALARM_SENSOR_MISSING, std::nullopt, resource); + + bool isActive = activeSideLoadedAlarms.contains({alarm, resource}); + if (isActive && severity == "cleared") { + utils::createOrUpdateAlarm(m_session, alarm, std::nullopt, resource, "cleared", text); + activeSideLoadedAlarms.erase({alarm, resource}); + } else if (!isActive && severity != "cleared") { + utils::createOrUpdateAlarm(m_session, alarm, std::nullopt, resource, severity, text); + activeSideLoadedAlarms.insert({alarm, resource}); + } + } + /* Look for nonoperational sensors to set alarms */ for (const auto& [leaf, value] : hwStateValues) { if (boost::ends_with(leaf, "/sensor-data/oper-status")) { diff --git a/tests/hardware_fspyh.cpp b/tests/hardware_fspyh.cpp index 6578a0ca..fb33f5f8 100644 --- a/tests/hardware_fspyh.cpp +++ b/tests/hardware_fspyh.cpp @@ -91,16 +91,21 @@ TEST_CASE("FspYhPsu") 
{"/ietf-hardware:hardware/component[name='ne:psu']/parent", "ne"}, {"/ietf-hardware:hardware/component[name='ne:psu']/state/oper-state", "disabled"}}; + const velia::ietf_hardware::SideLoadedAlarm alarmUnplugged = {"velia-alarms:sensor-missing-alarm", "/ietf-hardware:hardware/component[name='ne:psu']", "critical", "PSU is unplugged."}; + const velia::ietf_hardware::SideLoadedAlarm alarmPlugged = {"velia-alarms:sensor-missing-alarm", "/ietf-hardware:hardware/component[name='ne:psu']", "cleared", "PSU is unplugged."}; + std::set expectedThresholdsKeys; for (auto i : {0, 1, 2, 3, 4}) { std::this_thread::sleep_for(std::chrono::seconds(4)); velia::ietf_hardware::DataTree expected; + std::set expectedAlarms; switch (i) { case 0: expected = expectedDisabled; expectedThresholdsKeys.clear(); + expectedAlarms = {alarmUnplugged}; break; case 1: expected = { @@ -215,10 +220,12 @@ TEST_CASE("FspYhPsu") "/ietf-hardware:hardware/component[name='ne:psu:voltage-5Vsb']/sensor-data/value", "/ietf-hardware:hardware/component[name='ne:psu:voltage-in']/sensor-data/value", }; + expectedAlarms = {alarmPlugged}; break; case 2: expected = expectedDisabled; expectedThresholdsKeys.clear(); + expectedAlarms = {alarmUnplugged}; break; case 3: // Here I simulate read failure by a file from the hwmon directory. 
This happens when the user wants data from @@ -226,22 +233,26 @@ TEST_CASE("FspYhPsu") fakeI2c->removeHwmonFile("temp1_input"); expected = expectedDisabled; expectedThresholdsKeys.clear(); + expectedAlarms = {alarmUnplugged}; break; case 4: expected = expectedDisabled; expectedThresholdsKeys.clear(); + expectedAlarms = {alarmUnplugged}; break; } - auto res = psu->readValues(); + auto [data, thresholds, sideLoadedAlarms] = psu->readValues(); CAPTURE((int)counter); - REQUIRE(res.data == expected); + REQUIRE(data == expected); std::set thresholdsKeys; - std::transform(res.thresholds.begin(), res.thresholds.end(), std::inserter(thresholdsKeys, thresholdsKeys.begin()), [](const auto& kv) { return kv.first; }); + std::transform(thresholds.begin(), thresholds.end(), std::inserter(thresholdsKeys, thresholdsKeys.begin()), [](const auto& kv) { return kv.first; }); REQUIRE(thresholdsKeys == expectedThresholdsKeys); + REQUIRE(sideLoadedAlarms == expectedAlarms); + counter++; } diff --git a/tests/hardware_ietf-hardware.cpp b/tests/hardware_ietf-hardware.cpp index 0e73c5b7..b9828d49 100644 --- a/tests/hardware_ietf-hardware.cpp +++ b/tests/hardware_ietf-hardware.cpp @@ -97,6 +97,7 @@ TEST_CASE("HardwareState") velia::ietf_hardware::SensorPollData operator()() { + velia::ietf_hardware::SideLoadedAlarm alarm; velia::ietf_hardware::ThresholdsBySensorPath thr; velia::ietf_hardware::DataTree res = { {COMPONENT("ne:psu") "/class", "iana-hardware:power-supply"}, @@ -121,9 +122,13 @@ TEST_CASE("HardwareState") .warningHigh = OneThreshold{15000, 2000}, .criticalHigh = std::nullopt, }; + + alarm = {"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}; + } else { + alarm = {"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "critical", "PSU missing."}; } - return {res, thr}; + return {res, thr, {alarm}}; } }; bool psuActive = true; @@ -260,7 +265,7 @@ TEST_CASE("HardwareState") }; { - auto [data, alarms, activeSensors] = ietfHardware->process(); + auto [data, 
alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process(); NUKE_LAST_CHANGE(data); REQUIRE(data == expected); REQUIRE(alarms == std::map{ @@ -289,12 +294,13 @@ TEST_CASE("HardwareState") COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value", COMPONENT("ne:psu:child") "/sensor-data/value", }); + REQUIRE(sideLoadedAlarms == std::set{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}}); } fanValues[1] = 500; expected[COMPONENT("ne:fans:fan2:rpm") "/sensor-data/value"] = "500"; { - auto [data, alarms, activeSensors] = ietfHardware->process(); + auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process(); NUKE_LAST_CHANGE(data); REQUIRE(data == expected); REQUIRE(alarms == std::map{ @@ -313,6 +319,7 @@ TEST_CASE("HardwareState") COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value", COMPONENT("ne:psu:child") "/sensor-data/value", }); + REQUIRE(sideLoadedAlarms == std::set{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}}); } psuActive = false; @@ -332,7 +339,7 @@ TEST_CASE("HardwareState") expected[COMPONENT("ne:fans:fan3:rpm") "/sensor-data/value"] = "5000"; { - auto [data, alarms, activeSensors] = ietfHardware->process(); + auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process(); NUKE_LAST_CHANGE(data); REQUIRE(data == expected); @@ -352,6 +359,7 @@ TEST_CASE("HardwareState") COMPONENT("ne:fans:fan3:rpm") "/sensor-data/value", COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value", }); + REQUIRE(sideLoadedAlarms == std::set{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "critical", "PSU missing."}}); } psuActive = true; @@ -368,7 +376,7 @@ TEST_CASE("HardwareState") expected[COMPONENT("ne:psu:child") "/sensor-data/value-type"] = "volts-DC"; { - auto [data, alarms, activeSensors] = ietfHardware->process(); + auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process(); NUKE_LAST_CHANGE(data); REQUIRE(data == expected); @@ -388,6 +396,7 @@ 
TEST_CASE("HardwareState") COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value", COMPONENT("ne:psu:child") "/sensor-data/value", }); + REQUIRE(sideLoadedAlarms == std::set{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}}); } @@ -399,7 +408,7 @@ TEST_CASE("HardwareState") expected[COMPONENT("ne:fans:fan2:rpm") "/sensor-data/oper-status"] = "nonoperational"; { - auto [data, alarms, activeSensors] = ietfHardware->process(); + auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process(); NUKE_LAST_CHANGE(data); REQUIRE(data == expected); @@ -420,5 +429,6 @@ TEST_CASE("HardwareState") COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value", COMPONENT("ne:psu:child") "/sensor-data/value", }); + REQUIRE(sideLoadedAlarms == std::set{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}}); } } diff --git a/tests/sysrepo_ietf-hardware.cpp b/tests/sysrepo_ietf-hardware.cpp index 47d53cab..bf35e79a 100644 --- a/tests/sysrepo_ietf-hardware.cpp +++ b/tests/sysrepo_ietf-hardware.cpp @@ -205,6 +205,7 @@ TEST_CASE("IETF Hardware with sysrepo") velia::ietf_hardware::SensorPollData operator()() { + velia::ietf_hardware::SideLoadedAlarm alarm; velia::ietf_hardware::ThresholdsBySensorPath thr; velia::ietf_hardware::DataTree res = { {COMPONENT("ne:psu") "/class", "iana-hardware:power-supply"}, @@ -229,9 +230,13 @@ TEST_CASE("IETF Hardware with sysrepo") .warningHigh = OneThreshold{15000, 2000}, .criticalHigh = std::nullopt, }; + + alarm = {"velia-alarms:sensor-missing-alarm", COMPONENT("ne:psu"), "cleared", "PSU missing."}; + } else { + alarm = {"velia-alarms:sensor-missing-alarm", COMPONENT("ne:psu"), "critical", "PSU missing."}; } - return {res, thr}; + return {res, thr, {alarm}}; } }; ietfHardware->registerDataReader(PsuDataReader{psuActive, psuSensorValue}); @@ -319,6 +324,7 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:temperature-cpu") "/state/oper-state", "enabled"}, })) .IN_SEQUENCE(seq1); + 
REQUIRE_ALARM_INVENTORY_ADD_RESOURCE("velia-alarms:sensor-missing-alarm", "ne:psu").TIMES(AT_LEAST(1)); REQUIRE_ALARM_RPC("velia-alarms:sensor-low-value-alarm", "ne:power", "critical", "Sensor value crossed low threshold.").IN_SEQUENCE(seq1); auto ietfHardwareSysrepo = std::make_shared(srSess, ietfHardware, 150ms); @@ -341,6 +347,7 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:temperature-cpu") "/sensor-data/value", "222"}, })) .IN_SEQUENCE(seq1); + REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu", "critical", "PSU missing.").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-low-value-alarm", "ne:power", "cleared", "Sensor value crossed low threshold.").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu:child", "warning", "Sensor value not reported. Maybe the sensor was unplugged?").IN_SEQUENCE(seq1); REQUIRE_CALL(*sysfsTempCpu, attribute("temp1_input")).LR_RETURN(cpuTempValue).TIMES(AT_LEAST(1)); @@ -365,6 +372,7 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:psu:child") "/state/oper-state", "enabled"}, })) .IN_SEQUENCE(seq1); + REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu", "cleared", "PSU missing.").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu:child", "cleared", "Sensor value not reported. Maybe the sensor was unplugged?").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-high-value-alarm", "ne:psu:child", "warning", "Sensor value crossed high threshold.").IN_SEQUENCE(seq1); psuSensorValue = 50000; @@ -385,6 +393,7 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:psu") "/state/oper-state", "disabled"}, })) .IN_SEQUENCE(seq1); + REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu", "critical", "PSU missing.").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu:child", "warning", "Sensor value not reported. 
Maybe the sensor was unplugged?").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-high-value-alarm", "ne:psu:child", "cleared", "Sensor value crossed high threshold.").IN_SEQUENCE(seq1); psuActive = false; @@ -504,6 +513,8 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:temperature-cpu") "/state/oper-state", "enabled"}, })) .IN_SEQUENCE(seq1); + REQUIRE_ALARM_INVENTORY_ADD_RESOURCE("velia-alarms:sensor-missing-alarm", "ne:psu").TIMES(AT_LEAST(1)); + REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu", "critical", "PSU missing.").IN_SEQUENCE(seq1); REQUIRE_ALARM_RPC("velia-alarms:sensor-low-value-alarm", "ne:power", "critical", "Sensor value crossed low threshold.").IN_SEQUENCE(seq1); auto ietfHardwareSysrepo = std::make_shared(srSess, ietfHardware, 150ms); @@ -530,6 +541,7 @@ TEST_CASE("IETF Hardware with sysrepo") {COMPONENT("ne:psu:child") "/state/oper-state", "enabled"}, })) .IN_SEQUENCE(seq1); + REQUIRE_ALARM_RPC("velia-alarms:sensor-missing-alarm", "ne:psu", "cleared", "PSU missing.").IN_SEQUENCE(seq1); psuActive = true; waitForCompletionAndBitMore(seq1);