Skip to content

Commit

Permalink
hardware: raise alarms when FspYh read failures happen
Browse files Browse the repository at this point in the history
When a PDU or PSU read failure happens, we should also raise an alarm.
The read failures are detected inside the FspYh class which does not
have access to sysrepo. Therefore during the data polling we should also
pick up any alarms that data readers want to be published.

Change-Id: I42b1709d59a960b7f9ef65a65b47a0ef9b4a93d8
  • Loading branch information
peckato1 committed Jan 18, 2024
1 parent b0923ec commit 26b3821
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 17 deletions.
22 changes: 21 additions & 1 deletion src/ietf-hardware/FspYh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include "utils/UniqueResource.h"
#include "utils/log.h"

namespace {
/** Alarm type id that FspYh::readValues() attaches to its poll result when the unit cannot be read. */
constexpr const char* ALARM_SENSOR_MISSING = "velia-alarms:sensor-missing-alarm";

/** Severity reported while the unit is missing; a successful read reports "cleared" instead. */
constexpr const char* ALARM_SENSOR_MISSING_SEVERITY = "critical";
} // namespace

namespace velia::ietf_hardware {

TransientI2C::TransientI2C(const uint8_t bus, const uint8_t address, const std::string& driverName)
Expand Down Expand Up @@ -115,13 +120,16 @@ FspYh::~FspYh()

SensorPollData FspYh::readValues()
{
auto componentXPath = "/ietf-hardware:hardware/component[name='"s + m_namePrefix + "']";

std::unique_lock lock(m_mtx);

SensorPollData res;
res.data = m_staticData;

if (m_properties.empty()) {
res.data["/ietf-hardware:hardware/component[name='" + m_namePrefix + "']/state/oper-state"] = "disabled";
res.data[componentXPath + "/state/oper-state"] = "disabled";
res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, ALARM_SENSOR_MISSING_SEVERITY, missingAlarmDescription()});
return res;
}

Expand All @@ -137,6 +145,7 @@ SensorPollData FspYh::readValues()
res.data = m_staticData;
res.data["/ietf-hardware:hardware/component[name='" + m_namePrefix + "']/state/oper-state"] = "disabled";
res.thresholds.clear();
res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, ALARM_SENSOR_MISSING_SEVERITY, missingAlarmDescription()});

lock.unlock();
m_cond.notify_all();
Expand All @@ -145,6 +154,7 @@ SensorPollData FspYh::readValues()
}
}

res.sideLoadedAlarms.insert({ALARM_SENSOR_MISSING, componentXPath, "cleared", missingAlarmDescription()});
return res;
}

Expand Down Expand Up @@ -228,6 +238,11 @@ void FspYhPsu::createPower()
}));
}

/** Text attached to the sensor-missing alarm that readValues() emits for a PSU. */
std::string FspYhPsu::missingAlarmDescription() const
{
    constexpr const char* description = "PSU is unplugged.";
    return description;
}

void FspYhPdu::createPower()
{
m_hwmon = std::make_shared<velia::ietf_hardware::sysfs::HWMon>(m_hwmonDir);
Expand Down Expand Up @@ -317,4 +332,9 @@ void FspYhPdu::createPower()
registerReader(SysfsValue<SensorType::Current>(m_namePrefix + ":current-3V3", m_namePrefix, m_hwmon, 3));
registerReader(SysfsValue<SensorType::Power>(m_namePrefix + ":power-3V3", m_namePrefix, m_hwmon, 3));
}

/** Text attached to the sensor-missing alarm that readValues() emits for a PDU. */
std::string FspYhPdu::missingAlarmDescription() const
{
    constexpr const char* description = "I2C read failure for PDU. Could not get hardware sensor details.";
    return description;
}
}
3 changes: 3 additions & 0 deletions src/ietf-hardware/FspYh.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,19 @@ struct FspYh {
std::vector<std::function<SensorPollData()>> m_properties;

virtual void createPower() = 0;
virtual std::string missingAlarmDescription() const = 0;
};

/** FspYh variant for a PSU: supplies the PSU-specific createPower() and the alarm text used when the unit is missing. */
struct FspYhPsu : FspYh {
    using FspYh::FspYh;

    std::string missingAlarmDescription() const override;
    void createPower() override;
};

/** FspYh variant for a PDU: supplies the PDU-specific createPower() and the alarm text used when reads fail. */
struct FspYhPdu : FspYh {
    using FspYh::FspYh;

    std::string missingAlarmDescription() const override;
    void createPower() override;
};

}
11 changes: 6 additions & 5 deletions src/ietf-hardware/IETFHardware.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void SensorPollData::merge(SensorPollData&& other)
{
data.merge(other.data);
thresholds.merge(other.thresholds);
sideLoadedAlarms.merge(other.sideLoadedAlarms);
}

IETFHardware::IETFHardware()
Expand Down Expand Up @@ -131,7 +132,7 @@ HardwareInfo IETFHardware::process()

pollData.data[ietfHardwareStatePrefix + "/last-change"] = velia::utils::yangTimeFormat(std::chrono::system_clock::now());

return {pollData.data, alarms, activeSensors};
return {pollData.data, alarms, activeSensors, pollData.sideLoadedAlarms};
}

void IETFHardware::registerDataReader(const IETFHardware::DataReader& callable)
Expand Down Expand Up @@ -165,7 +166,7 @@ StaticData::StaticData(std::string componentName, std::optional<std::string> par
dataTree);
}

SensorPollData StaticData::operator()() const { return {m_staticData, {}}; }
SensorPollData StaticData::operator()() const { return {m_staticData, {}, {}}; }

Fans::Fans(std::string componentName, std::optional<std::string> parent, std::shared_ptr<sysfs::HWMon> hwmon, unsigned fanChannelsCount, Thresholds<int64_t> thresholds)
: DataReader(std::move(componentName), std::move(parent))
Expand Down Expand Up @@ -218,7 +219,7 @@ SensorPollData Fans::operator()() const
thr.emplace(xpathForComponent(m_componentName + ":fan" + std::to_string(i) + ":rpm") + "sensor-data/value", m_thresholds);
}

return {data, thr};
return {data, thr, {}};
}

std::string getSysfsFilename(const SensorType type, int sysfsChannelNr)
Expand Down Expand Up @@ -297,7 +298,7 @@ SensorPollData SysfsValue<TYPE>::operator()() const
int64_t sensorValue = m_hwmon->attribute(m_sysfsFile);
addSensorValue(m_log, res, m_componentName, sensorValue);

return {res, ThresholdsBySensorPath{{xpathForComponent(m_componentName) + "sensor-data/value", m_thresholds}}};
return {res, ThresholdsBySensorPath{{xpathForComponent(m_componentName) + "sensor-data/value", m_thresholds}}, {}};
}

template struct SysfsValue<SensorType::Current>;
Expand Down Expand Up @@ -350,7 +351,7 @@ SensorPollData EMMC::operator()() const
auto emmcAttrs = m_emmc->attributes();
addSensorValue(m_log, data, m_componentName + ":lifetime", emmcAttrs.at("life_time"));

return {data, ThresholdsBySensorPath{{xpathForComponent(m_componentName + ":lifetime") + "sensor-data/value", m_thresholds}}};
return {data, ThresholdsBySensorPath{{xpathForComponent(m_componentName + ":lifetime") + "sensor-data/value", m_thresholds}}, {}};
}
}
}
11 changes: 11 additions & 0 deletions src/ietf-hardware/IETFHardware.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,26 @@ namespace velia::ietf_hardware {
using DataTree = std::map<std::string, std::string>;
using ThresholdsBySensorPath = std::map<std::string, Thresholds<int64_t>>;

/** An alarm that a data reader asks to have published on its behalf.
 *
 * Readers return these alongside the polled data; the sysrepo poll loop publishes them.
 * Member order matters: it is both the aggregate-initialization order and the order used
 * by the defaulted comparison.
 */
struct SideLoadedAlarm {
// alarm type identifier, e.g. "velia-alarms:sensor-missing-alarm"
std::string alarmTypeId;
// XPath of the hardware component the alarm is about
std::string resource;
// severity string; "cleared" is handled by the publisher as a request to clear the alarm
std::string severity;
// human-readable alarm description
std::string text;

// ordering is required because instances are stored in std::set
auto operator<=>(const SideLoadedAlarm&) const = default;
};

/** Result of a single IETFHardware::process() call.
 *
 * Callers unpack this with structured bindings, so the member order is part of the interface.
 */
struct HardwareInfo {
// polled data, keyed by XPath
DataTree dataTree;
// State per sensor, keyed by string; presumably only the threshold states that changed in this poll -- TODO confirm
std::map<std::string, State> updatedTresholdCrossing;
// presumably the XPaths of sensors that reported a value in this poll -- TODO confirm
std::set<std::string> activeSensors;
// alarms requested by the data readers, passed through unchanged from the merged poll data
std::set<SideLoadedAlarm> sideLoadedAlarms;
};

/** What one data reader returns from one poll.
 *
 * Readers brace-initialize this positionally, so the member order is part of the interface.
 */
struct SensorPollData {
// polled values, keyed by XPath
DataTree data;
// thresholds keyed by the XPath of the sensor value they apply to
ThresholdsBySensorPath thresholds;
// alarms the reader wants published (the reader itself has no sysrepo access)
std::set<SideLoadedAlarm> sideLoadedAlarms;
// merges data, thresholds and sideLoadedAlarms of `other` into this object
void merge(SensorPollData&& other);
};

Expand Down
18 changes: 17 additions & 1 deletion src/ietf-hardware/sysrepo/Sysrepo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,12 @@ Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr<IETFHardware> hwSta
DataTree prevValues;
std::set<std::string> seenSensors;
std::map<std::string, State> thresholdsStates;
std::set<std::pair<std::string, std::string>> activeSideLoadedAlarms;

while (!m_quit) {
m_log->trace("IetfHardware poll");

auto [hwStateValues, thresholds, activeSensors] = m_hwState->process();
auto [hwStateValues, thresholds, activeSensors, sideLoadedAlarms] = m_hwState->process();
std::set<std::string> deletedComponents;

for (const auto& sensorXPath : activeSensors) {
Expand Down Expand Up @@ -126,6 +127,21 @@ Sysrepo::Sysrepo(::sysrepo::Session session, std::shared_ptr<IETFHardware> hwSta

utils::valuesPush(hwStateValues, {}, discards, m_session, ::sysrepo::Datastore::Operational);

/* Publish sideloaded alarms
 *
 * Data readers have no access to sysrepo, so they hand over the alarms they want published together with the
 * polled data. Only state transitions are pushed: an alarm is raised the first time it is reported with a
 * severity other than "cleared", and it is cleared when a previously raised alarm is reported as "cleared".
 */
for (const auto& [alarm, resource, severity, text] : sideLoadedAlarms) {
    // Sideloaded alarms are not registered using the code above, let's register those too.
    // The resource is registered under the alarm type that is actually being reported (not a fixed one),
    // so that the inventory entry always matches the alarm published below.
    utils::addResourceToAlarmInventoryEntry(m_session, alarm, std::nullopt, resource);

    const auto key = std::make_pair(alarm, resource);
    const bool isActive = activeSideLoadedAlarms.contains(key);
    const bool isCleared = severity == "cleared";

    if (isActive && isCleared) {
        utils::createOrUpdateAlarm(m_session, alarm, std::nullopt, resource, "cleared", text);
        activeSideLoadedAlarms.erase(key);
    } else if (!isActive && !isCleared) {
        utils::createOrUpdateAlarm(m_session, alarm, std::nullopt, resource, severity, text);
        activeSideLoadedAlarms.insert(key);
    }
}

/* Look for nonoperational sensors to set alarms */
for (const auto& [leaf, value] : hwStateValues) {
if (boost::ends_with(leaf, "/sensor-data/oper-status")) {
Expand Down
17 changes: 14 additions & 3 deletions tests/hardware_fspyh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,16 +91,21 @@ TEST_CASE("FspYhPsu")
{"/ietf-hardware:hardware/component[name='ne:psu']/parent", "ne"},
{"/ietf-hardware:hardware/component[name='ne:psu']/state/oper-state", "disabled"}};

const velia::ietf_hardware::SideLoadedAlarm alarmUnplugged = {"velia-alarms:sensor-missing-alarm", "/ietf-hardware:hardware/component[name='ne:psu']", "critical", "PSU is unplugged."};
const velia::ietf_hardware::SideLoadedAlarm alarmPlugged = {"velia-alarms:sensor-missing-alarm", "/ietf-hardware:hardware/component[name='ne:psu']", "cleared", "PSU is unplugged."};

std::set<std::string> expectedThresholdsKeys;

for (auto i : {0, 1, 2, 3, 4}) {
std::this_thread::sleep_for(std::chrono::seconds(4));
velia::ietf_hardware::DataTree expected;
std::set<velia::ietf_hardware::SideLoadedAlarm> expectedAlarms;

switch (i) {
case 0:
expected = expectedDisabled;
expectedThresholdsKeys.clear();
expectedAlarms = {alarmUnplugged};
break;
case 1:
expected = {
Expand Down Expand Up @@ -215,33 +220,39 @@ TEST_CASE("FspYhPsu")
"/ietf-hardware:hardware/component[name='ne:psu:voltage-5Vsb']/sensor-data/value",
"/ietf-hardware:hardware/component[name='ne:psu:voltage-in']/sensor-data/value",
};
expectedAlarms = {alarmPlugged};
break;
case 2:
expected = expectedDisabled;
expectedThresholdsKeys.clear();
expectedAlarms = {alarmUnplugged};
break;
case 3:
// Simulate a read failure by removing a file from the hwmon directory. This happens when the user wants data from
// a PSU that's no longer there and the watcher thread didn't unbind it yet.
fakeI2c->removeHwmonFile("temp1_input");
expected = expectedDisabled;
expectedThresholdsKeys.clear();
expectedAlarms = {alarmUnplugged};
break;
case 4:
expected = expectedDisabled;
expectedThresholdsKeys.clear();
expectedAlarms = {alarmUnplugged};
break;
}

auto res = psu->readValues();
auto [data, thresholds, sideLoadedAlarms] = psu->readValues();

CAPTURE((int)counter);
REQUIRE(res.data == expected);
REQUIRE(data == expected);

std::set<std::string> thresholdsKeys;
std::transform(res.thresholds.begin(), res.thresholds.end(), std::inserter(thresholdsKeys, thresholdsKeys.begin()), [](const auto& kv) { return kv.first; });
std::transform(thresholds.begin(), thresholds.end(), std::inserter(thresholdsKeys, thresholdsKeys.begin()), [](const auto& kv) { return kv.first; });
REQUIRE(thresholdsKeys == expectedThresholdsKeys);

REQUIRE(sideLoadedAlarms == expectedAlarms);

counter++;
}

Expand Down
22 changes: 16 additions & 6 deletions tests/hardware_ietf-hardware.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ TEST_CASE("HardwareState")

velia::ietf_hardware::SensorPollData operator()()
{
velia::ietf_hardware::SideLoadedAlarm alarm;
velia::ietf_hardware::ThresholdsBySensorPath thr;
velia::ietf_hardware::DataTree res = {
{COMPONENT("ne:psu") "/class", "iana-hardware:power-supply"},
Expand All @@ -121,9 +122,13 @@ TEST_CASE("HardwareState")
.warningHigh = OneThreshold<int64_t>{15000, 2000},
.criticalHigh = std::nullopt,
};

alarm = {"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."};
} else {
alarm = {"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "critical", "PSU missing."};
}

return {res, thr};
return {res, thr, {alarm}};
}
};
bool psuActive = true;
Expand Down Expand Up @@ -260,7 +265,7 @@ TEST_CASE("HardwareState")
};

{
auto [data, alarms, activeSensors] = ietfHardware->process();
auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process();
NUKE_LAST_CHANGE(data);
REQUIRE(data == expected);
REQUIRE(alarms == std::map<std::string, velia::ietf_hardware::State>{
Expand Down Expand Up @@ -289,12 +294,13 @@ TEST_CASE("HardwareState")
COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value",
COMPONENT("ne:psu:child") "/sensor-data/value",
});
REQUIRE(sideLoadedAlarms == std::set<velia::ietf_hardware::SideLoadedAlarm>{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}});
}

fanValues[1] = 500;
expected[COMPONENT("ne:fans:fan2:rpm") "/sensor-data/value"] = "500";
{
auto [data, alarms, activeSensors] = ietfHardware->process();
auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process();
NUKE_LAST_CHANGE(data);
REQUIRE(data == expected);
REQUIRE(alarms == std::map<std::string, velia::ietf_hardware::State>{
Expand All @@ -313,6 +319,7 @@ TEST_CASE("HardwareState")
COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value",
COMPONENT("ne:psu:child") "/sensor-data/value",
});
REQUIRE(sideLoadedAlarms == std::set<velia::ietf_hardware::SideLoadedAlarm>{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}});
}

psuActive = false;
Expand All @@ -332,7 +339,7 @@ TEST_CASE("HardwareState")
expected[COMPONENT("ne:fans:fan3:rpm") "/sensor-data/value"] = "5000";

{
auto [data, alarms, activeSensors] = ietfHardware->process();
auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process();
NUKE_LAST_CHANGE(data);

REQUIRE(data == expected);
Expand All @@ -352,6 +359,7 @@ TEST_CASE("HardwareState")
COMPONENT("ne:fans:fan3:rpm") "/sensor-data/value",
COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value",
});
REQUIRE(sideLoadedAlarms == std::set<velia::ietf_hardware::SideLoadedAlarm>{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "critical", "PSU missing."}});
}

psuActive = true;
Expand All @@ -368,7 +376,7 @@ TEST_CASE("HardwareState")
expected[COMPONENT("ne:psu:child") "/sensor-data/value-type"] = "volts-DC";

{
auto [data, alarms, activeSensors] = ietfHardware->process();
auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process();
NUKE_LAST_CHANGE(data);

REQUIRE(data == expected);
Expand All @@ -388,6 +396,7 @@ TEST_CASE("HardwareState")
COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value",
COMPONENT("ne:psu:child") "/sensor-data/value",
});
REQUIRE(sideLoadedAlarms == std::set<velia::ietf_hardware::SideLoadedAlarm>{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}});
}


Expand All @@ -399,7 +408,7 @@ TEST_CASE("HardwareState")
expected[COMPONENT("ne:fans:fan2:rpm") "/sensor-data/oper-status"] = "nonoperational";

{
auto [data, alarms, activeSensors] = ietfHardware->process();
auto [data, alarms, activeSensors, sideLoadedAlarms] = ietfHardware->process();
NUKE_LAST_CHANGE(data);

REQUIRE(data == expected);
Expand All @@ -420,5 +429,6 @@ TEST_CASE("HardwareState")
COMPONENT("ne:fans:fan4:rpm") "/sensor-data/value",
COMPONENT("ne:psu:child") "/sensor-data/value",
});
REQUIRE(sideLoadedAlarms == std::set<velia::ietf_hardware::SideLoadedAlarm>{{"velia-alarms:sensor-missing", COMPONENT("ne:psu"), "cleared", "PSU missing."}});
}
}
Loading

0 comments on commit 26b3821

Please sign in to comment.