Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fence_heuristics_resource: add new fence-agent for dynamic delay fencing #308

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
198 changes: 198 additions & 0 deletions agents/heuristics_resource/fence_heuristics_resource.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!@PYTHON@ -tt

import io
import re
import subprocess
import shlex
import sys, stat
import logging
import atexit
import time
import xml.etree.ElementTree as ET
import distutils.util as dist
sys.path.append("/usr/share/fence")
from fencing import fail_usage, run_command, fence_action, all_opt
from fencing import atexit_handler, check_input, process_input, show_docs
from fencing import run_delay

def heuristics_resource(con, options):
    """Delay fencing when the local node is the standby (SBY) node.

    Looks up the node(s) on which the configured resource is running and
    decides whether the local node is the active (ACT) node or not.  When
    the resource runs cleanly on exactly one *other* cluster member, the
    local node is treated as the standby node and the call sleeps for
    ``--standby-wait`` seconds before returning.

    Supported resource kinds (by crm_mon XML tag): a primitive
    (``resource``), a ``group``, and a promotable ``clone`` with exactly one
    instance in the ``Master`` role.  Anything else is logged as unsupported.

    Args:
        con: Unused here; present because this function is handed to
            ``fence_action`` as a callback.
        options: Parsed agent options.  Reads ``--nodename``, ``--resource``,
            ``--standby-wait``, ``--crm-node-path`` and ``--crm-mon-path``.

    Returns:
        Always ``False`` -- this agent only delays, it never reports success.
    """
    if options.get("--nodename", "") == "":
        logging.error("nodename parameter required")
        return False

    if options.get("--resource", "") == "":
        logging.error("resource parameter required")
        return False

    target = options["--nodename"]
    resource_id = options["--resource"]
    try:
        wait_time = int(options["--standby-wait"])
    except (TypeError, ValueError):
        # A malformed wait value must not crash the agent; keep the
        # "log and return False" contract used everywhere else here.
        logging.error("Invalid standby-wait value: '%s'" % options["--standby-wait"])
        return False
    crm_node_path = options["--crm-node-path"]
    crm_mon_path = options["--crm-mon-path"]

    # Name of the node this agent is running on.
    (rc, out, err) = run_command(options, "%s --name" % crm_node_path)
    if not rc == 0 or out is None:
        logging.error("Can not get my nodename. rc=%s, stderr=%s" % (rc, err))
        return False

    node = out.strip()

    if node == target:
        logging.info("Skip standby wait due to self-fencing.")
        return False

    # Cluster status as XML.
    (rc, out, err) = run_command(options, "%s --as-xml" % crm_mon_path)
    if not rc == 0 or out is None:
        logging.error("crm_mon command failed. rc=%s, stderr=%s" % (rc, err))
        return False

    try:
        tree = ET.fromstring(out)
    except ET.ParseError as parse_error:
        logging.error("Can not parse crm_mon output: %s" % parse_error)
        return False

    nodelist = [
        member.get("name")
        for member in tree.findall('./nodes//*[@type="member"]')
    ]

    resources = tree.findall('./resources//*[@id="%s"]' % resource_id)
    if len(resources) == 0:
        logging.error("Resource '%s' not found." % resource_id)
    elif len(resources) == 1:
        resource = resources[0]
        # Named resource_type so the builtin ``type`` is not shadowed.
        resource_type = resource.tag
        if resource_type == "resource":
            # primitive resource
            standby_node = check_standby_node(resource, node, nodelist)
            failed = check_failed_attrib(resource)
            if standby_node and not failed:
                return standby_wait(wait_time)
        elif resource_type == "group":
            # resource group: every member must be healthy and running on
            # the same single remote node
            standby_node = True
            failed = False
            for child in resource:
                failed |= check_failed_attrib(child)
                standby_node &= check_standby_node(child, node, nodelist)
            if standby_node and not failed:
                return standby_wait(wait_time)
        elif resource_type == "clone" and dist.strtobool(resource.get("multi_state", "false")):
            # promotable resource; a missing multi_state attribute is
            # treated as "false" instead of crashing on None
            master_nodes = 0
            standby_node = True
            failed = False
            for native in resource:
                failed |= check_failed_attrib(native)
                if native.get("role") in ["Master"]:
                    master_nodes += 1
                    standby_node &= check_standby_node(native, node, nodelist)
            if master_nodes == 1 and standby_node and not failed:
                return standby_wait(wait_time)
        else:
            # clone or bundle resource
            logging.error("Unsupported resource type: '%s'" % resource_type)
    else:
        logging.error("Multiple active resources found.")

    logging.info("Skip standby wait.")
    return False

def standby_wait(wait_time):
    """Log the standby delay, sleep for ``wait_time`` seconds, return False.

    The return value is always ``False`` so callers can hand it straight
    back as the (never successful) result of this pseudo fence agent.
    """
    message = "Standby wait %s sec" % wait_time
    logging.info(message)
    time.sleep(wait_time)
    return False

def _attr_is_true(value):
    """Interpret a crm_mon boolean attribute value.

    Accepts the same spellings as ``distutils.util.strtobool`` (y/yes/t/
    true/on/1 and n/no/f/false/off/0, case-insensitive).  A missing
    attribute (``None``) counts as false.  Any other value raises
    ``ValueError``.
    """
    if value is None:
        return False
    text = value.strip().lower()
    if text in ("y", "yes", "t", "true", "on", "1"):
        return True
    if text in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value %r" % (value,))


def check_failed_attrib(resource):
    """Return True when ``resource`` is failed and the failure is not ignored.

    Args:
        resource: An element exposing ``get(name)`` for its attributes;
            the ``failed`` and ``failure_ignored`` attributes are read.
            Either attribute may be absent, in which case it is treated
            as false.

    Returns:
        ``True`` if ``failed`` is true and ``failure_ignored`` is not.

    Raises:
        ValueError: if an attribute holds an unrecognised boolean spelling.
    """
    failed = _attr_is_true(resource.get("failed"))
    ignored = _attr_is_true(resource.get("failure_ignored"))
    return failed and not ignored

def check_standby_node(resource, nodename, nodelist):
    """Tell whether ``resource`` runs on exactly one other cluster member.

    Args:
        resource: Iterable of child elements, each exposing ``get("name")``
            for the node it runs on.
        nodename: Name of the local node.
        nodelist: Names of the known cluster members.

    Returns:
        ``True`` only when all children name the same single node, that
        node is in ``nodelist`` and it is not ``nodename``.
    """
    hosts = [child.get("name") for child in resource]
    # Zero hosts (stopped) or several distinct hosts: not a clean standby case.
    if len(set(hosts)) != 1:
        return False
    active_host = hosts[0]
    if active_host not in nodelist:
        return False
    return active_host != nodename

def define_new_opts():
    """Register the options specific to this agent in ``all_opt``.

    Adds ``nodename``, ``resource``, ``standby_wait``, ``crm_mon_path`` and
    ``crm_node_path``, in that order.  Every entry uses ``order`` 1.
    """
    # (name, getopt, longopt, required, help, shortdesc, default)
    option_table = (
        (
            "nodename", "n:", "nodename", "1",
            "-n, --nodename=[nodename] Name of node to be fenced",
            "Name of node to be fenced",
            "",
        ),
        (
            "resource", "r:", "resource", "1",
            "-r, --resource=[resource-id] ID of the resource that should be running on the ACT node. It does not make sense to specify a cloned or bundled resource unless it is promotable and has only a single master instance.",
            "Resource ID. It does not make sense to specify a cloned or bundled resource unless it is promotable and has only a single master instance.",
            "",
        ),
        (
            "standby_wait", "w:", "standby-wait", "0",
            "-w, --standby-wait=[seconds] Wait X seconds on SBY node. The agent will delay but not succeed.",
            "Wait X seconds on SBY node. The agent will delay but not succeed.",
            "5",
        ),
        (
            "crm_mon_path", ":", "crm-mon-path", "0",
            "--crm-mon-path=[path] Path to crm_mon",
            "Path to crm_mon command",
            "@CRM_MON_PATH@",
        ),
        (
            "crm_node_path", ":", "crm-node-path", "0",
            "--crm-node-path=[path] Path to crm_node",
            "Path to crm_node command",
            "@CRM_NODE_PATH@",
        ),
    )

    for name, getopt_spec, longopt, required, help_text, shortdesc, default in option_table:
        all_opt[name] = {
            "getopt": getopt_spec,
            "longopt": longopt,
            "required": required,
            "help": help_text,
            "shortdesc": shortdesc,
            "default": default,
            "order": 1,
        }


def main():
    """Entry point: parse options, emit docs if requested, run the heuristic.

    Registers the agent-specific options, defaults the fence method to
    ``cycle``, applies the generic start delay and then hands
    ``heuristics_resource`` to ``fence_action`` as both the reboot-cycle
    and the sync set-power callback.  Exits with ``fence_action``'s result.
    """
    define_new_opts()
    atexit.register(atexit_handler)

    method_opt = all_opt["method"]
    method_opt["default"] = "cycle"
    method_opt["help"] = "-m, --method=[method] Method to fence (cycle|onoff) (Default: cycle)"

    device_opt = [
        "no_status",
        "no_password",
        "nodename",
        "resource",
        "standby_wait",
        "crm_mon_path",
        "crm_node_path",
        "method",
    ]
    options = check_input(device_opt, process_input(device_opt))

    # Adjacent literals are joined at compile time, giving one long string.
    long_description = (
        "fence_heuristics_resource uses resource-heuristics to delay execution of fence agent running on next level."
        "\n.P\n"
        "This is not a fence agent by itself! "
        "Its only purpose is to delay execution of another fence agent that lives on next fencing level. "
        "Note that this agent always returns FALSE. Therefore, subsequent agents on the same fencing level will not run"
    )
    docs = {
        "shortdesc": "Fence agent for resource-heuristic based fencing delay",
        "longdesc": long_description,
        "vendorurl": "",
    }
    show_docs(options, docs)

    run_delay(options)

    # No connection and no set/get power callbacks: only the heuristic runs.
    exit_code = fence_action(
        None,
        options,
        None,
        None,
        reboot_cycle_fn=heuristics_resource,
        sync_set_power_fn=heuristics_resource,
    )
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,8 @@ AC_PATH_PROG([SNMPSET_PATH], [snmpset], [/usr/bin/snmpset])
AC_PATH_PROG([SNMPGET_PATH], [snmpget], [/usr/bin/snmpget])
AC_PATH_PROG([NOVA_PATH], [nova], [/usr/bin/nova])
AC_PATH_PROG([POWERMAN_PATH], [powerman], [/usr/bin/powerman])
AC_PATH_PROG([CRM_MON_PATH], [crm_mon], [/usr/sbin/crm_mon])
AC_PATH_PROG([CRM_NODE_PATH], [crm_node], [/usr/sbin/crm_node])

AC_PATH_PROG([PING_CMD], [ping])
AC_PATH_PROG([PING6_CMD], [ping6])
Expand Down
14 changes: 14 additions & 0 deletions fence-agents.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ fence-agents-emerson \\
fence-agents-eps \\
fence-agents-hds-cb \\
fence-agents-heuristics-ping \\
fence-agents-heuristics-resource \\
fence-agents-hpblade \\
fence-agents-ibmblade \\
fence-agents-ifmib \\
Expand Down Expand Up @@ -536,6 +537,19 @@ ping-heuristics.
%{_sbindir}/fence_heuristics_ping
%{_mandir}/man8/fence_heuristics_ping.8*

%package heuristics-resource
License: GPLv2+ and LGPLv2+
Summary: Pseudo fence agent to affect other agents based on resource-heuristics
Requires: fence-agents-common = %{version}-%{release}
BuildArch: noarch
Obsoletes: fence-agents
%description heuristics-resource
Fence pseudo agent used to affect other agents based on
resource-heuristics.
%files heuristics-resource
%{_sbindir}/fence_heuristics_resource
%{_mandir}/man8/fence_heuristics_resource.8*

%package hpblade
License: GPLv2+ and LGPLv2+
Summary: Fence agent for HP BladeSystem devices
Expand Down
2 changes: 2 additions & 0 deletions make/fencebuild.mk
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ define gen_agent_from_py
-e 's#@''SNMPGET_PATH@#${SNMPGET_PATH}#g' \
-e 's#@''NOVA_PATH@#${NOVA_PATH}#g' \
-e 's#@''POWERMAN_PATH@#${POWERMAN_PATH}#g' \
-e 's#@''CRM_MON_PATH@#${CRM_MON_PATH}#g' \
-e 's#@''CRM_NODE_PATH@#${CRM_NODE_PATH}#g' \
-e 's#@''PING_CMD@#${PING_CMD}#g' \
-e 's#@''PING6_CMD@#${PING6_CMD}#g' \
-e 's#@''PING4_CMD@#${PING4_CMD}#g' \
Expand Down
114 changes: 114 additions & 0 deletions tests/data/metadata/fence_heuristics_resource.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
<?xml version="1.0" ?>
<resource-agent name="fence_heuristics_resource" shortdesc="Fence agent for resource-heuristic based fencing delay" >
<longdesc>fence_heuristics_resource uses resource-heuristics to delay execution of fence agent running on next level.

This is not a fence agent by itself! Its only purpose is to delay execution of another fence agent that lives on next fencing level. Note that this agent always returns FALSE. Therefore, subsequent agents on the same fencing level will not run</longdesc>
<vendor-url></vendor-url>
<parameters>
<parameter name="action" unique="0" required="1">
<getopt mixed="-o, --action=[action]" />
<content type="string" default="reboot" />
<shortdesc lang="en">Fencing action</shortdesc>
</parameter>
<parameter name="crm_mon_path" unique="0" required="0">
<getopt mixed="--crm-mon-path=[path]" />
<shortdesc lang="en">Path to crm_mon command</shortdesc>
</parameter>
<parameter name="crm_node_path" unique="0" required="0">
<getopt mixed="--crm-node-path=[path]" />
<shortdesc lang="en">Path to crm_node command</shortdesc>
</parameter>
<parameter name="method" unique="0" required="0">
<getopt mixed="-m, --method=[method]" />
<content type="select" default="cycle" >
<option value="onoff" />
<option value="cycle" />
</content>
<shortdesc lang="en">Method to fence</shortdesc>
</parameter>
<parameter name="nodename" unique="0" required="1">
<getopt mixed="-n, --nodename=[nodename]" />
<content type="string" default="" />
<shortdesc lang="en">Name of node to be fenced</shortdesc>
</parameter>
<parameter name="resource" unique="0" required="1">
<getopt mixed="-r, --resource=[resource-id]" />
<content type="string" default="" />
<shortdesc lang="en">Resource ID. It does not make sense to specify a cloned or bundled resource unless it is promotable and has only a single master instance.</shortdesc>
</parameter>
<parameter name="standby_wait" unique="0" required="0">
<getopt mixed="-w, --standby-wait=[seconds]" />
<content type="string" default="5" />
<shortdesc lang="en">Wait X seconds on SBY node. The agent will delay but not succeed.</shortdesc>
</parameter>
<parameter name="quiet" unique="0" required="0">
<getopt mixed="-q, --quiet" />
<content type="boolean" />
<shortdesc lang="en">Disable logging to stderr. Does not affect --verbose or --debug-file or logging to syslog.</shortdesc>
</parameter>
<parameter name="verbose" unique="0" required="0">
<getopt mixed="-v, --verbose" />
<content type="boolean" />
<shortdesc lang="en">Verbose mode</shortdesc>
</parameter>
<parameter name="debug" unique="0" required="0" deprecated="1">
<getopt mixed="-D, --debug-file=[debugfile]" />
<content type="string" />
<shortdesc lang="en">Write debug information to given file</shortdesc>
</parameter>
<parameter name="debug_file" unique="0" required="0" obsoletes="debug">
<getopt mixed="-D, --debug-file=[debugfile]" />
<content type="string" />
<shortdesc lang="en">Write debug information to given file</shortdesc>
</parameter>
<parameter name="version" unique="0" required="0">
<getopt mixed="-V, --version" />
<content type="boolean" />
<shortdesc lang="en">Display version information and exit</shortdesc>
</parameter>
<parameter name="help" unique="0" required="0">
<getopt mixed="-h, --help" />
<content type="boolean" />
<shortdesc lang="en">Display help and exit</shortdesc>
</parameter>
<parameter name="delay" unique="0" required="0">
<getopt mixed="--delay=[seconds]" />
<content type="second" default="0" />
<shortdesc lang="en">Wait X seconds before fencing is started</shortdesc>
</parameter>
<parameter name="login_timeout" unique="0" required="0">
<getopt mixed="--login-timeout=[seconds]" />
<content type="second" default="5" />
<shortdesc lang="en">Wait X seconds for cmd prompt after login</shortdesc>
</parameter>
<parameter name="power_timeout" unique="0" required="0">
<getopt mixed="--power-timeout=[seconds]" />
<content type="second" default="20" />
<shortdesc lang="en">Test X seconds for status change after ON/OFF</shortdesc>
</parameter>
<parameter name="power_wait" unique="0" required="0">
<getopt mixed="--power-wait=[seconds]" />
<content type="second" default="0" />
<shortdesc lang="en">Wait X seconds after issuing ON/OFF</shortdesc>
</parameter>
<parameter name="shell_timeout" unique="0" required="0">
<getopt mixed="--shell-timeout=[seconds]" />
<content type="second" default="3" />
<shortdesc lang="en">Wait X seconds for cmd prompt after issuing command</shortdesc>
</parameter>
<parameter name="retry_on" unique="0" required="0">
<getopt mixed="--retry-on=[attempts]" />
<content type="integer" default="1" />
<shortdesc lang="en">Count of attempts to retry power on</shortdesc>
</parameter>
</parameters>
<actions>
<action name="on" automatic="0"/>
<action name="off" />
<action name="reboot" />
<action name="monitor" />
<action name="metadata" />
<action name="manpage" />
<action name="validate-all" />
</actions>
</resource-agent>