diff --git a/Documentation/cxl/cxl-set-alert-config.txt b/Documentation/cxl/cxl-set-alert-config.txt new file mode 100644 index 00000000..2ccb024d --- /dev/null +++ b/Documentation/cxl/cxl-set-alert-config.txt @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 + +cxl-set-alert-config(1) +======================= + +NAME +---- +cxl-set-alert-config - set the warning alert threshold on a CXL memdev + +SYNOPSIS +-------- +[verse] +'cxl set-alert-config <mem0> [<mem1>..<memN>] [<options>]' + +DESCRIPTION +----------- +A CXL device raises an alert when its health status changes. Critical alerts +are configured automatically by the device after a device reset. +If supported, programmable warning thresholds are also initialized to vendor +recommended defaults and can then be configured by the user. + +Use this command to configure the warning alert thresholds of a device. +Once this command is issued, the newly requested warning thresholds +override the previously programmed warning thresholds. + +To enable a warning alert, set both 'threshold=value' and 'alert=on'. To disable +a warning alert, set only 'alert=off'. Any other combination is an error. + +Use "cxl list -m -A" to examine the programmable warning threshold +capabilities of a device. + +EXAMPLES +-------- +Set warning threshold to 30 and enable alert for life used.
+---- +# cxl set-alert-config mem0 --life-used-threshold=30 --life-used-alert=on +{ +  "memdev":"mem0", +  "ram_size":"1024.00 MiB (1073.74 MB)", +  "alert_config":{ +    "life_used_prog_warn_threshold_valid":true, +    "dev_over_temperature_prog_warn_threshold_valid":false, +    "dev_under_temperature_prog_warn_threshold_valid":false, +    "corrected_volatile_mem_err_prog_warn_threshold_valid":false, +    "corrected_pmem_err_prog_warn_threshold_valid":false, +    "life_used_prog_warn_threshold_writable":true, +    "dev_over_temperature_prog_warn_threshold_writable":true, +    "dev_under_temperature_prog_warn_threshold_writable":true, +    "corrected_volatile_mem_err_prog_warn_threshold_writable":true, +    "corrected_pmem_err_prog_warn_threshold_writable":true, +    "life_used_crit_alert_threshold":75, +    "life_used_prog_warn_threshold":30, +    "dev_over_temperature_crit_alert_threshold":0, +    "dev_under_temperature_crit_alert_threshold":0, +    "dev_over_temperature_prog_warn_threshold":0, +    "dev_under_temperature_prog_warn_threshold":0, +    "corrected_volatile_mem_err_prog_warn_threshold":0, +    "corrected_pmem_err_prog_warn_threshold":0 +  }, +  "serial":"0", +  "host":"0000:0d:00.0" +} +cxl memdev: cmd_set_alert_config: set alert configuration for 1 mem +---- + +Disable warning alert for life_used.
+---- +# cxl set-alert-config mem0 --life-used-alert=off +{ +  "memdev":"mem0", +  "ram_size":"1024.00 MiB (1073.74 MB)", +  "alert_config":{ +    "life_used_prog_warn_threshold_valid":false, +    "dev_over_temperature_prog_warn_threshold_valid":false, +    "dev_under_temperature_prog_warn_threshold_valid":false, +    "corrected_volatile_mem_err_prog_warn_threshold_valid":false, +    "corrected_pmem_err_prog_warn_threshold_valid":false, +    "life_used_prog_warn_threshold_writable":true, +    "dev_over_temperature_prog_warn_threshold_writable":true, +    "dev_under_temperature_prog_warn_threshold_writable":true, +    "corrected_volatile_mem_err_prog_warn_threshold_writable":true, +    "corrected_pmem_err_prog_warn_threshold_writable":true, +    "life_used_crit_alert_threshold":75, +    "life_used_prog_warn_threshold":30, +    "dev_over_temperature_crit_alert_threshold":0, +    "dev_under_temperature_crit_alert_threshold":0, +    "dev_over_temperature_prog_warn_threshold":0, +    "dev_under_temperature_prog_warn_threshold":0, +    "corrected_volatile_mem_err_prog_warn_threshold":0, +    "corrected_pmem_err_prog_warn_threshold":0 +  }, +  "serial":"0", +  "host":"0000:0d:00.0" +} +cxl memdev: cmd_set_alert_config: set alert configuration for 1 mem +---- + +OPTIONS +------- +<memory device(s)>:: +include::memdev-option.txt[] + +-L:: +--life-used-threshold=<threshold>:: + Set <threshold> for the life used warning alert threshold. + +--life-used-alert=<'on' or 'off'>:: + Enable or disable the life used warning alert. + Options are 'on' or 'off'. + +-O:: +--over-temperature-threshold=<threshold>:: + Set <threshold> for the device over temperature warning alert threshold. + +--over-temperature-alert=<'on' or 'off'>:: + Enable or disable the device over temperature warning alert. + Options are 'on' or 'off'. + +-U:: +--under-temperature-threshold=<threshold>:: + Set <threshold> for the device under temperature warning alert threshold. + +--under-temperature-alert=<'on' or 'off'>:: + Enable or disable the device under temperature warning alert. + Options are 'on' or 'off'.
+ +-V:: +--volatile-mem-err-threshold=<threshold>:: + Set <threshold> for the corrected volatile memory error warning alert + threshold. + +--volatile-mem-err-alert=<'on' or 'off'>:: + Enable or disable the corrected volatile memory error warning alert. + Options are 'on' or 'off'. + +-P:: +--pmem-err-threshold=<threshold>:: + Set <threshold> for the corrected persistent memory error warning alert + threshold. + +--pmem-err-alert=<'on' or 'off'>:: + Enable or disable the corrected persistent memory error warning alert. + Options are 'on' or 'off'. + +-v:: +--verbose:: + Turn on verbose debug messages in the library (if libcxl was built with + logging and debug enabled). + +SEE ALSO +-------- +CXL-3.0 8.2.9.8.3.3 diff --git a/Documentation/cxl/lib/libcxl.txt b/Documentation/cxl/lib/libcxl.txt index 31bc8551..bcb89288 100644 --- a/Documentation/cxl/lib/libcxl.txt +++ b/Documentation/cxl/lib/libcxl.txt @@ -122,6 +122,7 @@ struct cxl_cmd *cxl_cmd_new_raw(struct cxl_memdev *memdev, int opcode); struct cxl_cmd *cxl_cmd_new_identify(struct cxl_memdev *memdev); struct cxl_cmd *cxl_cmd_new_get_health_info(struct cxl_memdev *memdev); struct cxl_cmd *cxl_cmd_new_get_alert_config(struct cxl_memdev *memdev); +struct cxl_cmd *cxl_cmd_new_set_alert_config(struct cxl_memdev *memdev); struct cxl_cmd *cxl_cmd_new_read_label(struct cxl_memdev *memdev, unsigned int offset, unsigned int length); struct cxl_cmd *cxl_cmd_new_write_label(struct cxl_memdev *memdev, void *buf, diff --git a/Documentation/cxl/meson.build b/Documentation/cxl/meson.build index c5533572..865aad55 100644 --- a/Documentation/cxl/meson.build +++ b/Documentation/cxl/meson.build @@ -47,6 +47,7 @@ cxl_manpages = [ 'cxl-destroy-region.txt', 'cxl-monitor.txt', 'cxl-update-firmware.txt', + 'cxl-set-alert-config.txt', ] foreach man : cxl_manpages diff --git a/cxl/builtin.h b/cxl/builtin.h index 3ec6c6cb..2c46a82d 100644 --- a/cxl/builtin.h +++ b/cxl/builtin.h @@ -15,6 +15,7 @@ int cmd_enable_memdev(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_reserve_dpa(int argc, const char **argv,
struct cxl_ctx *ctx); int cmd_free_dpa(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_update_fw(int argc, const char **argv, struct cxl_ctx *ctx); +int cmd_set_alert_config(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_disable_port(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_enable_port(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_set_partition(int argc, const char **argv, struct cxl_ctx *ctx); diff --git a/cxl/cxl.c b/cxl/cxl.c index e1524b80..bf4822f9 100644 --- a/cxl/cxl.c +++ b/cxl/cxl.c @@ -69,6 +69,7 @@ static struct cmd_struct commands[] = { { "reserve-dpa", .c_fn = cmd_reserve_dpa }, { "free-dpa", .c_fn = cmd_free_dpa }, { "update-firmware", .c_fn = cmd_update_fw }, + { "set-alert-config", .c_fn = cmd_set_alert_config }, { "disable-port", .c_fn = cmd_disable_port }, { "enable-port", .c_fn = cmd_enable_port }, { "set-partition", .c_fn = cmd_set_partition }, diff --git a/cxl/lib/libcxl.c b/cxl/lib/libcxl.c index af4ca44e..c7815669 100644 --- a/cxl/lib/libcxl.c +++ b/cxl/lib/libcxl.c @@ -4465,3 +4465,24 @@ CXL_EXPORT int cxl_memdev_read_label(struct cxl_memdev *memdev, void *buf, { return lsa_op(memdev, LSA_OP_GET, buf, length, offset); } + +#define cxl_alert_config_set_field(field) \ +CXL_EXPORT int cxl_cmd_alert_config_set_##field(struct cxl_cmd *cmd, int val) \ +{ \ + struct cxl_cmd_set_alert_config *setalert = cmd->input_payload; \ + setalert->field = val; \ + return 0; \ +} + +cxl_alert_config_set_field(life_used_prog_warn_threshold) +cxl_alert_config_set_field(dev_over_temperature_prog_warn_threshold) +cxl_alert_config_set_field(dev_under_temperature_prog_warn_threshold) +cxl_alert_config_set_field(corrected_volatile_mem_err_prog_warn_threshold) +cxl_alert_config_set_field(corrected_pmem_err_prog_warn_threshold) +cxl_alert_config_set_field(valid_alert_actions) +cxl_alert_config_set_field(enable_alert_actions) + +CXL_EXPORT struct cxl_cmd *cxl_cmd_new_set_alert_config(struct cxl_memdev *memdev) +{ + 
return cxl_cmd_new_generic(memdev, CXL_MEM_COMMAND_ID_SET_ALERT_CONFIG); +} diff --git a/cxl/lib/libcxl.sym b/cxl/lib/libcxl.sym index 8fa1cca3..6beca52c 100644 --- a/cxl/lib/libcxl.sym +++ b/cxl/lib/libcxl.sym @@ -264,3 +264,15 @@ global: cxl_memdev_update_fw; cxl_memdev_cancel_fw_update; } LIBCXL_5; + +LIBCXL_7 { +global: + cxl_cmd_alert_config_set_life_used_prog_warn_threshold; + cxl_cmd_alert_config_set_dev_over_temperature_prog_warn_threshold; + cxl_cmd_alert_config_set_dev_under_temperature_prog_warn_threshold; + cxl_cmd_alert_config_set_corrected_volatile_mem_err_prog_warn_threshold; + cxl_cmd_alert_config_set_corrected_pmem_err_prog_warn_threshold; + cxl_cmd_alert_config_set_valid_alert_actions; + cxl_cmd_alert_config_set_enable_alert_actions; + cxl_cmd_new_set_alert_config; +} LIBCXL_6; diff --git a/cxl/lib/private.h b/cxl/lib/private.h index a6417270..b26a8629 100644 --- a/cxl/lib/private.h +++ b/cxl/lib/private.h @@ -309,6 +309,18 @@ struct cxl_cmd_get_alert_config { #define CXL_CMD_ALERT_CONFIG_PROG_ALERTS_CORRECTED_PMEM_ERR_PROG_WARN_THRESHOLD_MASK \ BIT(4) +/* CXL 3.0 8.2.9.8.3.3 Set Alert Configuration */ +struct cxl_cmd_set_alert_config { + u8 valid_alert_actions; + u8 enable_alert_actions; + u8 life_used_prog_warn_threshold; + u8 rsvd; + le16 dev_over_temperature_prog_warn_threshold; + le16 dev_under_temperature_prog_warn_threshold; + le16 corrected_volatile_mem_err_prog_warn_threshold; + le16 corrected_pmem_err_prog_warn_threshold; +} __attribute__((packed)); + struct cxl_cmd_get_partition { le64 active_volatile; le64 active_persistent; diff --git a/cxl/libcxl.h b/cxl/libcxl.h index 0f4f4b26..b0ec3694 100644 --- a/cxl/libcxl.h +++ b/cxl/libcxl.h @@ -461,6 +461,22 @@ enum cxl_setpartition_mode { int cxl_cmd_partition_set_mode(struct cxl_cmd *cmd, enum cxl_setpartition_mode mode); +int cxl_cmd_alert_config_set_life_used_prog_warn_threshold(struct cxl_cmd *cmd, + int threshold); +int cxl_cmd_alert_config_set_dev_over_temperature_prog_warn_threshold( 
+ struct cxl_cmd *cmd, int threshold); +int cxl_cmd_alert_config_set_dev_under_temperature_prog_warn_threshold( + struct cxl_cmd *cmd, int threshold); +int cxl_cmd_alert_config_set_corrected_volatile_mem_err_prog_warn_threshold( + struct cxl_cmd *cmd, int threshold); +int cxl_cmd_alert_config_set_corrected_pmem_err_prog_warn_threshold( + struct cxl_cmd *cmd, int threshold); +int cxl_cmd_alert_config_set_valid_alert_actions(struct cxl_cmd *cmd, + int action); +int cxl_cmd_alert_config_set_enable_alert_actions(struct cxl_cmd *cmd, + int enable); +struct cxl_cmd *cxl_cmd_new_set_alert_config(struct cxl_memdev *memdev); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/cxl/memdev.c b/cxl/memdev.c index f6a2d3f1..2dd2e7fc 100644 --- a/cxl/memdev.c +++ b/cxl/memdev.c @@ -38,10 +38,38 @@ static struct parameters { const char *type; const char *size; const char *decoder_filter; + const char *life_used_threshold; + const char *dev_over_temperature_threshold; + const char *dev_under_temperature_threshold; + const char *corrected_volatile_mem_err_threshold; + const char *corrected_pmem_err_threshold; + const char *life_used_alert; + const char *dev_over_temperature_alert; + const char *dev_under_temperature_alert; + const char *corrected_volatile_mem_err_alert; + const char *corrected_pmem_err_alert; } param; static struct log_ctx ml; +struct alert_context { /* alert settings parsed from the command line */ + int valid_alert_actions; + int enable_alert_actions; + int life_used_threshold; + int dev_over_temperature_threshold; + int dev_under_temperature_threshold; + int corrected_volatile_mem_err_threshold; + int corrected_pmem_err_threshold; +}; + +enum cxl_setalert_event { /* value is the bit position used in the alert action masks */ + CXL_SETALERT_LIFE_USED, + CXL_SETALERT_OVER_TEMP, + CXL_SETALERT_UNDER_TEMP, + CXL_SETALERT_VOLATILE_MEM_ERROR, + CXL_SETALERT_PMEM_ERROR, +}; + enum cxl_setpart_type { CXL_SETPART_PMEM, CXL_SETPART_VOLATILE, @@ -99,6 +127,36 @@ OPT_BOOLEAN('c', "cancel", &param.cancel, \ OPT_BOOLEAN('w', "wait", &param.wait, \ "wait for firmware update to complete
before returning") +#define SET_ALERT_OPTIONS() /* one threshold option and one on/off option per alert */ \ +OPT_STRING('L', "life-used-threshold", &param.life_used_threshold, \ + "threshold", "threshold value for life used warning alert"), \ +OPT_STRING('\0', "life-used-alert", &param.life_used_alert, \ + "'on' or 'off'", "enable or disable life used warning alert"), \ +OPT_STRING('O', "over-temperature-threshold", \ + &param.dev_over_temperature_threshold, "threshold", \ + "threshold value for device over temperature warning alert"), \ +OPT_STRING('\0', "over-temperature-alert", \ + &param.dev_over_temperature_alert, "'on' or 'off'", \ + "enable or disable device over temperature warning alert"), \ +OPT_STRING('U', "under-temperature-threshold", \ + &param.dev_under_temperature_threshold, "threshold", \ + "threshold value for device under temperature warning alert"), \ +OPT_STRING('\0', "under-temperature-alert", \ + &param.dev_under_temperature_alert, "'on' or 'off'", \ + "enable or disable device under temperature warning alert"), \ +OPT_STRING('V', "volatile-mem-err-threshold", \ + &param.corrected_volatile_mem_err_threshold, "threshold", \ + "threshold value for corrected volatile mem error warning alert"), \ +OPT_STRING('\0', "volatile-mem-err-alert", \ + &param.corrected_volatile_mem_err_alert, "'on' or 'off'", \ + "enable or disable corrected volatile mem error warning alert"), \ +OPT_STRING('P', "pmem-err-threshold", \ + &param.corrected_pmem_err_threshold, "threshold", \ + "threshold value for corrected pmem error warning alert"), \ +OPT_STRING('\0', "pmem-err-alert", \ + &param.corrected_pmem_err_alert, "'on' or 'off'", \ + "enable or disable corrected pmem error warning alert") + static const struct option read_options[] = { BASE_OPTIONS(), LABEL_OPTIONS(), @@ -155,6 +213,12 @@ static const struct option update_fw_options[] = { OPT_END(), }; +static const struct option set_alert_options[] = { + BASE_OPTIONS(), + SET_ALERT_OPTIONS(), + OPT_END(), +}; + enum reserve_dpa_mode { DPA_ALLOC, DPA_FREE, @@ -706,6 +770,148 @@ static int
action_update_fw(struct cxl_memdev *memdev, return rc; } +static int validate_alert_threshold(enum cxl_setalert_event event, + long threshold) /* long: range-checked here before callers narrow to int */ +{ + if (event == CXL_SETALERT_LIFE_USED) { + if (threshold < 0 || threshold > 100) { + log_err(&ml, "Invalid life used threshold: %ld\n", + threshold); + return -EINVAL; + } + } else if (event == CXL_SETALERT_OVER_TEMP || + event == CXL_SETALERT_UNDER_TEMP) { + if (threshold < SHRT_MIN || threshold > SHRT_MAX) { + log_err(&ml, + "Invalid device temperature threshold: %ld\n", + threshold); + return -EINVAL; + } + } else { + if (threshold < 0 || threshold > USHRT_MAX) { + log_err(&ml, + "Invalid corrected mem error threshold: %ld\n", + threshold); + return -EINVAL; + } + } + return 0; /* threshold is within the range accepted for this event */ +} + +#define alert_param_set_threshold(arg, alert_event) \ +{ \ + if (!param.arg##_alert) { \ + if (param.arg##_threshold) { \ + log_err(&ml, "Action not specified\n"); \ + return -EINVAL; \ + } \ + } else if (strcmp(param.arg##_alert, "on") == 0) { \ + if (param.arg##_threshold) { \ + char *endptr; \ + long val = /* keep full width until range-checked */ \ + strtol(param.arg##_threshold, &endptr, 10); \ + if (endptr == param.arg##_threshold || endptr[0] != '\0') { \ + log_err(&ml, "Invalid threshold: %s\n", \ + param.arg##_threshold); \ + return -EINVAL; \ + } \ + rc = validate_alert_threshold(alert_event, val); \ + if (rc != 0) \ + return rc; \ + alertctx.arg##_threshold = val; /* validated: fits in int */ \ + alertctx.valid_alert_actions |= 1 << alert_event; \ + alertctx.enable_alert_actions |= 1 << alert_event; \ + } else { \ + log_err(&ml, "Threshold not specified\n"); \ + return -EINVAL; \ + } \ + } else if (strcmp(param.arg##_alert, "off") == 0) { \ + if (!param.arg##_threshold) { \ + alertctx.valid_alert_actions |= 1 << alert_event; \ + alertctx.enable_alert_actions &= ~(1 << alert_event); \ + } else { \ + log_err(&ml, "Disable not require threshold\n"); \ + return -EINVAL; \ + } \ + } else { \ + log_err(&ml, "Invalid action: %s\n", param.arg##_alert); \ + return -EINVAL; \ + } \ +} + +#define setup_threshold_field(arg) \ +{ \ + if
(param.arg##_threshold) \ + cxl_cmd_alert_config_set_##arg##_prog_warn_threshold( \ + cmd, alertctx.arg##_threshold); \ +} + +static int action_set_alert_config(struct cxl_memdev *memdev, + struct action_context *actx) +{ + const char *devname = cxl_memdev_get_devname(memdev); + struct cxl_cmd *cmd; + struct alert_context alertctx = { 0 }; + struct json_object *jmemdev; + unsigned long flags; + int rc = 0; + + alert_param_set_threshold(life_used, CXL_SETALERT_LIFE_USED) + alert_param_set_threshold(dev_over_temperature, CXL_SETALERT_OVER_TEMP) + alert_param_set_threshold(dev_under_temperature, + CXL_SETALERT_UNDER_TEMP) + alert_param_set_threshold(corrected_volatile_mem_err, + CXL_SETALERT_VOLATILE_MEM_ERROR) + alert_param_set_threshold(corrected_pmem_err, CXL_SETALERT_PMEM_ERROR) + if (alertctx.valid_alert_actions == 0) { + log_err(&ml, "No action specified\n"); + return -EINVAL; + } + + cmd = cxl_cmd_new_set_alert_config(memdev); + if (!cmd) { + rc = -ENXIO; + goto out_err; + } + + setup_threshold_field(life_used) + setup_threshold_field(dev_over_temperature) + setup_threshold_field(dev_under_temperature) + setup_threshold_field(corrected_volatile_mem_err) + setup_threshold_field(corrected_pmem_err) + cxl_cmd_alert_config_set_valid_alert_actions( + cmd, alertctx.valid_alert_actions); + cxl_cmd_alert_config_set_enable_alert_actions( + cmd, alertctx.enable_alert_actions); + + rc = cxl_cmd_submit(cmd); + if (rc < 0) { + log_err(&ml, "cmd submission failed: %s\n", strerror(-rc)); + goto out_cmd; + } + + rc = cxl_cmd_get_mbox_status(cmd); + if (rc != 0) { + log_err(&ml, "%s: mbox status: %d\n", __func__, rc); + rc = -ENXIO; + } + +out_cmd: + cxl_cmd_unref(cmd); +out_err: + if (rc) + log_err(&ml, "%s error: %s\n", devname, strerror(-rc)); + + flags = UTIL_JSON_ALERT_CONFIG; + if (actx->f_out == stdout && isatty(1)) + flags |= UTIL_JSON_HUMAN; + jmemdev = util_cxl_memdev_to_json(memdev, flags); + if (actx->jdevs && jmemdev) + json_object_array_add(actx->jdevs, jmemdev); 
+ + return rc; +} + static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx, int (*action)(struct cxl_memdev *memdev, struct action_context *actx), @@ -749,7 +955,8 @@ static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx, } if (action == action_setpartition || action == action_reserve_dpa || - action == action_free_dpa || action == action_update_fw) + action == action_free_dpa || action == action_update_fw || + action == action_set_alert_config) actx.jdevs = json_object_new_array(); if (err == argc) { @@ -968,3 +1175,14 @@ int cmd_update_fw(int argc, const char **argv, struct cxl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } + +int cmd_set_alert_config(int argc, const char **argv, struct cxl_ctx *ctx) +{ + int count = memdev_action( + argc, argv, ctx, action_set_alert_config, set_alert_options, + "cxl set-alert-config <mem0> [<mem1>..<memN>] [<options>]"); + log_info(&ml, "set alert configuration for %d mem%s\n", + count >= 0 ? count : 0, count > 1 ? "s" : ""); + + return count >= 0 ? 0 : EXIT_FAILURE; /* a negative count is reported as failure */ +}