From 50287f2c2f8ef9fe5e3756d8691a2b718f80beb2 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Thu, 23 Jun 2022 01:37:50 +0200 Subject: [PATCH 1/3] slo: Add BurnrateName and QueryBurnrate for alerts and PromQL queries --- slo/promql.go | 45 +++++++++++++++++++++++++++++++ slo/rules.go | 69 ++++++++++++++++------------------------------- slo/rules_test.go | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 46 deletions(-) diff --git a/slo/promql.go b/slo/promql.go index 536f2ae95..3793b3c43 100644 --- a/slo/promql.go +++ b/slo/promql.go @@ -257,6 +257,51 @@ func (o Objective) QueryErrorBudget() string { return "" } +// TODO: Add grouping support! +func (o Objective) QueryBurnrate(timerange time.Duration) (string, error) { + var ( + metric string + matchers []*labels.Matcher + ) + + if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" { + metric = o.BurnrateName(timerange) + matchers = o.Indicator.Ratio.Total.LabelMatchers + } + + if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" { + metric = o.BurnrateName(timerange) + matchers = o.Indicator.Latency.Total.LabelMatchers + } + + if metric == "" { + return "", fmt.Errorf("objective misses indicator") + } + + expr, err := parser.ParseExpr(`metric{}`) + if err != nil { + return "", err + } + + for i, m := range matchers { + if m.Name == labels.MetricName { + matchers[i].Value = metric + } + } + matchers = append(matchers, &labels.Matcher{ + Type: labels.MatchEqual, + Name: "slo", + Value: o.Name(), + }) + + objectiveReplacer{ + metric: metric, + matchers: matchers, + }.replace(expr) + + return expr.String(), nil +} + type objectiveReplacer struct { metric string matchers []*labels.Matcher diff --git a/slo/rules.go b/slo/rules.go index 21d9259cd..eb8675bd1 100644 --- a/slo/rules.go +++ b/slo/rules.go @@ -26,48 +26,19 @@ type MultiBurnRateAlert struct { } func (o Objective) Alerts() ([]MultiBurnRateAlert, error) { - sloName := o.Labels.Get(labels.MetricName) ws := Windows(time.Duration(o.Window)) - var ( - metric string - matchersString string - ) - if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" { - metric = o.Indicator.Ratio.Total.Name - - // TODO: Make this a shared function with below and Burnrates func - var alertMatchers []string - for _, m := range o.Indicator.Ratio.Total.LabelMatchers { - if m.Name == labels.MetricName { - continue - } - alertMatchers = append(alertMatchers, m.String()) + mbras := make([]MultiBurnRateAlert, len(ws)) + for i, w := range ws { + queryShort, err := o.QueryBurnrate(w.Short) + if err != nil { + return nil, err } - alertMatchers = append(alertMatchers, fmt.Sprintf(`slo="%s"`, sloName)) - sort.Strings(alertMatchers) - matchersString = strings.Join(alertMatchers, ",") - } - if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" { - metric = o.Indicator.Latency.Total.Name - - // TODO: Make this a shared function with below and Burnrates func - var alertMatchers []string - for _, m := range o.Indicator.Latency.Total.LabelMatchers { - if m.Name == labels.MetricName { - continue - } - alertMatchers = append(alertMatchers, m.String()) + queryLong, err := o.QueryBurnrate(w.Long) + if err != nil { + return nil, err } - alertMatchers = append(alertMatchers, fmt.Sprintf(`slo="%s"`, sloName)) - sort.Strings(alertMatchers) - matchersString = strings.Join(alertMatchers, ",") - } - mbras := make([]MultiBurnRateAlert, len(ws)) - for i, w := range ws { - queryShort := fmt.Sprintf("%s{%s}", burnrateName(metric, w.Short), 
matchersString) - queryLong := fmt.Sprintf("%s{%s}", burnrateName(metric, w.Long), matchersString) mbras[i] = MultiBurnRateAlert{ Severity: string(w.Severity), Short: w.Short, @@ -90,7 +61,6 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { rules := make([]monitoringv1.Rule, 0, len(burnrates)) if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" { - metric := o.Indicator.Ratio.Total.Name matchers := o.Indicator.Ratio.Total.LabelMatchers groupingMap := map[string]struct{}{} @@ -111,7 +81,7 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { for _, br := range burnrates { rules = append(rules, monitoringv1.Rule{ - Record: burnrateName(metric, br), + Record: o.BurnrateName(br), Expr: intstr.FromString(o.Burnrate(br)), Labels: ruleLabels, }) @@ -145,11 +115,11 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { Alert: "ErrorBudgetBurn", // TODO: Use expr replacer Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))", - burnrateName(metric, w.Short), + o.BurnrateName(w.Short), alertMatchersString, w.Factor, strconv.FormatFloat(o.Target, 'f', -1, 64), - burnrateName(metric, w.Long), + o.BurnrateName(w.Long), alertMatchersString, w.Factor, strconv.FormatFloat(o.Target, 'f', -1, 64), @@ -162,7 +132,6 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { } if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" { - metric := o.Indicator.Latency.Total.Name matchers := o.Indicator.Latency.Total.LabelMatchers groupingMap := map[string]struct{}{} @@ -183,7 +152,7 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { for _, br := range burnrates { rules = append(rules, monitoringv1.Rule{ - Record: burnrateName(metric, br), + Record: o.BurnrateName(br), Expr: intstr.FromString(o.Burnrate(br)), Labels: ruleLabels, }) @@ -223,11 +192,11 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { Alert: "ErrorBudgetBurn", // TODO: Use expr replacer Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))", - burnrateName(metric, w.Short), + o.BurnrateName(w.Short), alertMatchersString, w.Factor, strconv.FormatFloat(o.Target, 'f', -1, 64), - burnrateName(metric, w.Long), + o.BurnrateName(w.Long), alertMatchersString, w.Factor, strconv.FormatFloat(o.Target, 'f', -1, 64), @@ -246,7 +215,15 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) { }, nil } -func burnrateName(metric string, rate time.Duration) string { +func (o Objective) BurnrateName(rate time.Duration) string { + var metric string + if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" { + metric = o.Indicator.Ratio.Total.Name + } + if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" { + metric = o.Indicator.Latency.Total.Name + } + metric = strings.TrimSuffix(metric, "_total") metric = strings.TrimSuffix(metric, "_count") return fmt.Sprintf("%s:burnrate%s", metric, model.Duration(rate)) diff --git a/slo/rules_test.go b/slo/rules_test.go index 5664778f2..d0fcc6577 100644 --- a/slo/rules_test.go +++ b/slo/rules_test.go @@ -811,6 +811,75 @@ func TestObjective_Burnrates(t *testing.T) { } } +func TestObjective_BurnrateNames(t *testing.T) { + testcases := []struct { + name string + slo Objective + burnrate string + }{{ + name: "http-ratio", + slo: objectiveHTTPRatio(), + burnrate: "http_requests:burnrate5m", + }, { + name: "http-ratio-grouping", + slo: objectiveHTTPRatioGrouping(), + burnrate: "http_requests:burnrate5m", + }, { + 
name: "http-ratio-grouping-regex", + slo: objectiveHTTPRatioGroupingRegex(), + burnrate: "http_requests:burnrate5m", + }, { + name: "grpc-errors", + slo: objectiveGRPCRatio(), + burnrate: "grpc_server_handled:burnrate5m", + }, { + name: "grpc-errors-grouping", + slo: objectiveGRPCRatioGrouping(), + burnrate: "grpc_server_handled:burnrate5m", + }, { + name: "http-latency", + slo: objectiveHTTPLatency(), + burnrate: "http_request_duration_seconds:burnrate5m", + }, { + name: "http-latency-grouping", + slo: objectiveHTTPLatencyGrouping(), + burnrate: "http_request_duration_seconds:burnrate5m", + }, { + name: "http-latency-grouping-regex", + slo: objectiveHTTPLatencyGroupingRegex(), + burnrate: "http_request_duration_seconds:burnrate5m", + }, { + name: "grpc-latency", + slo: objectiveGRPCLatency(), + burnrate: "grpc_server_handling_seconds:burnrate5m", + }, { + name: "grpc-latency-grouping", + slo: objectiveGRPCLatencyGrouping(), + burnrate: "grpc_server_handling_seconds:burnrate5m", + }, { + name: "operator-ratio", + slo: objectiveOperator(), + burnrate: "prometheus_operator_reconcile_operations:burnrate5m", + }, { + name: "operator-ratio-grouping", + slo: objectiveOperatorGrouping(), + burnrate: "prometheus_operator_reconcile_operations:burnrate5m", + }, { + name: "apiserver-write-response-errors", + slo: objectiveAPIServerRatio(), + burnrate: "apiserver_request:burnrate5m", + }, { + name: "apiserver-read-resource-latency", + slo: objectiveAPIServerLatency(), + burnrate: "apiserver_request_duration_seconds:burnrate5m", + }} + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.burnrate, tc.slo.BurnrateName(5*time.Minute)) + }) + } +} + func TestObjective_IncreaseRules(t *testing.T) { testcases := []struct { name string From 2ee1128fdb2224042c1817b891086ee120ca5368 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Tue, 28 Jun 2022 01:25:17 +0200 Subject: [PATCH 2/3] Use burnrate recording rules in alerting table and queries --- main.go | 22 +++++++---- main_test.go | 2 +- slo/promql.go | 41 ++++++++++++++------ slo/promql_test.go | 93 ++++++++++++++++++++++++++++++++++++++++++++++ slo/rules.go | 4 +- 5 files changed, 141 insertions(+), 21 deletions(-) diff --git a/main.go b/main.go index d1e18c503..0ef0720c8 100644 --- a/main.go +++ b/main.go @@ -658,9 +658,11 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro // Match alerts that at least have one character for the slo name. queryAlerts := `ALERTS{slo=~".+"}` + var groupingMatchers []*labels.Matcher + if grouping != "" && grouping != "{}" { // If grouping exists we merge those matchers directly into the queryAlerts query. 
- groupingMatchers, err := parser.ParseMetricSelector(grouping) + groupingMatchers, err = parser.ParseMetricSelector(grouping) if err != nil { return openapiserver.ImplResponse{}, fmt.Errorf("failed parsing grouping matchers: %w", err) } @@ -695,7 +697,7 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro return openapiserver.ImplResponse{Code: http.StatusInternalServerError}, err } - alerts := alertsMatchingObjectives(vector, objectives, inactive) + alerts := alertsMatchingObjectives(vector, objectives, groupingMatchers, inactive) if current { for _, objective := range objectives { @@ -712,8 +714,11 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro go func(w time.Duration) { defer wg.Done() - // TODO: Improve by using the recording rule - query := objective.Burnrate(w) + query, err := objective.QueryBurnrate(w, groupingMatchers) + if err != nil { + level.Warn(o.logger).Log("msg", "failed to query current burn rate", "err", err) + return + } value, _, err := o.promAPI.Query(contextSetPromCache(ctx, instantCache(w)), query, time.Now()) if err != nil { level.Warn(o.logger).Log("msg", "failed to query current burn rate", "err", err) @@ -769,7 +774,7 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro // All labels of an objective need to be equal if they exist on the ALERTS metric. // Therefore, only a subset on labels are taken into account // which gives the ALERTS metric the opportunity to include more custom labels. -func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, inactive bool) []openapiserver.MultiBurnrateAlert { +func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, grouping []*labels.Matcher, inactive bool) []openapiserver.MultiBurnrateAlert { alerts := make([]openapiserver.MultiBurnrateAlert, 0, len(metrics)) if inactive { @@ -782,6 +787,9 @@ func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, lset[l.Name] = l.Value } for _, w := range o.Windows() { + queryShort, _ := o.QueryBurnrate(w.Short, grouping) + queryLong, _ := o.QueryBurnrate(w.Long, grouping) + alerts = append(alerts, openapiserver.MultiBurnrateAlert{ Labels: lset, Severity: string(w.Severity), @@ -790,12 +798,12 @@ func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, Short: openapiserver.Burnrate{ Window: w.Short.Milliseconds(), Current: -1, - Query: o.Burnrate(w.Short), + Query: queryShort, }, Long: openapiserver.Burnrate{ Window: w.Long.Milliseconds(), Current: -1, - Query: o.Burnrate(w.Long), + Query: queryLong, }, State: alertstateInactive, }) diff --git a/main_test.go b/main_test.go index 79525b555..6932659f2 100644 --- a/main_test.go +++ b/main_test.go @@ -419,7 +419,7 @@ func TestAlertsMatchingObjectives(t *testing.T) { }} for _, tc := range testcases { t.Run(tc.name, func(t *testing.T) { - require.Equal(t, tc.alerts, alertsMatchingObjectives(tc.metrics, tc.objectives, tc.inactive)) + require.Equal(t, tc.alerts, alertsMatchingObjectives(tc.metrics, tc.objectives, nil, tc.inactive)) }) } } diff --git a/slo/promql.go b/slo/promql.go index 3793b3c43..4782d741c 100644 --- a/slo/promql.go +++ b/slo/promql.go @@ -257,21 +257,22 @@ func (o Objective) QueryErrorBudget() string { return "" } -// TODO: Add grouping support! 
-func (o Objective) QueryBurnrate(timerange time.Duration) (string, error) { - var ( - metric string - matchers []*labels.Matcher - ) +func (o Objective) QueryBurnrate(timerange time.Duration, groupingMatchers []*labels.Matcher) (string, error) { + metric := "" + matchers := map[string]*labels.Matcher{} if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" { metric = o.BurnrateName(timerange) - matchers = o.Indicator.Ratio.Total.LabelMatchers + for _, m := range o.Indicator.Ratio.Total.LabelMatchers { + matchers[m.Name] = m + } } if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" { metric = o.BurnrateName(timerange) - matchers = o.Indicator.Latency.Total.LabelMatchers + for _, m := range o.Indicator.Latency.Total.LabelMatchers { + matchers[m.Name] = m + } } if metric == "" { @@ -288,15 +289,33 @@ func (o Objective) QueryBurnrate(timerange time.Duration) (string, error) { matchers[i].Value = metric } } - matchers = append(matchers, &labels.Matcher{ + + for _, m := range groupingMatchers { + if m.Type != labels.MatchEqual { + return "", fmt.Errorf("grouping matcher has to be MatchEqual not %s", m.Type.String()) + } + + matchers[m.Name] = &labels.Matcher{ + Type: labels.MatchEqual, + Name: m.Name, + Value: m.Value, + } + } + + matchers["slo"] = &labels.Matcher{ Type: labels.MatchEqual, Name: "slo", Value: o.Name(), - }) + } + + matchersSlice := make([]*labels.Matcher, 0, len(matchers)) + for _, m := range matchers { + matchersSlice = append(matchersSlice, m) + } objectiveReplacer{ metric: metric, - matchers: matchers, + matchers: matchersSlice, }.replace(expr) return expr.String(), nil diff --git a/slo/promql_test.go b/slo/promql_test.go index b05abb47a..22adcdb2e 100644 --- a/slo/promql_test.go +++ b/slo/promql_test.go @@ -463,6 +463,99 @@ func TestObjective_QueryErrorBudget(t *testing.T) { } } +func TestObjective_QueryBurnrate(t *testing.T) { + testcases := []struct { + name string + objective Objective + grouping []*labels.Matcher + expected string + }{{ + name: "http-ratio", + objective: objectiveHTTPRatio(), + expected: `http_requests:burnrate5m{job="thanos-receive-default",slo="monitoring-http-errors"}`, + }, { + name: "http-ratio-grouping", + objective: objectiveHTTPRatioGrouping(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `http_requests:burnrate5m{handler="/api/v1/query",job="thanos-receive-default",slo="monitoring-http-errors"}`, + }, { + name: "http-ratio-grouping-regex", + objective: objectiveHTTPRatioGroupingRegex(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `http_requests:burnrate5m{handler="/api/v1/query",job="thanos-receive-default",slo="monitoring-http-errors"}`, + }, { + name: "grpc-ratio", + objective: objectiveGRPCRatio(), + expected: `grpc_server_handled:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",slo="monitoring-grpc-errors"}`, + }, { + name: "grpc-ratio-grouping", + objective: objectiveGRPCRatioGrouping(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `grpc_server_handled:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",handler="/api/v1/query",job="api",slo="monitoring-grpc-errors"}`, + }, { + name: "http-latency", + objective: objectiveHTTPLatency(), + expected: 
`http_request_duration_seconds:burnrate5m{code=~"2..",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`, + }, { + name: "http-latency-grouping", + objective: objectiveHTTPLatencyGrouping(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `http_request_duration_seconds:burnrate5m{code=~"2..",handler="/api/v1/query",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`, + }, { + name: "http-latency-grouping-regex", + objective: objectiveHTTPLatencyGroupingRegex(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `http_request_duration_seconds:burnrate5m{code=~"2..",handler="/api/v1/query",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`, + }, { + name: "grpc-latency", + objective: objectiveGRPCLatency(), + expected: `grpc_server_handling_seconds:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",slo="monitoring-grpc-latency"}`, + }, { + name: "grpc-latency-regex", + objective: objectiveGRPCLatencyGrouping(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"}, + }, + expected: `grpc_server_handling_seconds:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",handler="/api/v1/query",job="api",slo="monitoring-grpc-latency"}`, + }, { + name: "operator-ratio", + objective: objectiveOperator(), + expected: `prometheus_operator_reconcile_operations:burnrate5m{slo="monitoring-prometheus-operator-errors"}`, + }, { + name: "operator-ratio-grouping", + objective: objectiveOperatorGrouping(), + grouping: []*labels.Matcher{ + {Type: labels.MatchEqual, Name: "namespace", Value: "monitoring"}, + }, + expected: `prometheus_operator_reconcile_operations:burnrate5m{namespace="monitoring",slo="monitoring-prometheus-operator-errors"}`, + }, { + name: "apiserver-write-response-errors", + objective: objectiveAPIServerRatio(), + expected: `apiserver_request:burnrate5m{job="apiserver",slo="apiserver-write-response-errors",verb=~"POST|PUT|PATCH|DELETE"}`, + }, { + name: "apiserver-read-resource-latency", + objective: objectiveAPIServerRatio(), + expected: `apiserver_request:burnrate5m{job="apiserver",slo="apiserver-write-response-errors",verb=~"POST|PUT|PATCH|DELETE"}`, + }} + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + query, err := tc.objective.QueryBurnrate(5*time.Minute, tc.grouping) + require.NoError(t, err) + require.Equal(t, tc.expected, query) + }) + } +} + func TestObjective_RequestRange(t *testing.T) { testcases := []struct { name string diff --git a/slo/rules.go b/slo/rules.go index eb8675bd1..63294870f 100644 --- a/slo/rules.go +++ b/slo/rules.go @@ -30,11 +30,11 @@ func (o Objective) Alerts() ([]MultiBurnRateAlert, error) { mbras := make([]MultiBurnRateAlert, len(ws)) for i, w := range ws { - queryShort, err := o.QueryBurnrate(w.Short) + queryShort, err := o.QueryBurnrate(w.Short, nil) if err != nil { return nil, err } - queryLong, err := o.QueryBurnrate(w.Long) + queryLong, err := o.QueryBurnrate(w.Long, nil) if err != nil { return nil, err } From a8f1a84d01248fa946f69c5ab7cb0a0e94f15b0b Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Wed, 29 Jun 2022 00:23:06 +0200 Subject: [PATCH 3/3] ui: List doesn't request current values for alerts --- ui/src/pages/List.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ui/src/pages/List.tsx 
b/ui/src/pages/List.tsx index 5b4ea134d..a2f322a02 100644 --- a/ui/src/pages/List.tsx +++ b/ui/src/pages/List.tsx @@ -305,7 +305,7 @@ const List = () => { // TODO: This is prone to a concurrency race with updates of status that have additional groupings... // One solution would be to store this in a separate array and reconcile against that array after every status update. if (objectives.length > 0) { - api.getMultiBurnrateAlerts({expr: '', inactive: false}) + api.getMultiBurnrateAlerts({expr: '', inactive: false, current: false}) .then((alerts: MultiBurnrateAlert[]) => { alerts.forEach((alert: MultiBurnrateAlert) => { if (alert.state === MultiBurnrateAlertStateEnum.Firing) {
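
Usage note (reviewer sketch, not part of the patch series): the snippet below illustrates how the QueryBurnrate API added in these patches could be called with an equality grouping matcher, mirroring the cases in TestObjective_QueryBurnrate. The printBurnrateQuery helper, the example handler value, the package name, and the import paths are assumptions for illustration; only Objective.QueryBurnrate and the labels.Matcher literal shape come from the patches themselves.

package example

import (
	"fmt"
	"time"

	"github.com/prometheus/prometheus/model/labels"

	"github.com/pyrra-dev/pyrra/slo" // assumed module path for the slo package
)

// printBurnrateQuery takes an already-loaded slo.Objective and prints the
// recording-rule-backed burn rate query for a 5m window, restricted to a
// single handler through an equality grouping matcher.
func printBurnrateQuery(o slo.Objective) error {
	grouping := []*labels.Matcher{
		{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
	}

	// QueryBurnrate returns an error for objectives without a ratio or
	// latency indicator and for non-equality grouping matchers.
	query, err := o.QueryBurnrate(5*time.Minute, grouping)
	if err != nil {
		return err
	}

	// e.g. http_requests:burnrate5m{handler="/api/v1/query",job="...",slo="..."}
	fmt.Println(query)
	return nil
}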