Skip to content

Commit

Permalink
Merge pull request #350 from pyrra-dev/query-burnrate
Browse files Browse the repository at this point in the history
Use burnrate recording rules to query for alerts table
  • Loading branch information
metalmatze authored Jun 28, 2022
2 parents b715a9b + a8f1a84 commit 44c2db0
Show file tree
Hide file tree
Showing 7 changed files with 266 additions and 55 deletions.
22 changes: 15 additions & 7 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -658,9 +658,11 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro
// Match alerts that at least have one character for the slo name.
queryAlerts := `ALERTS{slo=~".+"}`

var groupingMatchers []*labels.Matcher

if grouping != "" && grouping != "{}" {
// If grouping exists we merge those matchers directly into the queryAlerts query.
groupingMatchers, err := parser.ParseMetricSelector(grouping)
groupingMatchers, err = parser.ParseMetricSelector(grouping)
if err != nil {
return openapiserver.ImplResponse{}, fmt.Errorf("failed parsing grouping matchers: %w", err)
}
Expand Down Expand Up @@ -695,7 +697,7 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro
return openapiserver.ImplResponse{Code: http.StatusInternalServerError}, err
}

alerts := alertsMatchingObjectives(vector, objectives, inactive)
alerts := alertsMatchingObjectives(vector, objectives, groupingMatchers, inactive)

if current {
for _, objective := range objectives {
Expand All @@ -712,8 +714,11 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro
go func(w time.Duration) {
defer wg.Done()

// TODO: Improve by using the recording rule
query := objective.Burnrate(w)
query, err := objective.QueryBurnrate(w, groupingMatchers)
if err != nil {
level.Warn(o.logger).Log("msg", "failed to query current burn rate", "err", err)
return
}
value, _, err := o.promAPI.Query(contextSetPromCache(ctx, instantCache(w)), query, time.Now())
if err != nil {
level.Warn(o.logger).Log("msg", "failed to query current burn rate", "err", err)
Expand Down Expand Up @@ -769,7 +774,7 @@ func (o *ObjectivesServer) GetMultiBurnrateAlerts(ctx context.Context, expr, gro
// All labels of an objective need to be equal if they exist on the ALERTS metric.
// Therefore, only a subset on labels are taken into account
// which gives the ALERTS metric the opportunity to include more custom labels.
func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, inactive bool) []openapiserver.MultiBurnrateAlert {
func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective, grouping []*labels.Matcher, inactive bool) []openapiserver.MultiBurnrateAlert {
alerts := make([]openapiserver.MultiBurnrateAlert, 0, len(metrics))

if inactive {
Expand All @@ -782,6 +787,9 @@ func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective,
lset[l.Name] = l.Value
}
for _, w := range o.Windows() {
queryShort, _ := o.QueryBurnrate(w.Short, grouping)
queryLong, _ := o.QueryBurnrate(w.Long, grouping)

alerts = append(alerts, openapiserver.MultiBurnrateAlert{
Labels: lset,
Severity: string(w.Severity),
Expand All @@ -790,12 +798,12 @@ func alertsMatchingObjectives(metrics model.Vector, objectives []slo.Objective,
Short: openapiserver.Burnrate{
Window: w.Short.Milliseconds(),
Current: -1,
Query: o.Burnrate(w.Short),
Query: queryShort,
},
Long: openapiserver.Burnrate{
Window: w.Long.Milliseconds(),
Current: -1,
Query: o.Burnrate(w.Long),
Query: queryLong,
},
State: alertstateInactive,
})
Expand Down
2 changes: 1 addition & 1 deletion main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ func TestAlertsMatchingObjectives(t *testing.T) {
}}
for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
require.Equal(t, tc.alerts, alertsMatchingObjectives(tc.metrics, tc.objectives, tc.inactive))
require.Equal(t, tc.alerts, alertsMatchingObjectives(tc.metrics, tc.objectives, nil, tc.inactive))
})
}
}
64 changes: 64 additions & 0 deletions slo/promql.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,70 @@ func (o Objective) QueryErrorBudget() string {
return ""
}

// QueryBurnrate builds a PromQL selector for the burnrate recording rule of
// the given window, e.g. `http_requests:burnrate5m{job="...",slo="..."}`.
// The selector merges the indicator's own label matchers, the provided
// groupingMatchers (which must all be MatchEqual), and a slo="<name>"
// equality matcher. It returns an error when the objective has no ratio or
// latency indicator, or when a grouping matcher is not an equality match.
func (o Objective) QueryBurnrate(timerange time.Duration, groupingMatchers []*labels.Matcher) (string, error) {
	metric := ""
	matchers := map[string]*labels.Matcher{}

	if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" {
		metric = o.BurnrateName(timerange)
		for _, m := range o.Indicator.Ratio.Total.LabelMatchers {
			matchers[m.Name] = m
		}
	}

	if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" {
		metric = o.BurnrateName(timerange)
		for _, m := range o.Indicator.Latency.Total.LabelMatchers {
			matchers[m.Name] = m
		}
	}

	if metric == "" {
		return "", fmt.Errorf("objective misses indicator")
	}

	// Placeholder expression; objectiveReplacer rewrites its metric name and
	// matchers below.
	expr, err := parser.ParseExpr(`metric{}`)
	if err != nil {
		return "", err
	}

	// Point the __name__ matcher at the recording rule name.
	// Replace the map entry with a fresh matcher instead of mutating in
	// place: the pointers in the map alias the objective's own indicator
	// LabelMatchers, and mutating them would corrupt the Objective for any
	// later query built from it.
	if m, ok := matchers[labels.MetricName]; ok {
		matchers[labels.MetricName] = &labels.Matcher{
			Type:  m.Type,
			Name:  labels.MetricName,
			Value: metric,
		}
	}

	for _, m := range groupingMatchers {
		if m.Type != labels.MatchEqual {
			return "", fmt.Errorf("grouping matcher has to be MatchEqual not %s", m.Type.String())
		}

		// Copy the grouping matcher so the caller's slice stays untouched.
		matchers[m.Name] = &labels.Matcher{
			Type:  labels.MatchEqual,
			Name:  m.Name,
			Value: m.Value,
		}
	}

	// Burnrate recording rules always carry a slo label; select on it so the
	// query only matches this objective's series.
	matchers["slo"] = &labels.Matcher{
		Type:  labels.MatchEqual,
		Name:  "slo",
		Value: o.Name(),
	}

	matchersSlice := make([]*labels.Matcher, 0, len(matchers))
	for _, m := range matchers {
		matchersSlice = append(matchersSlice, m)
	}

	objectiveReplacer{
		metric:   metric,
		matchers: matchersSlice,
	}.replace(expr)

	return expr.String(), nil
}

type objectiveReplacer struct {
metric string
matchers []*labels.Matcher
Expand Down
93 changes: 93 additions & 0 deletions slo/promql_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,99 @@ func TestObjective_QueryErrorBudget(t *testing.T) {
}
}

// TestObjective_QueryBurnrate checks that QueryBurnrate renders the burnrate
// recording-rule selector for each objective fixture at a 5m window, and that
// extra grouping matchers are merged into the selector alongside the slo label.
func TestObjective_QueryBurnrate(t *testing.T) {
	testcases := []struct {
		name      string
		objective Objective
		grouping  []*labels.Matcher // extra equality matchers merged into the selector
		expected  string
	}{{
		name:      "http-ratio",
		objective: objectiveHTTPRatio(),
		expected:  `http_requests:burnrate5m{job="thanos-receive-default",slo="monitoring-http-errors"}`,
	}, {
		name:      "http-ratio-grouping",
		objective: objectiveHTTPRatioGrouping(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `http_requests:burnrate5m{handler="/api/v1/query",job="thanos-receive-default",slo="monitoring-http-errors"}`,
	}, {
		name:      "http-ratio-grouping-regex",
		objective: objectiveHTTPRatioGroupingRegex(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `http_requests:burnrate5m{handler="/api/v1/query",job="thanos-receive-default",slo="monitoring-http-errors"}`,
	}, {
		name:      "grpc-ratio",
		objective: objectiveGRPCRatio(),
		expected:  `grpc_server_handled:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",slo="monitoring-grpc-errors"}`,
	}, {
		name:      "grpc-ratio-grouping",
		objective: objectiveGRPCRatioGrouping(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `grpc_server_handled:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",handler="/api/v1/query",job="api",slo="monitoring-grpc-errors"}`,
	}, {
		name:      "http-latency",
		objective: objectiveHTTPLatency(),
		expected:  `http_request_duration_seconds:burnrate5m{code=~"2..",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`,
	}, {
		name:      "http-latency-grouping",
		objective: objectiveHTTPLatencyGrouping(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `http_request_duration_seconds:burnrate5m{code=~"2..",handler="/api/v1/query",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`,
	}, {
		name:      "http-latency-grouping-regex",
		objective: objectiveHTTPLatencyGroupingRegex(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `http_request_duration_seconds:burnrate5m{code=~"2..",handler="/api/v1/query",job="metrics-service-thanos-receive-default",slo="monitoring-http-latency"}`,
	}, {
		name:      "grpc-latency",
		objective: objectiveGRPCLatency(),
		expected:  `grpc_server_handling_seconds:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",slo="monitoring-grpc-latency"}`,
	}, {
		// NOTE(review): name says "regex" but the grouping matcher is MatchEqual
		// like the other grouping cases — probably a leftover name; confirm.
		name:      "grpc-latency-regex",
		objective: objectiveGRPCLatencyGrouping(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "handler", Value: "/api/v1/query"},
		},
		expected: `grpc_server_handling_seconds:burnrate5m{grpc_method="Write",grpc_service="conprof.WritableProfileStore",handler="/api/v1/query",job="api",slo="monitoring-grpc-latency"}`,
	}, {
		name:      "operator-ratio",
		objective: objectiveOperator(),
		expected:  `prometheus_operator_reconcile_operations:burnrate5m{slo="monitoring-prometheus-operator-errors"}`,
	}, {
		name:      "operator-ratio-grouping",
		objective: objectiveOperatorGrouping(),
		grouping: []*labels.Matcher{
			{Type: labels.MatchEqual, Name: "namespace", Value: "monitoring"},
		},
		expected: `prometheus_operator_reconcile_operations:burnrate5m{namespace="monitoring",slo="monitoring-prometheus-operator-errors"}`,
	}, {
		name:      "apiserver-write-response-errors",
		objective: objectiveAPIServerRatio(),
		expected:  `apiserver_request:burnrate5m{job="apiserver",slo="apiserver-write-response-errors",verb=~"POST|PUT|PATCH|DELETE"}`,
	}, {
		// NOTE(review): this case is byte-identical to apiserver-write-response-errors
		// above (same objective fixture and expected string) despite the "latency"
		// name — presumably it should use an apiserver latency fixture; confirm.
		name:      "apiserver-read-resource-latency",
		objective: objectiveAPIServerRatio(),
		expected:  `apiserver_request:burnrate5m{job="apiserver",slo="apiserver-write-response-errors",verb=~"POST|PUT|PATCH|DELETE"}`,
	}}
	for _, tc := range testcases {
		t.Run(tc.name, func(t *testing.T) {
			query, err := tc.objective.QueryBurnrate(5*time.Minute, tc.grouping)
			require.NoError(t, err)
			require.Equal(t, tc.expected, query)
		})
	}
}

func TestObjective_RequestRange(t *testing.T) {
testcases := []struct {
name string
Expand Down
69 changes: 23 additions & 46 deletions slo/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,48 +26,19 @@ type MultiBurnRateAlert struct {
}

func (o Objective) Alerts() ([]MultiBurnRateAlert, error) {
sloName := o.Labels.Get(labels.MetricName)
ws := Windows(time.Duration(o.Window))

var (
metric string
matchersString string
)
if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" {
metric = o.Indicator.Ratio.Total.Name

// TODO: Make this a shared function with below and Burnrates func
var alertMatchers []string
for _, m := range o.Indicator.Ratio.Total.LabelMatchers {
if m.Name == labels.MetricName {
continue
}
alertMatchers = append(alertMatchers, m.String())
mbras := make([]MultiBurnRateAlert, len(ws))
for i, w := range ws {
queryShort, err := o.QueryBurnrate(w.Short, nil)
if err != nil {
return nil, err
}
alertMatchers = append(alertMatchers, fmt.Sprintf(`slo="%s"`, sloName))
sort.Strings(alertMatchers)
matchersString = strings.Join(alertMatchers, ",")
}
if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" {
metric = o.Indicator.Latency.Total.Name

// TODO: Make this a shared function with below and Burnrates func
var alertMatchers []string
for _, m := range o.Indicator.Latency.Total.LabelMatchers {
if m.Name == labels.MetricName {
continue
}
alertMatchers = append(alertMatchers, m.String())
queryLong, err := o.QueryBurnrate(w.Long, nil)
if err != nil {
return nil, err
}
alertMatchers = append(alertMatchers, fmt.Sprintf(`slo="%s"`, sloName))
sort.Strings(alertMatchers)
matchersString = strings.Join(alertMatchers, ",")
}

mbras := make([]MultiBurnRateAlert, len(ws))
for i, w := range ws {
queryShort := fmt.Sprintf("%s{%s}", burnrateName(metric, w.Short), matchersString)
queryLong := fmt.Sprintf("%s{%s}", burnrateName(metric, w.Long), matchersString)
mbras[i] = MultiBurnRateAlert{
Severity: string(w.Severity),
Short: w.Short,
Expand All @@ -90,7 +61,6 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
rules := make([]monitoringv1.Rule, 0, len(burnrates))

if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" {
metric := o.Indicator.Ratio.Total.Name
matchers := o.Indicator.Ratio.Total.LabelMatchers

groupingMap := map[string]struct{}{}
Expand All @@ -111,7 +81,7 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {

for _, br := range burnrates {
rules = append(rules, monitoringv1.Rule{
Record: burnrateName(metric, br),
Record: o.BurnrateName(br),
Expr: intstr.FromString(o.Burnrate(br)),
Labels: ruleLabels,
})
Expand Down Expand Up @@ -145,11 +115,11 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
Alert: "ErrorBudgetBurn",
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
burnrateName(metric, w.Short),
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
strconv.FormatFloat(o.Target, 'f', -1, 64),
burnrateName(metric, w.Long),
o.BurnrateName(w.Long),
alertMatchersString,
w.Factor,
strconv.FormatFloat(o.Target, 'f', -1, 64),
Expand All @@ -162,7 +132,6 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
}

if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" {
metric := o.Indicator.Latency.Total.Name
matchers := o.Indicator.Latency.Total.LabelMatchers

groupingMap := map[string]struct{}{}
Expand All @@ -183,7 +152,7 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {

for _, br := range burnrates {
rules = append(rules, monitoringv1.Rule{
Record: burnrateName(metric, br),
Record: o.BurnrateName(br),
Expr: intstr.FromString(o.Burnrate(br)),
Labels: ruleLabels,
})
Expand Down Expand Up @@ -223,11 +192,11 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
Alert: "ErrorBudgetBurn",
// TODO: Use expr replacer
Expr: intstr.FromString(fmt.Sprintf("%s{%s} > (%.f * (1-%s)) and %s{%s} > (%.f * (1-%s))",
burnrateName(metric, w.Short),
o.BurnrateName(w.Short),
alertMatchersString,
w.Factor,
strconv.FormatFloat(o.Target, 'f', -1, 64),
burnrateName(metric, w.Long),
o.BurnrateName(w.Long),
alertMatchersString,
w.Factor,
strconv.FormatFloat(o.Target, 'f', -1, 64),
Expand All @@ -246,7 +215,15 @@ func (o Objective) Burnrates() (monitoringv1.RuleGroup, error) {
}, nil
}

func burnrateName(metric string, rate time.Duration) string {
func (o Objective) BurnrateName(rate time.Duration) string {
var metric string
if o.Indicator.Ratio != nil && o.Indicator.Ratio.Total.Name != "" {
metric = o.Indicator.Ratio.Total.Name
}
if o.Indicator.Latency != nil && o.Indicator.Latency.Total.Name != "" {
metric = o.Indicator.Latency.Total.Name
}

metric = strings.TrimSuffix(metric, "_total")
metric = strings.TrimSuffix(metric, "_count")
return fmt.Sprintf("%s:burnrate%s", metric, model.Duration(rate))
Expand Down
Loading

0 comments on commit 44c2db0

Please sign in to comment.