Skip to content

Commit

Permalink
ref(seer grouping): Use circuit breaker for Seer call during grouping (
Browse files Browse the repository at this point in the history
…#71635)

This adds a circuit breaker to the Seer call we make before creating a new group, backed up by a new option, `seer.similarity.circuit-breaker-config`. This means that if requests to Seer in that spot error out too many times in a row, over too short a timespan, we'll temporarity block the majority of those calls. (Periodically we'll let one through, to see if whatever was broken has fixed itself, at which point we can resume calling it like normal.)

Right now what "too many errors, too fast" means is more than 30 in an hour, which is the circuit breaker default. If we want to adjust that in the future, we can do so by modifying the new option, which right now only overrides the circuit breaker defaults to turn on that periodic "see if things are okay now" bypass.
  • Loading branch information
lobsterkatie authored Jun 5, 2024
1 parent 86f237e commit bfecb17
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 2 deletions.
17 changes: 15 additions & 2 deletions src/sentry/event_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@
from sentry.utils.circuit_breaker import (
ERROR_COUNT_CACHE_KEY,
CircuitBreakerPassthrough,
CircuitBreakerTripped,
circuit_breaker_activated,
with_circuit_breaker,
)
from sentry.utils.dates import to_datetime
from sentry.utils.event import has_event_minified_stack_trace, has_stacktrace, is_handled
Expand Down Expand Up @@ -1493,8 +1495,10 @@ def _save_aggregate(
try:
# If the `projects:similarity-embeddings-grouping` feature is disabled,
# we'll still get back result metadata, but `seer_matched_group` will be None
seer_response_data, seer_matched_group = get_seer_similar_issues(
event, primary_hashes
seer_response_data, seer_matched_group = with_circuit_breaker(
"event_manager.get_seer_similar_issues",
lambda: get_seer_similar_issues(event, primary_hashes),
options.get("seer.similarity.circuit-breaker-config"),
)
event.data["seer_similarity"] = seer_response_data

Expand All @@ -1505,6 +1509,15 @@ def _save_aggregate(
"seer_similarity"
] = seer_response_data

except CircuitBreakerTripped:
# TODO: For now, all of the logging/netrics for this happening are handled
# inside of `with_circuit_breaker`. We should figure out if/how we want to
# reflect landing here in the `outcome` tag on the span and timer metric
# below and in `record_calculation_metric_with_result` (also below). Same
# goes for the various tests the event could fail in
# `should_call_seer_for_grouping`.
pass

# Insurance - in theory we shouldn't ever land here
except Exception as e:
sentry_sdk.capture_exception(
Expand Down
4 changes: 4 additions & 0 deletions src/sentry/grouping/ingest/seer.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ def should_call_seer_for_grouping(event: Event, project: Project) -> bool:
if not event_content_is_seer_eligible(event):
return False

# The circuit breaker check which might naturally also go here (along with its killswitch and
# ratelimiting friends) instead happens in the `with_circuit_breaker` helper used where
# `get_seer_similar_issues` is actually called. (It has to be there in order for it to track
# errors arising from that call.)
if _killswitch_enabled(event, project) or _ratelimiting_enabled(event, project):
return False

Expand Down
10 changes: 10 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,16 @@
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)

register(
"seer.similarity.circuit-breaker-config",
type=Dict,
# TODO: For now we're using the defaults for everything but `allow_passthrough`. We may want to
# revisit that choice in the future.
default={"allow_passthrough": True},
flags=FLAG_ALLOW_EMPTY | FLAG_AUTOMATOR_MODIFIABLE,
)


# seer nearest neighbour endpoint timeout
register(
"embeddings-grouping.seer.nearest-neighbour-timeout",
Expand Down
18 changes: 18 additions & 0 deletions tests/sentry/event_manager/grouping/test_seer_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sentry.testutils.helpers.eventprocessing import save_new_event
from sentry.testutils.helpers.features import with_feature
from sentry.testutils.pytest.mocking import capture_results
from sentry.utils.circuit_breaker import with_circuit_breaker
from sentry.utils.types import NonNone


Expand Down Expand Up @@ -151,6 +152,23 @@ def test_obeys_seer_similarity_flags(self):
assert get_seer_similar_issues_return_values[0][1] == existing_event.group
assert new_event.group_id == existing_event.group_id

@patch("sentry.event_manager.should_call_seer_for_grouping", return_value=True)
@patch("sentry.event_manager.with_circuit_breaker", wraps=with_circuit_breaker)
@patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
def test_obeys_circult_breaker(
self, mock_get_seer_similar_issues: MagicMock, mock_with_circuit_breaker: MagicMock, _
):
with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=True):
save_new_event({"message": "Dogs are great!"}, self.project)
assert mock_with_circuit_breaker.call_count == 1
assert mock_get_seer_similar_issues.call_count == 1

with patch("sentry.utils.circuit_breaker._should_call_callback", return_value=False):
save_new_event({"message": "Adopt don't shop"}, self.project)

assert mock_with_circuit_breaker.call_count == 2 # increased
assert mock_get_seer_similar_issues.call_count == 1 # didn't increase

@with_feature("projects:similarity-embeddings-metadata")
@patch("sentry.grouping.ingest.seer.event_content_is_seer_eligible", return_value=True)
@patch("sentry.event_manager.get_seer_similar_issues", return_value=({}, None))
Expand Down

0 comments on commit bfecb17

Please sign in to comment.