From 925eca00fb5fd02a4ee6c32d1564f6d363c6747a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rold=C3=A1n=20Betancort?=
Date: Sat, 15 Jul 2023 11:17:18 +0100
Subject: [PATCH] fixes incorrect usage of backoff in GC

Once `MaxElapsedTime` has elapsed, the backoff switches to the value of
`Stop`, which is -1. This means it will effectively remove the backoff,
and a storm of requests will hit the DB.

I also changed it in every instance I found, because the default
behaviour is dangerous.

See https://github.com/authzed/spicedb/issues/1444
---
 internal/datastore/common/gc.go      | 12 +++--
 internal/datastore/common/gc_test.go | 78 ++++++++++++++++++++++++++++
 internal/services/health/health.go   |  2 +
 internal/telemetry/reporter.go       |  3 +-
 4 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 internal/datastore/common/gc_test.go

diff --git a/internal/datastore/common/gc.go b/internal/datastore/common/gc.go
index 7b5de124f1..80ad3eb098 100644
--- a/internal/datastore/common/gc.go
+++ b/internal/datastore/common/gc.go
@@ -43,12 +43,13 @@ var (
 		Help:      "The number of stale namespaces deleted by the datastore garbage collection.",
 	})
 
-	gcFailureCounter = prometheus.NewCounter(prometheus.CounterOpts{
+	gcFailureCounterConfig = prometheus.CounterOpts{
 		Namespace: "spicedb",
 		Subsystem: "datastore",
 		Name:      "gc_failure_total",
 		Help:      "The number of failed runs of the datastore garbage collection.",
-	})
+	}
+	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
 )
 
 // RegisterGCMetrics registers garbage collection metrics to the default
@@ -98,13 +99,18 @@ var MaxGCInterval = 60 * time.Minute
 
 // StartGarbageCollector loops forever until the context is canceled and
 // performs garbage collection on the provided interval.
 func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
+	return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout)
+}
+
+func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration) error {
 	log.Ctx(ctx).Info().
 		Dur("interval", interval).
Msg("datastore garbage collection worker started") backoffInterval := backoff.NewExponentialBackOff() backoffInterval.InitialInterval = interval - backoffInterval.MaxElapsedTime = maxDuration(MaxGCInterval, interval) + backoffInterval.MaxInterval = maxDuration(MaxGCInterval, interval) + backoffInterval.MaxElapsedTime = maxElapsedTime nextInterval := interval diff --git a/internal/datastore/common/gc_test.go b/internal/datastore/common/gc_test.go new file mode 100644 index 0000000000..b6bd5c2b11 --- /dev/null +++ b/internal/datastore/common/gc_test.go @@ -0,0 +1,78 @@ +package common + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/authzed/spicedb/pkg/datastore" + + "github.com/prometheus/client_golang/prometheus" + promclient "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +type testGC struct{} + +func (t testGC) ReadyState(_ context.Context) (datastore.ReadyState, error) { + return datastore.ReadyState{}, fmt.Errorf("hi") +} + +func (t testGC) Now(_ context.Context) (time.Time, error) { + return time.Now(), fmt.Errorf("hi") +} + +func (t testGC) TxIDBefore(_ context.Context, _ time.Time) (datastore.Revision, error) { + return nil, fmt.Errorf("hi") +} + +func (t testGC) DeleteBeforeTx(_ context.Context, _ datastore.Revision) (DeletionCounts, error) { + return DeletionCounts{}, fmt.Errorf("hi") +} + +func TestGCFailureBackoff(t *testing.T) { + defer func() { + gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig) + }() + reg := prometheus.NewRegistry() + require.NoError(t, reg.Register(gcFailureCounter)) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute)) + }() + time.Sleep(200 * time.Millisecond) + cancel() + + metrics, err := reg.Gather() + require.NoError(t, err) + var mf *promclient.MetricFamily + for _, metric := range metrics { + if metric.GetName() == "spicedb_datastore_gc_failure_total" { + mf = metric + } + } + require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored") + + gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig) + reg = prometheus.NewRegistry() + require.NoError(t, reg.Register(gcFailureCounter)) + ctx, cancel = context.WithCancel(context.Background()) + defer cancel() + go func() { + require.Error(t, StartGarbageCollector(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Minute)) + }() + time.Sleep(200 * time.Millisecond) + cancel() + + metrics, err = reg.Gather() + require.NoError(t, err) + for _, metric := range metrics { + if metric.GetName() == "spicedb_datastore_gc_failure_total" { + mf = metric + } + } + require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored") +} diff --git a/internal/services/health/health.go b/internal/services/health/health.go index fb3819b52c..06b02bdaae 100644 --- a/internal/services/health/health.go +++ b/internal/services/health/health.go @@ -63,6 +63,8 @@ func (hm *healthManager) Checker(ctx context.Context) func() error { return func() error { // Run immediately for the initial check backoffInterval := backoff.NewExponentialBackOff() + backoffInterval.MaxElapsedTime = 0 + ticker := time.After(0) for { diff --git a/internal/telemetry/reporter.go b/internal/telemetry/reporter.go index 1442480d1b..f97f47e965 100644 --- 
a/internal/telemetry/reporter.go +++ b/internal/telemetry/reporter.go @@ -175,7 +175,8 @@ func RemoteReporter( backoffInterval := backoff.NewExponentialBackOff() backoffInterval.InitialInterval = interval - backoffInterval.MaxElapsedTime = MaxElapsedTimeBetweenReports + backoffInterval.MaxInterval = MaxElapsedTimeBetweenReports + backoffInterval.MaxElapsedTime = 0 // Must reset the backoff object after changing parameters backoffInterval.Reset()
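
The failure mode described in the commit message can be reproduced in isolation. The sketch below assumes the cenkalti/backoff/v4 package behind `backoff.NewExponentialBackOff()`; the intervals are arbitrary illustration values, not SpiceDB defaults. Once `MaxElapsedTime` is exceeded, `NextBackOff()` returns `backoff.Stop` (-1ns) on every subsequent call, so a caller that waits for whatever `NextBackOff()` returns stops waiting at all.

// Minimal standalone sketch of the pre-patch failure mode (not SpiceDB code),
// assuming the cenkalti/backoff/v4 package.
package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	policy := backoff.NewExponentialBackOff()
	policy.InitialInterval = 10 * time.Millisecond
	// A finite MaxElapsedTime, as in the pre-patch gc.go and reporter.go.
	policy.MaxElapsedTime = 50 * time.Millisecond
	policy.Reset() // reset after changing parameters

	for i := 0; i < 10; i++ {
		next := policy.NextBackOff()
		if next == backoff.Stop {
			// After MaxElapsedTime, every call returns -1ns: no more waiting,
			// so a retry loop that sleeps for `next` hammers the datastore.
			fmt.Println("policy expired: NextBackOff() returned backoff.Stop")
			return
		}
		fmt.Println("waiting", next, "before the next attempt")
		time.Sleep(next)
	}
}

With `MaxElapsedTime` left at a finite value the old GC loop hit exactly this branch, which is why the patch sets it to 0 and moves the cap to `MaxInterval` instead.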
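
The same configuration pattern appears in gc.go, health.go, and reporter.go: set `MaxElapsedTime` to 0 so the policy never expires, cap growth with `MaxInterval`, and call `Reset()` after changing parameters. Below is a hedged sketch of a retry loop built on that configuration; `retryForever` and `runOnce` are hypothetical stand-ins, not SpiceDB's actual GC loop.

// Sketch of the patched configuration; retryForever and runOnce are
// hypothetical names, assuming cenkalti/backoff/v4.
package main

import (
	"context"
	"errors"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func retryForever(ctx context.Context, interval, maxInterval time.Duration, runOnce func(context.Context) error) error {
	policy := backoff.NewExponentialBackOff()
	policy.InitialInterval = interval
	policy.MaxInterval = maxInterval // cap how large a single wait can grow
	policy.MaxElapsedTime = 0        // 0 disables expiry; NextBackOff never returns backoff.Stop
	policy.Reset()                   // must reset after changing parameters

	wait := interval
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(wait):
		}
		if err := runOnce(ctx); err != nil {
			wait = policy.NextBackOff() // bounded by MaxInterval, never -1
			continue
		}
		policy.Reset() // success: start the next failure window fresh
		wait = interval
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = retryForever(ctx, 100*time.Millisecond, time.Second, func(context.Context) error {
		return errors.New("simulated failure")
	})
}

Because `MaxElapsedTime` is 0, `NextBackOff()` always returns a real duration bounded by `MaxInterval`, so sustained failures slow down to at most one attempt per `MaxInterval` instead of spinning against the DB.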