From 925eca00fb5fd02a4ee6c32d1564f6d363c6747a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADctor=20Rold=C3=A1n=20Betancort?=
Date: Sat, 15 Jul 2023 11:17:18 +0100
Subject: [PATCH] fixes incorrect usage of backoff in GC

Once `MaxElapsedTime` has elapsed, the backoff switches to the value of
`Stop`, which is -1. This means it will effectively remove the backoff,
and a storm of requests will hit the DB.

I also changed it in every instance I found, because the default
behaviour is dangerous.

See https://github.com/authzed/spicedb/issues/1444
---
 internal/datastore/common/gc.go      | 12 +++--
 internal/datastore/common/gc_test.go | 78 ++++++++++++++++++++++++++++
 internal/services/health/health.go   |  2 +
 internal/telemetry/reporter.go       |  3 +-
 4 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 internal/datastore/common/gc_test.go

diff --git a/internal/datastore/common/gc.go b/internal/datastore/common/gc.go
index 7b5de124f1..80ad3eb098 100644
--- a/internal/datastore/common/gc.go
+++ b/internal/datastore/common/gc.go
@@ -43,12 +43,13 @@ var (
 		Help:      "The number of stale namespaces deleted by the datastore garbage collection.",
 	})
 
-	gcFailureCounter = prometheus.NewCounter(prometheus.CounterOpts{
+	gcFailureCounterConfig = prometheus.CounterOpts{
 		Namespace: "spicedb",
 		Subsystem: "datastore",
 		Name:      "gc_failure_total",
 		Help:      "The number of failed runs of the datastore garbage collection.",
-	})
+	}
+	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
 )
 
 // RegisterGCMetrics registers garbage collection metrics to the default
@@ -98,13 +99,18 @@ var MaxGCInterval = 60 * time.Minute
 
 // StartGarbageCollector loops forever until the context is canceled and
 // performs garbage collection on the provided interval.
 func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
+	return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout)
+}
+
+func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration) error {
 	log.Ctx(ctx).Info().
 		Dur("interval", interval).
Msg("datastore garbage collection worker started") backoffInterval := backoff.NewExponentialBackOff() backoffInterval.InitialInterval = interval - backoffInterval.MaxElapsedTime = maxDuration(MaxGCInterval, interval) + backoffInterval.MaxInterval = maxDuration(MaxGCInterval, interval) + backoffInterval.MaxElapsedTime = maxElapsedTime nextInterval := interval diff --git a/internal/datastore/common/gc_test.go b/internal/datastore/common/gc_test.go new file mode 100644 index 0000000000..b6bd5c2b11 --- /dev/null +++ b/internal/datastore/common/gc_test.go @@ -0,0 +1,78 @@ +package common + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/authzed/spicedb/pkg/datastore" + + "github.com/prometheus/client_golang/prometheus" + promclient "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/require" +) + +type testGC struct{} + +func (t testGC) ReadyState(_ context.Context) (datastore.ReadyState, error) { + return datastore.ReadyState{}, fmt.Errorf("hi") +} + +func (t testGC) Now(_ context.Context) (time.Time, error) { + return time.Now(), fmt.Errorf("hi") +} + +func (t testGC) TxIDBefore(_ context.Context, _ time.Time) (datastore.Revision, error) { + return nil, fmt.Errorf("hi") +} + +func (t testGC) DeleteBeforeTx(_ context.Context, _ datastore.Revision) (DeletionCounts, error) { + return DeletionCounts{}, fmt.Errorf("hi") +} + +func TestGCFailureBackoff(t *testing.T) { + defer func() { + gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig) + }() + reg := prometheus.NewRegistry() + require.NoError(t, reg.Register(gcFailureCounter)) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute)) + }() + time.Sleep(200 * time.Millisecond) + cancel() + + metrics, err := reg.Gather() + require.NoError(t, err) + var mf *promclient.MetricFamily + for _, metric := range metrics { + if metric.GetName() == "spicedb_datastore_gc_failure_total" { + mf = metric + } + } + require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored") + + gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig) + reg = prometheus.NewRegistry() + require.NoError(t, reg.Register(gcFailureCounter)) + ctx, cancel = context.WithCancel(context.Background()) + defer cancel() + go func() { + require.Error(t, StartGarbageCollector(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Minute)) + }() + time.Sleep(200 * time.Millisecond) + cancel() + + metrics, err = reg.Gather() + require.NoError(t, err) + for _, metric := range metrics { + if metric.GetName() == "spicedb_datastore_gc_failure_total" { + mf = metric + } + } + require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored") +} diff --git a/internal/services/health/health.go b/internal/services/health/health.go index fb3819b52c..06b02bdaae 100644 --- a/internal/services/health/health.go +++ b/internal/services/health/health.go @@ -63,6 +63,8 @@ func (hm *healthManager) Checker(ctx context.Context) func() error { return func() error { // Run immediately for the initial check backoffInterval := backoff.NewExponentialBackOff() + backoffInterval.MaxElapsedTime = 0 + ticker := time.After(0) for { diff --git a/internal/telemetry/reporter.go b/internal/telemetry/reporter.go index 1442480d1b..f97f47e965 100644 --- 
a/internal/telemetry/reporter.go +++ b/internal/telemetry/reporter.go @@ -175,7 +175,8 @@ func RemoteReporter( backoffInterval := backoff.NewExponentialBackOff() backoffInterval.InitialInterval = interval - backoffInterval.MaxElapsedTime = MaxElapsedTimeBetweenReports + backoffInterval.MaxInterval = MaxElapsedTimeBetweenReports + backoffInterval.MaxElapsedTime = 0 // Must reset the backoff object after changing parameters backoffInterval.Reset()
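
The failure mode described in the commit message can be reproduced in isolation. The sketch below assumes the cenkalti/backoff/v4 package behind `backoff.NewExponentialBackOff()`; the intervals are arbitrary illustration values, not SpiceDB defaults. Once `MaxElapsedTime` is exceeded, `NextBackOff()` returns `backoff.Stop` (-1ns) on every subsequent call, so a caller that waits for whatever `NextBackOff()` returns stops waiting at all.

// Minimal standalone sketch of the pre-patch failure mode (not SpiceDB code),
// assuming the cenkalti/backoff/v4 package.
package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	policy := backoff.NewExponentialBackOff()
	policy.InitialInterval = 10 * time.Millisecond
	// A finite MaxElapsedTime, as in the pre-patch gc.go and reporter.go.
	policy.MaxElapsedTime = 50 * time.Millisecond
	policy.Reset() // reset after changing parameters

	for i := 0; i < 10; i++ {
		next := policy.NextBackOff()
		if next == backoff.Stop {
			// After MaxElapsedTime, every call returns -1ns: no more waiting,
			// so a retry loop that sleeps for `next` hammers the datastore.
			fmt.Println("policy expired: NextBackOff() returned backoff.Stop")
			return
		}
		fmt.Println("waiting", next, "before the next attempt")
		time.Sleep(next)
	}
}

With `MaxElapsedTime` left at a finite value the old GC loop hit exactly this branch, which is why the patch sets it to 0 and moves the cap to `MaxInterval` instead.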
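
The same configuration pattern appears in gc.go, health.go, and reporter.go: set `MaxElapsedTime` to 0 so the policy never expires, cap growth with `MaxInterval`, and call `Reset()` after changing parameters. Below is a hedged sketch of a retry loop built on that configuration; `retryForever` and `runOnce` are hypothetical stand-ins, not SpiceDB's actual GC loop.

// Sketch of the patched configuration; retryForever and runOnce are
// hypothetical names, assuming cenkalti/backoff/v4.
package main

import (
	"context"
	"errors"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func retryForever(ctx context.Context, interval, maxInterval time.Duration, runOnce func(context.Context) error) error {
	policy := backoff.NewExponentialBackOff()
	policy.InitialInterval = interval
	policy.MaxInterval = maxInterval // cap how large a single wait can grow
	policy.MaxElapsedTime = 0        // 0 disables expiry; NextBackOff never returns backoff.Stop
	policy.Reset()                   // must reset after changing parameters

	wait := interval
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(wait):
		}
		if err := runOnce(ctx); err != nil {
			wait = policy.NextBackOff() // bounded by MaxInterval, never -1
			continue
		}
		policy.Reset() // success: start the next failure window fresh
		wait = interval
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	_ = retryForever(ctx, 100*time.Millisecond, time.Second, func(context.Context) error {
		return errors.New("simulated failure")
	})
}

Because `MaxElapsedTime` is 0, `NextBackOff()` always returns a real duration bounded by `MaxInterval`, so sustained failures slow down to at most one attempt per `MaxInterval` instead of spinning against the DB.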