fixes incorrect usage of backoff in GC #1445

Merged (1 commit) on Jul 17, 2023
12 changes: 9 additions & 3 deletions internal/datastore/common/gc.go
@@ -43,12 +43,13 @@ var (
 		Help: "The number of stale namespaces deleted by the datastore garbage collection.",
 	})
 
-	gcFailureCounter = prometheus.NewCounter(prometheus.CounterOpts{
+	gcFailureCounterConfig = prometheus.CounterOpts{
 		Namespace: "spicedb",
 		Subsystem: "datastore",
 		Name:      "gc_failure_total",
 		Help:      "The number of failed runs of the datastore garbage collection.",
-	})
+	}
+	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
 )
 
 // RegisterGCMetrics registers garbage collection metrics to the default
@@ -98,13 +99,18 @@ var MaxGCInterval = 60 * time.Minute
 // StartGarbageCollector loops forever until the context is canceled and
 // performs garbage collection on the provided interval.
 func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
+	return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout)
+}
+
+func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration) error {
 	log.Ctx(ctx).Info().
 		Dur("interval", interval).
 		Msg("datastore garbage collection worker started")
 
 	backoffInterval := backoff.NewExponentialBackOff()
 	backoffInterval.InitialInterval = interval
-	backoffInterval.MaxElapsedTime = maxDuration(MaxGCInterval, interval)
+	backoffInterval.MaxInterval = maxDuration(MaxGCInterval, interval)
+	backoffInterval.MaxElapsedTime = maxElapsedTime
 
 	nextInterval := interval

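Context for the fix: in the exponential backoff package used by this code (it appears to be github.com/cenkalti/backoff/v4), MaxInterval caps how long any single wait between retries can grow, while MaxElapsedTime is a total retry budget after which NextBackOff() returns backoff.Stop; a value of 0 disables that budget. The old code assigned the interval cap to MaxElapsedTime, so the backoff gave up instead of plateauing. A minimal standalone sketch of the corrected usage, not taken from this PR (the intervals and loop are illustrative only):

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	b := backoff.NewExponentialBackOff()
	b.InitialInterval = 100 * time.Millisecond
	b.MaxInterval = time.Minute // cap on the delay between consecutive retries
	b.MaxElapsedTime = 0        // 0 means retry forever; a non-zero value is a total give-up budget
	b.Reset()                   // reset after changing parameters

	for i := 0; i < 5; i++ {
		d := b.NextBackOff()
		if d == backoff.Stop { // only returned once a non-zero MaxElapsedTime is exhausted
			fmt.Println("backoff gave up")
			return
		}
		fmt.Printf("retry %d after %v\n", i+1, d)
	}
}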
78 changes: 78 additions & 0 deletions internal/datastore/common/gc_test.go
@@ -0,0 +1,78 @@
package common

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/authzed/spicedb/pkg/datastore"

	"github.com/prometheus/client_golang/prometheus"
	promclient "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
)

type testGC struct{}

func (t testGC) ReadyState(_ context.Context) (datastore.ReadyState, error) {
	return datastore.ReadyState{}, fmt.Errorf("hi")
}

func (t testGC) Now(_ context.Context) (time.Time, error) {
	return time.Now(), fmt.Errorf("hi")
}

func (t testGC) TxIDBefore(_ context.Context, _ time.Time) (datastore.Revision, error) {
	return nil, fmt.Errorf("hi")
}

func (t testGC) DeleteBeforeTx(_ context.Context, _ datastore.Revision) (DeletionCounts, error) {
	return DeletionCounts{}, fmt.Errorf("hi")
}

func TestGCFailureBackoff(t *testing.T) {
	defer func() {
		gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
	}()
	reg := prometheus.NewRegistry()
	require.NoError(t, reg.Register(gcFailureCounter))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute))
	}()
	time.Sleep(200 * time.Millisecond)
	cancel()

	metrics, err := reg.Gather()
	require.NoError(t, err)
	var mf *promclient.MetricFamily
	for _, metric := range metrics {
		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
			mf = metric
		}
	}
	require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored")

	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
	reg = prometheus.NewRegistry()
	require.NoError(t, reg.Register(gcFailureCounter))
	ctx, cancel = context.WithCancel(context.Background())
	defer cancel()
	go func() {
		require.Error(t, StartGarbageCollector(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Minute))
	}()
	time.Sleep(200 * time.Millisecond)
	cancel()

	metrics, err = reg.Gather()
	require.NoError(t, err)
	for _, metric := range metrics {
		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
			mf = metric
		}
	}
	require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored")
}
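As an aside, not part of this PR: when only the current value of a single-metric collector is needed, it can also be read with the testutil helper from prometheus/client_golang instead of registering it and calling Gather. A hypothetical condensed check, assuming the package-level gcFailureCounter shown above:

package common

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

// Hypothetical sketch, not from the PR: testutil.ToFloat64 reads the current
// value of a collector that exposes exactly one metric.
func TestReadGCFailureCounterDirectly(t *testing.T) {
	require.GreaterOrEqual(t, testutil.ToFloat64(gcFailureCounter), 0.0)
}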
2 changes: 2 additions & 0 deletions internal/services/health/health.go
@@ -63,6 +63,8 @@ func (hm *healthManager) Checker(ctx context.Context) func() error {
 	return func() error {
 		// Run immediately for the initial check
 		backoffInterval := backoff.NewExponentialBackOff()
+		backoffInterval.MaxElapsedTime = 0
+
 		ticker := time.After(0)
 
 		for {
3 changes: 2 additions & 1 deletion internal/telemetry/reporter.go
@@ -175,7 +175,8 @@ func RemoteReporter(
 
 	backoffInterval := backoff.NewExponentialBackOff()
 	backoffInterval.InitialInterval = interval
-	backoffInterval.MaxElapsedTime = MaxElapsedTimeBetweenReports
+	backoffInterval.MaxInterval = MaxElapsedTimeBetweenReports
+	backoffInterval.MaxElapsedTime = 0
 
 	// Must reset the backoff object after changing parameters
 	backoffInterval.Reset()