Skip to content

Commit

Permalink
Merge pull request #1445 from authzed/fix-GC
Browse files Browse the repository at this point in the history
fixes incorrect usage of backoff in GC
  • Loading branch information
vroldanbet authored Jul 17, 2023
2 parents f225c4d + 925eca0 commit 3832ee8
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 4 deletions.
12 changes: 9 additions & 3 deletions internal/datastore/common/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,13 @@ var (
Help: "The number of stale namespaces deleted by the datastore garbage collection.",
})

gcFailureCounter = prometheus.NewCounter(prometheus.CounterOpts{
gcFailureCounterConfig = prometheus.CounterOpts{
Namespace: "spicedb",
Subsystem: "datastore",
Name: "gc_failure_total",
Help: "The number of failed runs of the datastore garbage collection.",
})
}
gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
)

// RegisterGCMetrics registers garbage collection metrics to the default
Expand Down Expand Up @@ -98,13 +99,18 @@ var MaxGCInterval = 60 * time.Minute
// StartGarbageCollector loops forever until the context is canceled and
// performs garbage collection on the provided interval.
func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout)
}

func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration) error {
log.Ctx(ctx).Info().
Dur("interval", interval).
Msg("datastore garbage collection worker started")

backoffInterval := backoff.NewExponentialBackOff()
backoffInterval.InitialInterval = interval
backoffInterval.MaxElapsedTime = maxDuration(MaxGCInterval, interval)
backoffInterval.MaxInterval = maxDuration(MaxGCInterval, interval)
backoffInterval.MaxElapsedTime = maxElapsedTime

nextInterval := interval

Expand Down
78 changes: 78 additions & 0 deletions internal/datastore/common/gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
package common

import (
"context"
"fmt"
"testing"
"time"

"github.com/authzed/spicedb/pkg/datastore"

"github.com/prometheus/client_golang/prometheus"
promclient "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
)

type testGC struct{}

func (t testGC) ReadyState(_ context.Context) (datastore.ReadyState, error) {
return datastore.ReadyState{}, fmt.Errorf("hi")
}

func (t testGC) Now(_ context.Context) (time.Time, error) {
return time.Now(), fmt.Errorf("hi")
}

func (t testGC) TxIDBefore(_ context.Context, _ time.Time) (datastore.Revision, error) {
return nil, fmt.Errorf("hi")
}

func (t testGC) DeleteBeforeTx(_ context.Context, _ datastore.Revision) (DeletionCounts, error) {
return DeletionCounts{}, fmt.Errorf("hi")
}

func TestGCFailureBackoff(t *testing.T) {
defer func() {
gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
}()
reg := prometheus.NewRegistry()
require.NoError(t, reg.Register(gcFailureCounter))

ctx, cancel := context.WithCancel(context.Background())
defer cancel()
go func() {
require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute))
}()
time.Sleep(200 * time.Millisecond)
cancel()

metrics, err := reg.Gather()
require.NoError(t, err)
var mf *promclient.MetricFamily
for _, metric := range metrics {
if metric.GetName() == "spicedb_datastore_gc_failure_total" {
mf = metric
}
}
require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored")

gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
reg = prometheus.NewRegistry()
require.NoError(t, reg.Register(gcFailureCounter))
ctx, cancel = context.WithCancel(context.Background())
defer cancel()
go func() {
require.Error(t, StartGarbageCollector(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Minute))
}()
time.Sleep(200 * time.Millisecond)
cancel()

metrics, err = reg.Gather()
require.NoError(t, err)
for _, metric := range metrics {
if metric.GetName() == "spicedb_datastore_gc_failure_total" {
mf = metric
}
}
require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored")
}
2 changes: 2 additions & 0 deletions internal/services/health/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ func (hm *healthManager) Checker(ctx context.Context) func() error {
return func() error {
// Run immediately for the initial check
backoffInterval := backoff.NewExponentialBackOff()
backoffInterval.MaxElapsedTime = 0

ticker := time.After(0)

for {
Expand Down
3 changes: 2 additions & 1 deletion internal/telemetry/reporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ func RemoteReporter(

backoffInterval := backoff.NewExponentialBackOff()
backoffInterval.InitialInterval = interval
backoffInterval.MaxElapsedTime = MaxElapsedTimeBetweenReports
backoffInterval.MaxInterval = MaxElapsedTimeBetweenReports
backoffInterval.MaxElapsedTime = 0

// Must reset the backoff object after changing parameters
backoffInterval.Reset()
Expand Down

0 comments on commit 3832ee8

Please sign in to comment.