fixes incorrect usage of backoff in GC #1445

Merged (1 commit) on Jul 17, 2023
12 changes: 9 additions & 3 deletions internal/datastore/common/gc.go
@@ -43,12 +43,13 @@ var (
 		Help: "The number of stale namespaces deleted by the datastore garbage collection.",
 	})
 
-	gcFailureCounter = prometheus.NewCounter(prometheus.CounterOpts{
+	gcFailureCounterConfig = prometheus.CounterOpts{
 		Namespace: "spicedb",
 		Subsystem: "datastore",
 		Name:      "gc_failure_total",
 		Help:      "The number of failed runs of the datastore garbage collection.",
-	})
+	}
+	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
 )
 
 // RegisterGCMetrics registers garbage collection metrics to the default
@@ -98,13 +99,18 @@ var MaxGCInterval = 60 * time.Minute
 // StartGarbageCollector loops forever until the context is canceled and
 // performs garbage collection on the provided interval.
 func StartGarbageCollector(ctx context.Context, gc GarbageCollector, interval, window, timeout time.Duration) error {
+	return startGarbageCollectorWithMaxElapsedTime(ctx, gc, interval, window, 0, timeout)
+}
+
+func startGarbageCollectorWithMaxElapsedTime(ctx context.Context, gc GarbageCollector, interval, window, maxElapsedTime, timeout time.Duration) error {
 	log.Ctx(ctx).Info().
 		Dur("interval", interval).
 		Msg("datastore garbage collection worker started")
 
 	backoffInterval := backoff.NewExponentialBackOff()
 	backoffInterval.InitialInterval = interval
-	backoffInterval.MaxElapsedTime = maxDuration(MaxGCInterval, interval)
+	backoffInterval.MaxInterval = maxDuration(MaxGCInterval, interval)
+	backoffInterval.MaxElapsedTime = maxElapsedTime
 
 	nextInterval := interval

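Context for the fix: in the exponential backoff package used by this code (it appears to be github.com/cenkalti/backoff/v4), MaxInterval caps how long any single wait between retries can grow, while MaxElapsedTime is a total retry budget after which NextBackOff() returns backoff.Stop; a value of 0 disables that budget. The old code assigned the interval cap to MaxElapsedTime, so the backoff gave up instead of plateauing. A minimal standalone sketch of the corrected usage, not taken from this PR (the intervals and loop are illustrative only):

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

func main() {
	b := backoff.NewExponentialBackOff()
	b.InitialInterval = 100 * time.Millisecond
	b.MaxInterval = time.Minute // cap on the delay between consecutive retries
	b.MaxElapsedTime = 0        // 0 means retry forever; a non-zero value is a total give-up budget
	b.Reset()                   // reset after changing parameters

	for i := 0; i < 5; i++ {
		d := b.NextBackOff()
		if d == backoff.Stop { // only returned once a non-zero MaxElapsedTime is exhausted
			fmt.Println("backoff gave up")
			return
		}
		fmt.Printf("retry %d after %v\n", i+1, d)
	}
}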
78 changes: 78 additions & 0 deletions internal/datastore/common/gc_test.go
@@ -0,0 +1,78 @@
package common

import (
	"context"
	"fmt"
	"testing"
	"time"

	"github.com/authzed/spicedb/pkg/datastore"

	"github.com/prometheus/client_golang/prometheus"
	promclient "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
)

type testGC struct{}

func (t testGC) ReadyState(_ context.Context) (datastore.ReadyState, error) {
	return datastore.ReadyState{}, fmt.Errorf("hi")
}

func (t testGC) Now(_ context.Context) (time.Time, error) {
	return time.Now(), fmt.Errorf("hi")
}

func (t testGC) TxIDBefore(_ context.Context, _ time.Time) (datastore.Revision, error) {
	return nil, fmt.Errorf("hi")
}

func (t testGC) DeleteBeforeTx(_ context.Context, _ datastore.Revision) (DeletionCounts, error) {
	return DeletionCounts{}, fmt.Errorf("hi")
}

func TestGCFailureBackoff(t *testing.T) {
	defer func() {
		gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
	}()
	reg := prometheus.NewRegistry()
	require.NoError(t, reg.Register(gcFailureCounter))

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		require.Error(t, startGarbageCollectorWithMaxElapsedTime(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Nanosecond, 1*time.Minute))
	}()
	time.Sleep(200 * time.Millisecond)
	cancel()

	metrics, err := reg.Gather()
	require.NoError(t, err)
	var mf *promclient.MetricFamily
	for _, metric := range metrics {
		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
			mf = metric
		}
	}
	require.Greater(t, *(mf.GetMetric()[0].Counter.Value), 100.0, "MaxElapsedTime=1ns did not cause backoff to get ignored")

	gcFailureCounter = prometheus.NewCounter(gcFailureCounterConfig)
	reg = prometheus.NewRegistry()
	require.NoError(t, reg.Register(gcFailureCounter))
	ctx, cancel = context.WithCancel(context.Background())
	defer cancel()
	go func() {
		require.Error(t, StartGarbageCollector(ctx, testGC{}, 100*time.Millisecond, 1*time.Second, 1*time.Minute))
	}()
	time.Sleep(200 * time.Millisecond)
	cancel()

	metrics, err = reg.Gather()
	require.NoError(t, err)
	for _, metric := range metrics {
		if metric.GetName() == "spicedb_datastore_gc_failure_total" {
			mf = metric
		}
	}
	require.Less(t, *(mf.GetMetric()[0].Counter.Value), 3.0, "MaxElapsedTime=0 should have not caused backoff to get ignored")
}
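As an aside, not part of this PR: when only the current value of a single-metric collector is needed, it can also be read with the testutil helper from prometheus/client_golang instead of registering it and calling Gather. A hypothetical condensed check, assuming the package-level gcFailureCounter shown above:

package common

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

// Hypothetical sketch, not from the PR: testutil.ToFloat64 reads the current
// value of a collector that exposes exactly one metric.
func TestReadGCFailureCounterDirectly(t *testing.T) {
	require.GreaterOrEqual(t, testutil.ToFloat64(gcFailureCounter), 0.0)
}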
2 changes: 2 additions & 0 deletions internal/services/health/health.go
@@ -63,6 +63,8 @@ func (hm *healthManager) Checker(ctx context.Context) func() error {
 	return func() error {
 		// Run immediately for the initial check
 		backoffInterval := backoff.NewExponentialBackOff()
+		backoffInterval.MaxElapsedTime = 0
+
 		ticker := time.After(0)
 
 		for {
3 changes: 2 additions & 1 deletion internal/telemetry/reporter.go
@@ -175,7 +175,8 @@ func RemoteReporter(
 
 	backoffInterval := backoff.NewExponentialBackOff()
 	backoffInterval.InitialInterval = interval
-	backoffInterval.MaxElapsedTime = MaxElapsedTimeBetweenReports
+	backoffInterval.MaxInterval = MaxElapsedTimeBetweenReports
+	backoffInterval.MaxElapsedTime = 0
 
 	// Must reset the backoff object after changing parameters
 	backoffInterval.Reset()