From 5b1a96b51a09f33169402069312608a733ea7d4b Mon Sep 17 00:00:00 2001 From: Zeyad Gouda Date: Mon, 7 Oct 2024 11:56:43 +0300 Subject: [PATCH] WIP --- client/clientutil/snapinfo_test.go | 1 + client/packages.go | 2 + client/packages_test.go | 10 + daemon/api_snaps_test.go | 32 ++++ daemon/snap.go | 1 + overlord/snapstate/autorefresh.go | 116 +++++++++++ overlord/snapstate/snapmgr.go | 12 +- overlord/snapstate/snapstate.go | 4 + overlord/snapstate/snapstate_test.go | 104 ++++++++++ overlord/snapstate/snapstate_update_test.go | 181 ++++++++++++++++++ snap/info.go | 10 + .../check_auto_refresh_count.sh | 1 + tests/main/auto-refresh-backoff/task.yaml | 30 +-- 13 files changed, 488 insertions(+), 16 deletions(-) diff --git a/client/clientutil/snapinfo_test.go b/client/clientutil/snapinfo_test.go index 96f43c11a43..c42361d9f2a 100644 --- a/client/clientutil/snapinfo_test.go +++ b/client/clientutil/snapinfo_test.go @@ -121,6 +121,7 @@ func (*cmdSuite) TestClientSnapFromSnapInfo(c *C) { "Hold", "GatingHold", "RefreshInhibit", + "RefreshFailures", "Components", } var checker func(string, reflect.Value) diff --git a/client/packages.go b/client/packages.go index d4a9c4cd0f0..8b5d3c2c93b 100644 --- a/client/packages.go +++ b/client/packages.go @@ -91,6 +91,8 @@ type Snap struct { GatingHold *time.Time `json:"gating-hold,omitempty"` // if RefreshInhibit is nil, then there is no pending refresh. RefreshInhibit *SnapRefreshInhibit `json:"refresh-inhibit,omitempty"` + // RefreshFailures tracks information about snap failed refreshes. + RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"` // Components is a list of the snap components Components []Component `json:"components,omitempty"` diff --git a/client/packages_test.go b/client/packages_test.go index bd89e63eabc..d6c248999b2 100644 --- a/client/packages_test.go +++ b/client/packages_test.go @@ -274,6 +274,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) { "private": true, "devmode": true, "trymode": true, + "refresh-failures": { + "to-revision": 43, + "failure-count": 5, + "last-failure-time": "2024-10-06T21:31:05Z" + }, "screenshots": [ {"url":"http://example.com/shot1.png", "width":640, "height":480}, {"url":"http://example.com/shot2.png"} @@ -356,6 +361,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) { Website: "http://example.com/funky", StoreURL: "https://snapcraft.io/chatroom", RefreshInhibit: expectedSnapRefreshInhibit, + RefreshFailures: &snap.RefreshFailuresInfo{ + ToRevision: snap.R(43), + FailureCount: 5, + LastFailureTime: time.Date(2024, 10, 6, 21, 31, 5, 0, time.UTC), + }, }) } diff --git a/daemon/api_snaps_test.go b/daemon/api_snaps_test.go index 3ac987ab0e1..2a8ef4c68ed 100644 --- a/daemon/api_snaps_test.go +++ b/daemon/api_snaps_test.go @@ -1578,6 +1578,38 @@ func (s *snapsSuite) TestSnapManyInfosSelectRefreshInhibited(c *check.C) { } } +func (s *snapsSuite) TestSnapInfoReturnsRefreshFailures(c *check.C) { + s.expectSnapsNameReadAccess() + d := s.daemon(c) + s.mkInstalledInState(c, d, "foo", "bar", "v0", snap.R(5), true, "") + + expectedRefreshFailures := &snap.RefreshFailuresInfo{ + ToRevision: snap.R(6), + FailureCount: 4, + LastFailureTime: time.Date(2024, time.October, 10, 21, 22, 11, 0, time.UTC), + } + + st := d.Overlord().State() + st.Lock() + var snapst snapstate.SnapState + // Update snap state with RefreshFailure. + c.Assert(snapstate.Get(st, "foo", &snapst), check.IsNil) + snapst.RefreshFailures = expectedRefreshFailures + snapstate.Set(st, "foo", &snapst) + st.Unlock() + + req, err := http.NewRequest("GET", "/v2/snaps/foo", nil) + c.Assert(err, check.IsNil) + + rsp := s.syncReq(c, req, nil) + + c.Assert(rsp.Result, check.FitsTypeOf, &client.Snap{}) + snapInfo := rsp.Result.(*client.Snap) + c.Assert(snapInfo.RefreshFailures, check.NotNil) + + c.Check(snapInfo.RefreshFailures, check.DeepEquals, expectedRefreshFailures) +} + func (s *snapsSuite) TestMapLocalFields(c *check.C) { media := snap.MediaInfos{ { diff --git a/daemon/snap.go b/daemon/snap.go index 3f2cd589bb7..91404bb9904 100644 --- a/daemon/snap.go +++ b/daemon/snap.go @@ -256,6 +256,7 @@ func mapLocal(about aboutSnap, sd clientutil.StatusDecorator) *client.Snap { result.DevMode = snapst.DevMode result.TryMode = snapst.TryMode result.JailMode = snapst.JailMode + result.RefreshFailures = snapst.RefreshFailures result.MountedFrom = localSnap.MountFile() if result.TryMode { // Readlink instead of EvalSymlinks because it's only expected diff --git a/overlord/snapstate/autorefresh.go b/overlord/snapstate/autorefresh.go index 718d2124961..71704905b75 100644 --- a/overlord/snapstate/autorefresh.go +++ b/overlord/snapstate/autorefresh.go @@ -974,3 +974,119 @@ func MockRefreshCandidate(snapSetup *SnapSetup) interface{} { SnapSetup: *snapSetup, } } + +func processFailedAutoRefresh(chg *state.Change, _ state.Status, new state.Status) { + if chg.Kind() != "auto-refresh" || new != state.ErrorStatus { + return + } + + var failedSnapNames []string + for _, t := range chg.Tasks() { + // We only care about snaps that failed after unlink-current-snap because + // this indicates (with high probability) that something related to the snap + // itself is broken. + if t.Kind() != "unlink-current-snap" || t.Status() != state.UndoneStatus { + continue + } + + snapsup, snapst, err := snapSetupAndState(t) + if err != nil { + logger.Debugf("internal error: failed to get snap associated with task %s: %v", t.ID(), err) + continue + } + + // Update refresh failure information for snap revision + if snapst.RefreshFailures == nil { + // Initialize RefreshFailures + snapst.RefreshFailures = &snap.RefreshFailuresInfo{FailureCount: 1} + } + if snapst.RefreshFailures.ToRevision == snapsup.Revision() { + snapst.RefreshFailures.FailureCount++ + } else { + snapst.RefreshFailures.ToRevision = snapsup.Revision() + } + snapst.RefreshFailures.LastFailureTime = timeNow() + Set(t.State(), snapsup.InstanceName(), snapst) + + delay := computeSnapRefreshDelay(snapst.RefreshFailures) + logger.Noticef("snap %q auto-refresh to revision %s has failed, next auto-refresh attempt will be in %v hours", snapsup.InstanceName(), snapsup.Revision(), delay.Hours()) + failedSnapNames = append(failedSnapNames, snapsup.InstanceName()) + } + + if len(failedSnapNames) == 0 { + return + } + + // Attach failed snaps to change api data. This intended to guide + // agents on devices that manage their own refresh cycle. + var data map[string]interface{} + err := chg.Get("api-data", &data) + if err != nil && !errors.Is(err, state.ErrNoState) { + logger.Debugf("internal error: failed to get api-data for change %s: %v", chg.ID(), err) + return + } + if len(data) == 0 { + data = make(map[string]interface{}) + } + // XXX: Should this be set regardless the refresh failed before/after unlink-current-snap? + data["refresh-failed"] = failedSnapNames + chg.Set("api-data", data) +} + +// snapRefreshDelay maps from failure count to time to snap refresh delay capped at 2 weeks. +// +// Note: Those are heuristic values listed in the SD183 spec. +var snapRefreshDelay = []time.Duration{ + 0, // FailureCount == 0 -> No delay + 8 * time.Hour, // FailureCount == 1 -> 8 hours delay + 12 * time.Hour, // FailureCount == 2 -> 12 hours delay + 24 * time.Hour, // FailureCount == 3 -> 1 day delay + 2 * 24 * time.Hour, // FailureCount == 4 -> 2 days delay + 4 * 24 * time.Hour, // FailureCount == 5 -> 4 days delay + 7 * 24 * time.Hour, // FailureCount == 6 -> 1 week delay + 1.5 * 7 * 24 * time.Hour, // FailureCount == 7 -> 1.5 weeks delay + 2 * 7 * 24 * time.Hour, // FailureCount == 8 -> 2 weeks delay +} + +func computeSnapRefreshDelay(refreshFailures *snap.RefreshFailuresInfo) time.Duration { + if refreshFailures == nil { + return 0 + } + failureCount := refreshFailures.FailureCount + if failureCount > len(snapRefreshDelay)-1 { + // Cap failure count to max delay according to snapRefreshDelay + failureCount = len(snapRefreshDelay) - 1 + } + return snapRefreshDelay[failureCount] +} + +// shouldSkipSnapRefresh checks if a snap refresh to a target revision should be skipped or not. +// +// This helper implements a backoff algorithm that prevents failed upgrade loops +// by introducing backoff delay based on snapst.RefreshFailures and snapRefreshDelay. +func shouldSkipSnapRefresh(st *state.State, snapst *SnapState, targetRevision snap.Revision, opts Options) bool { + if !opts.Flags.IsAutoRefresh || snapst.RefreshFailures == nil { + // Don't skip if not an auto-refresh or if snap has no history of failed refreshes. + return false + } + + if snapst.RefreshFailures.ToRevision != targetRevision { + // Snap has new revision not known to fail, let's reset RefreshFailures + // and continue refresh. + snapst.RefreshFailures = nil + Set(st, snapst.InstanceName(), snapst) + return false + } + + // Here we are certain that the attempted target revision refresh is known to fail. + // Let's compute delay according to RefreshFailures. + delay := computeSnapRefreshDelay(snapst.RefreshFailures) + lastFailureTime := snapst.RefreshFailures.LastFailureTime + // TODO: implement more aggressive backoff for snaps that failed after reboot + if lastFailureTime.Add(delay).After(timeNow()) { + // Backoff delay duration since last failure has passed, let's continue refresh + return true + } + + return false +} diff --git a/overlord/snapstate/snapmgr.go b/overlord/snapstate/snapmgr.go index 7b5719f3e6d..13efae7a2e4 100644 --- a/overlord/snapstate/snapmgr.go +++ b/overlord/snapstate/snapmgr.go @@ -376,6 +376,9 @@ type SnapState struct { // their security profiles set up but are not active. // It is managed by ifacestate. PendingSecurity *PendingSecurityState `json:"pending-security,omitempty"` + + // RefreshFailures tracks information about snap failed refreshes. + RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"` } // PendingSecurityState holds information about snaps that have @@ -875,9 +878,12 @@ func (m *SnapManager) StartUp() error { return fmt.Errorf("failed to generate cookies: %q", err) } - // register handler that records a refresh-inhibit notice when - // the set of inhibited snaps is changed. - m.changeCallbackID = m.state.AddChangeStatusChangedHandler(processInhibitedAutoRefresh) + m.changeCallbackID = m.state.AddChangeStatusChangedHandler(func(chg *state.Change, old, new state.Status) { + // This handler records a refresh-inhibit notice when the set of inhibited snaps is changed. + processInhibitedAutoRefresh(chg, old, new) + // This handler implements marks failed snaps auto-refresh attempts for backoff. + processFailedAutoRefresh(chg, old, new) + }) if CheckExpectedRestart(m.state) == ErrUnexpectedRuntimeRestart { logger.Noticef("detected a restart at runtime without a corresponding snapd change") diff --git a/overlord/snapstate/snapstate.go b/overlord/snapstate/snapstate.go index 6a9ada9b7d6..0df9c687e00 100644 --- a/overlord/snapstate/snapstate.go +++ b/overlord/snapstate/snapstate.go @@ -2116,6 +2116,10 @@ func doUpdate(st *state.State, requested []string, updates []update, opts Option continue } + if shouldSkipSnapRefresh(st, &up.SnapState, up.Setup.Revision(), opts) { + continue + } + // if any snaps actually get a revision change, we need to do a // re-refresh check needsRerefreshCheck = true diff --git a/overlord/snapstate/snapstate_test.go b/overlord/snapstate/snapstate_test.go index e8e95ebd2e7..9d0d071edc0 100644 --- a/overlord/snapstate/snapstate_test.go +++ b/overlord/snapstate/snapstate_test.go @@ -173,6 +173,7 @@ func (s *snapmgrBaseTest) SetUpTest(c *C) { fakeBackend: s.fakeBackend, state: s.state, downloadError: make(map[string]error), + refreshRevnos: make(map[string]snap.Revision), } // make tests work consistently also in containers @@ -9343,6 +9344,109 @@ func (s *snapmgrTestSuite) TestSaveRefreshCandidatesOnAutoRefresh(c *C) { c.Check(cands["some-other-snap"], NotNil) } +func (s *snapmgrTestSuite) TestBackoffOnAutoRefresh(c *C) { + s.state.Lock() + defer s.state.Unlock() + + badRevison := snap.R(12) + badSnapst := &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: "app", + RefreshFailures: &snap.RefreshFailuresInfo{ + ToRevision: badRevison, + FailureCount: 1, + LastFailureTime: time.Now(), + }, + } + snapstate.Set(s.state, "some-snap", badSnapst) + snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: "app", + }) + + s.fakeStore.refreshRevnos["some-snap-id"] = badRevison + names, tss, err := snapstate.AutoRefresh(context.Background(), s.state) + c.Assert(err, IsNil) + c.Assert(tss, NotNil) + // some-snap auto-refresh skipped + c.Check(names, DeepEquals, []string{"some-other-snap"}) + + // Failure delay is capped at 2 weeks + badSnapst.RefreshFailures.FailureCount = 100 + badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2*7*24 - 1) * time.Hour) + snapstate.Set(s.state, "some-snap", badSnapst) + names, _, err = snapstate.AutoRefresh(context.Background(), s.state) + c.Assert(err, IsNil) + // some-snap auto-refresh skipped + c.Check(names, DeepEquals, []string{"some-other-snap"}) + // But backoff delay is capped at two weeks + badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2 * 7 * 24) * time.Hour) + snapstate.Set(s.state, "some-snap", badSnapst) + names, _, err = snapstate.AutoRefresh(context.Background(), s.state) + c.Assert(err, IsNil) + c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"}) +} + +func (s *snapmgrTestSuite) TestBackoffOnAutoRefreshWithNewRevision(c *C) { + s.state.Lock() + defer s.state.Unlock() + + badRevison := snap.R(12) + badSnapst := &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: "app", + RefreshFailures: &snap.RefreshFailuresInfo{ + ToRevision: badRevison, + FailureCount: 3, + LastFailureTime: time.Now(), + }, + } + snapstate.Set(s.state, "some-snap", badSnapst) + snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: "app", + }) + + s.fakeStore.refreshRevnos["some-snap-id"] = badRevison + names, tss, err := snapstate.AutoRefresh(context.Background(), s.state) + c.Assert(err, IsNil) + c.Assert(tss, NotNil) + // some-snap auto-refresh skipped + c.Check(names, DeepEquals, []string{"some-other-snap"}) + + // Check that new revision resets RefreshFailures + s.fakeStore.refreshRevnos["some-snap-id"] = snap.R(13) + // First make sure RefreshFailures is not nil + var snapst snapstate.SnapState + snapstate.Get(s.state, "some-snap", &snapst) + c.Assert(snapst.RefreshFailures.FailureCount, Equals, 3) + // Trigger new auto-refresh (with new revision from fakestore) + names, tss, err = snapstate.AutoRefresh(context.Background(), s.state) + c.Assert(err, IsNil) + c.Assert(tss, NotNil) + // some-snap auto-refresh not skipped + c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"}) + // And RefreshFailures is reset + snapstate.Get(s.state, "some-snap", &snapst) + c.Assert(snapst.RefreshFailures, IsNil) +} + type customStore struct { *fakeStore diff --git a/overlord/snapstate/snapstate_update_test.go b/overlord/snapstate/snapstate_update_test.go index b8bb1c2877a..51cc2d655f2 100644 --- a/overlord/snapstate/snapstate_update_test.go +++ b/overlord/snapstate/snapstate_update_test.go @@ -10693,6 +10693,187 @@ func (s *snapmgrTestSuite) TestAutoRefreshCreatePreDownload(c *C) { c.Assert(tasks[0].Summary(), testutil.Contains, "Pre-download snap \"some-snap\" (2) from channel") } +func (s *snapmgrTestSuite) TestAutoRefreshRecordsFailures(c *C) { + // Setup test snaps + s.state.Lock() + defer s.state.Unlock() + snapstate.Set(s.state, "some-snap", &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: string(snap.TypeApp), + }) + snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{ + Active: true, + Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{ + {RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)}, + }), + Current: snap.R(1), + SnapType: string(snap.TypeApp), + }) + + snapstate.CanAutoRefresh = func(*state.State) (bool, error) { return true, nil } + + buf, restore := logger.MockLogger() + defer restore() + + badSnapRevision := snap.R(12) + s.fakeBackend.linkSnapFailTrigger = filepath.Join(dirs.SnapMountDir, "/some-snap/"+badSnapRevision.String()) + goodSnapRevision := snap.R(12) + // Specify retrieved revisions for snaps from fakestore + s.fakeStore.refreshRevnos["some-snap-id"] = badSnapRevision + s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision + + getRefreshFailures := func(name string) *snap.RefreshFailuresInfo { + var snapst snapstate.SnapState + c.Assert(snapstate.Get(s.state, name, &snapst), IsNil) + return snapst.RefreshFailures + } + getRefreshFailed := func(chg *state.Change) []interface{} { + var apiData map[string]interface{} + c.Assert(chg.Get("api-data", &apiData), IsNil) + if apiData["refresh-failed"] == nil { + return nil + } + refreshFailed := apiData["refresh-failed"].([]interface{}) + sort.Slice(refreshFailed, func(i, j int) bool { + return refreshFailed[i].(string) < refreshFailed[j].(string) + }) + return refreshFailed + } + getRevision := func(name string) snap.Revision { + var snapst snapstate.SnapState + c.Assert(snapstate.Get(s.state, name, &snapst), IsNil) + return snapst.Current + } + + forceAutoRefresh := func() { + s.state.Unlock() + // Fake nextRefresh to now. + s.snapmgr.MockNextRefresh(time.Now()) + // Fake that the retryRefreshDelay is over + restore := snapstate.MockRefreshRetryDelay(1 * time.Millisecond) + defer restore() + time.Sleep(10 * time.Millisecond) + // Trigger autorefresh.Ensure(). + err := s.snapmgr.Ensure() + if errors.Is(err, advisor.ErrNotSupported) { + c.Skip("bolt is not supported") + } + c.Assert(err, IsNil) + s.state.Lock() + s.settle(c) + } + + // Auto-refresh errors are not recorded until after unlink-current-snap as + // this indicates (with high probability) a real issue with the snap revision. + s.fakeBackend.maybeInjectErr = func(fo *fakeOp) error { + if fo.op == "unlink-snap" && strings.Contains(fo.path, "some-snap") { + return errors.New("error exactly at unlink-current-snap") + } + return nil + } + buf.Reset() + forceAutoRefresh() + chgs := s.state.Changes() + c.Assert(chgs, HasLen, 1) + c.Check(chgs[0].Kind(), Equals, "auto-refresh") + c.Check(chgs[0].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make current revision for snap \"some-snap\" unavailable \\(error exactly at unlink-current-snap\\)") + c.Check(chgs[0].Status(), Equals, state.ErrorStatus) + c.Check(buf.String(), Not(testutil.Contains), `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`) + c.Check(getRefreshFailed(chgs[0]), IsNil) + c.Check(getRefreshFailures("some-snap"), IsNil) // Refresh failures only count after unlink-current-snap + c.Check(getRevision("some-snap"), Equals, snap.R(1)) + c.Check(getRefreshFailures("some-other-snap"), IsNil) + c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision) + + // Auto-refresh errors are recorded after unlink-current-snap + goodSnapRevision.N++ + s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision + s.fakeBackend.maybeInjectErr = nil + buf.Reset() + forceAutoRefresh() + chgs = s.state.Changes() + sort.Slice(chgs, func(i, j int) bool { + return chgs[i].SpawnTime().Before(chgs[j].SpawnTime()) + }) + c.Assert(chgs, HasLen, 2) + c.Check(chgs[1].Kind(), Equals, "auto-refresh") + c.Check(chgs[1].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make snap \"some-snap\" \\(12\\) available to the system \\(fail\\)") + c.Check(chgs[1].Status(), Equals, state.ErrorStatus) + c.Check(buf.String(), testutil.Contains, `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`) + c.Check(getRefreshFailed(chgs[1]), DeepEquals, []interface{}{"some-snap"}) + c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision) + c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 1) + c.Check(getRevision("some-snap"), Equals, snap.R(1)) + c.Check(getRefreshFailures("some-other-snap"), IsNil) + c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision) + + // Bad snap refresh is skipped due to backoff delay, but other refresh continue normally + goodSnapRevision.N++ + s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision + buf.Reset() + forceAutoRefresh() + chgs = s.state.Changes() + sort.Slice(chgs, func(i, j int) bool { + return chgs[i].SpawnTime().Before(chgs[j].SpawnTime()) + }) + c.Assert(chgs, HasLen, 3) + c.Check(chgs[2].Kind(), Equals, "auto-refresh") + c.Check(chgs[2].Err(), IsNil) + c.Check(chgs[2].Status(), Equals, state.DoneStatus) + c.Check(buf.String(), Not(testutil.Contains), `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`) + c.Check(getRefreshFailed(chgs[2]), IsNil) + c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision) + c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 1) // Stays at 1 + c.Check(getRevision("some-snap"), Equals, snap.R(1)) + c.Check(getRefreshFailures("some-other-snap"), IsNil) + c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision) + + // Make sure backoff delay is capped to a reasonable value (~2 weeks) + var snapst snapstate.SnapState + c.Assert(snapstate.Get(s.state, "some-snap", &snapst), IsNil) + snapst.RefreshFailures = &snap.RefreshFailuresInfo{ + ToRevision: badSnapRevision, + FailureCount: 100, + LastFailureTime: time.Now().Add(-2 * 7 * 24 * time.Hour), + } + snapstate.Set(s.state, "some-snap", &snapst) + buf.Reset() + forceAutoRefresh() + chgs = s.state.Changes() + sort.Slice(chgs, func(i, j int) bool { + return chgs[i].SpawnTime().Before(chgs[j].SpawnTime()) + }) + c.Assert(chgs, HasLen, 4) + c.Check(chgs[3].Kind(), Equals, "auto-refresh") + c.Check(chgs[3].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make snap \"some-snap\" \\(12\\) available to the system \\(fail\\)") + c.Check(chgs[3].Status(), Equals, state.ErrorStatus) + c.Check(buf.String(), testutil.Contains, `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 336 hours`) + c.Check(getRefreshFailed(chgs[3]), DeepEquals, []interface{}{"some-snap"}) + c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision) + c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 101) // Backoff delay passed, increment for new failure + c.Check(getRevision("some-snap"), Equals, snap.R(1)) + + // New revision resets RefreshFailures + s.fakeStore.refreshRevnos["some-snap-id"] = snap.R(13) + buf.Reset() + forceAutoRefresh() + chgs = s.state.Changes() + sort.Slice(chgs, func(i, j int) bool { + return chgs[i].SpawnTime().Before(chgs[j].SpawnTime()) + }) + c.Assert(chgs, HasLen, 5) + c.Check(chgs[4].Kind(), Equals, "auto-refresh") + c.Check(chgs[4].Err(), IsNil) + c.Check(chgs[4].Status(), Equals, state.DoneStatus) + c.Check(getRefreshFailed(chgs[4]), IsNil) + c.Check(getRefreshFailures("some-snap"), IsNil) + c.Check(getRevision("some-snap"), Equals, snap.R(13)) +} + func (s *snapmgrTestSuite) testAutoRefreshRefreshInhibitNoticeRecorded(c *C, markerInterfaceConnected bool, warningFallback bool) { refreshAppsCheckCalled := 0 restore := snapstate.MockRefreshAppsCheck(func(si *snap.Info) error { diff --git a/snap/info.go b/snap/info.go index 917fb5014d4..d7255781be2 100644 --- a/snap/info.go +++ b/snap/info.go @@ -2099,3 +2099,13 @@ func RegistryPlugAttrs(plug *PlugInfo) (account, registry, view string, err erro return account, registry, view, nil } + +// RefreshFailures holds information about snap failed refreshes. +type RefreshFailuresInfo struct { + // ToRevision is the target revision that caused the refresh failure. + ToRevision Revision `json:"to-revision,omitempty"` + // FailureCount is number of failed refresh attempts for the revision. + FailureCount int `json:"failure-count,omitempty"` + // LastFailureTime is the time of the last failed refresh attempt for the revision. + LastFailureTime time.Time `json:"last-failure-time"` +} diff --git a/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh b/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh index 4bf51b3ecf9..73ab37f07be 100755 --- a/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh +++ b/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh @@ -3,4 +3,5 @@ LAST_CHANGE_ID=$1 CHANGES_COUNT=$2 +#shellcheck disable=SC2086,SC2046 test $(snap debug api /v2/changes?select=ready | jq "[.result[] | select(.kind == \"auto-refresh\" and (.id|tonumber) > ($LAST_CHANGE_ID|tonumber))] | length") == $CHANGES_COUNT diff --git a/tests/main/auto-refresh-backoff/task.yaml b/tests/main/auto-refresh-backoff/task.yaml index 55b2874cf59..12723466201 100644 --- a/tests/main/auto-refresh-backoff/task.yaml +++ b/tests/main/auto-refresh-backoff/task.yaml @@ -19,6 +19,7 @@ prepare: | fi # Needed by make-snap-installable, Install before switching to fakestore + snap install jq snap install remarshal # Install snaps as baseline since we want to test what happens in refreshes not installs @@ -121,7 +122,7 @@ execute: | # -------- SECOND AUTO REFRESH -------- # Clean old SNAP_TWO snap in fakestore directory and keep bad SNAP_ONE - rm "$BLOB_DIR"/"$SNAP_TWO_ID-rev-11".snap + rm "$BLOB_DIR/$SNAP_TWO_ID-rev-11.snap" echo "Add new revision for $SNAP_TWO while keeping bad revision of $SNAP_ONE" add_snap_to_fakestore "$SNAP_TWO_GOOD_PATH" "$SNAP_TWO_ID" 22 @@ -160,20 +161,23 @@ execute: | snap debug api /v2/changes?select=ready | jq "[.result[] | select(.kind == \"auto-refresh\" and (.id|tonumber) > ($LAST_CHANGE_ID|tonumber))] | sort_by(.id)" > changes.json # 1st auto-refresh - jq '.[0].status' changes.json | MATCH "Error" - jq '.[0].data."snap-names"' changes.json | MATCH "$SNAP_ONE" - jq '.[0].data."snap-names"' changes.json | MATCH "$SNAP_TWO" + jq '.[0].status' < changes.json | MATCH "Error" + jq '.[0].data."snap-names"' < changes.json | MATCH "$SNAP_ONE" + jq '.[0].data."snap-names"' < changes.json | MATCH "$SNAP_TWO" + jq '.[0].data."refresh-failed"' < changes.json | MATCH "$SNAP_ONE" + jq '.[0].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO" # 2nd auto-refresh - jq '.[1].status' changes.json | MATCH "Done" + jq '.[1].status' < changes.json | MATCH "Done" # Broken SNAP_ONE should have been skipped this time - jq '.[1].data."snap-names"' changes.json | NOMATCH "$SNAP_ONE" - jq '.[1].data."snap-names"' changes.json | MATCH "$SNAP_TWO" + jq '.[1].data."snap-names"' < changes.json | NOMATCH "$SNAP_ONE" + jq '.[1].data."snap-names"' < changes.json | MATCH "$SNAP_TWO" + jq '.[1].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_ONE" + jq '.[1].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO" # 3rd auto-refresh - jq '.[2].status' changes.json | MATCH "Done" - jq '.[2].data."snap-names"' changes.json | MATCH "$SNAP_ONE" - jq '.[2].data."snap-names"' changes.json | MATCH "$SNAP_TWO" - - # TODO: check attached metadata to api about failed refreshes on /v2/snaps and /v2/changes - # TODO: check emitted logs about backoff delays + jq '.[2].status' < changes.json | MATCH "Done" + jq '.[2].data."snap-names"' < changes.json | MATCH "$SNAP_ONE" + jq '.[2].data."snap-names"' < changes.json | MATCH "$SNAP_TWO" + jq '.[2].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_ONE" + jq '.[2].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO"