From 5b1a96b51a09f33169402069312608a733ea7d4b Mon Sep 17 00:00:00 2001
From: Zeyad Gouda <zeyad.gouda@canonical.com>
Date: Mon, 7 Oct 2024 11:56:43 +0300
Subject: [PATCH] WIP

---
 client/clientutil/snapinfo_test.go            |   1 +
 client/packages.go                            |   2 +
 client/packages_test.go                       |  10 +
 daemon/api_snaps_test.go                      |  32 ++++
 daemon/snap.go                                |   1 +
 overlord/snapstate/autorefresh.go             | 116 +++++++++++
 overlord/snapstate/snapmgr.go                 |  12 +-
 overlord/snapstate/snapstate.go               |   4 +
 overlord/snapstate/snapstate_test.go          | 104 ++++++++++
 overlord/snapstate/snapstate_update_test.go   | 181 ++++++++++++++++++
 snap/info.go                                  |  10 +
 .../check_auto_refresh_count.sh               |   1 +
 tests/main/auto-refresh-backoff/task.yaml     |  30 +--
 13 files changed, 488 insertions(+), 16 deletions(-)

diff --git a/client/clientutil/snapinfo_test.go b/client/clientutil/snapinfo_test.go
index 96f43c11a43..c42361d9f2a 100644
--- a/client/clientutil/snapinfo_test.go
+++ b/client/clientutil/snapinfo_test.go
@@ -121,6 +121,7 @@ func (*cmdSuite) TestClientSnapFromSnapInfo(c *C) {
 		"Hold",
 		"GatingHold",
 		"RefreshInhibit",
+		"RefreshFailures",
 		"Components",
 	}
 	var checker func(string, reflect.Value)
diff --git a/client/packages.go b/client/packages.go
index d4a9c4cd0f0..8b5d3c2c93b 100644
--- a/client/packages.go
+++ b/client/packages.go
@@ -91,6 +91,8 @@ type Snap struct {
 	GatingHold *time.Time `json:"gating-hold,omitempty"`
 	// if RefreshInhibit is nil, then there is no pending refresh.
 	RefreshInhibit *SnapRefreshInhibit `json:"refresh-inhibit,omitempty"`
+	// RefreshFailures tracks information about snap failed refreshes.
+	RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"`
 
 	// Components is a list of the snap components
 	Components []Component `json:"components,omitempty"`
diff --git a/client/packages_test.go b/client/packages_test.go
index bd89e63eabc..d6c248999b2 100644
--- a/client/packages_test.go
+++ b/client/packages_test.go
@@ -274,6 +274,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) {
 			"private": true,
 			"devmode": true,
 			"trymode": true,
+			"refresh-failures": {
+				"to-revision": 43,
+				"failure-count": 5,
+				"last-failure-time": "2024-10-06T21:31:05Z"
+			},
                         "screenshots": [
                             {"url":"http://example.com/shot1.png", "width":640, "height":480},
                             {"url":"http://example.com/shot2.png"}
@@ -356,6 +361,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) {
 		Website:        "http://example.com/funky",
 		StoreURL:       "https://snapcraft.io/chatroom",
 		RefreshInhibit: expectedSnapRefreshInhibit,
+		RefreshFailures: &snap.RefreshFailuresInfo{
+			ToRevision:      snap.R(43),
+			FailureCount:    5,
+			LastFailureTime: time.Date(2024, 10, 6, 21, 31, 5, 0, time.UTC),
+		},
 	})
 }
 
diff --git a/daemon/api_snaps_test.go b/daemon/api_snaps_test.go
index 3ac987ab0e1..2a8ef4c68ed 100644
--- a/daemon/api_snaps_test.go
+++ b/daemon/api_snaps_test.go
@@ -1578,6 +1578,38 @@ func (s *snapsSuite) TestSnapManyInfosSelectRefreshInhibited(c *check.C) {
 	}
 }
 
+func (s *snapsSuite) TestSnapInfoReturnsRefreshFailures(c *check.C) {
+	s.expectSnapsNameReadAccess()
+	d := s.daemon(c)
+	s.mkInstalledInState(c, d, "foo", "bar", "v0", snap.R(5), true, "")
+
+	expectedRefreshFailures := &snap.RefreshFailuresInfo{
+		ToRevision:      snap.R(6),
+		FailureCount:    4,
+		LastFailureTime: time.Date(2024, time.October, 10, 21, 22, 11, 0, time.UTC),
+	}
+
+	st := d.Overlord().State()
+	st.Lock()
+	var snapst snapstate.SnapState
+	// Update snap state with RefreshFailure.
+	c.Assert(snapstate.Get(st, "foo", &snapst), check.IsNil)
+	snapst.RefreshFailures = expectedRefreshFailures
+	snapstate.Set(st, "foo", &snapst)
+	st.Unlock()
+
+	req, err := http.NewRequest("GET", "/v2/snaps/foo", nil)
+	c.Assert(err, check.IsNil)
+
+	rsp := s.syncReq(c, req, nil)
+
+	c.Assert(rsp.Result, check.FitsTypeOf, &client.Snap{})
+	snapInfo := rsp.Result.(*client.Snap)
+	c.Assert(snapInfo.RefreshFailures, check.NotNil)
+
+	c.Check(snapInfo.RefreshFailures, check.DeepEquals, expectedRefreshFailures)
+}
+
 func (s *snapsSuite) TestMapLocalFields(c *check.C) {
 	media := snap.MediaInfos{
 		{
diff --git a/daemon/snap.go b/daemon/snap.go
index 3f2cd589bb7..91404bb9904 100644
--- a/daemon/snap.go
+++ b/daemon/snap.go
@@ -256,6 +256,7 @@ func mapLocal(about aboutSnap, sd clientutil.StatusDecorator) *client.Snap {
 	result.DevMode = snapst.DevMode
 	result.TryMode = snapst.TryMode
 	result.JailMode = snapst.JailMode
+	result.RefreshFailures = snapst.RefreshFailures
 	result.MountedFrom = localSnap.MountFile()
 	if result.TryMode {
 		// Readlink instead of EvalSymlinks because it's only expected
diff --git a/overlord/snapstate/autorefresh.go b/overlord/snapstate/autorefresh.go
index 718d2124961..71704905b75 100644
--- a/overlord/snapstate/autorefresh.go
+++ b/overlord/snapstate/autorefresh.go
@@ -974,3 +974,119 @@ func MockRefreshCandidate(snapSetup *SnapSetup) interface{} {
 		SnapSetup: *snapSetup,
 	}
 }
+
+func processFailedAutoRefresh(chg *state.Change, _ state.Status, new state.Status) {
+	if chg.Kind() != "auto-refresh" || new != state.ErrorStatus {
+		return
+	}
+
+	var failedSnapNames []string
+	for _, t := range chg.Tasks() {
+		// We only care about snaps that failed after unlink-current-snap because
+		// this indicates (with high probability) that something related to the snap
+		// itself is broken.
+		if t.Kind() != "unlink-current-snap" || t.Status() != state.UndoneStatus {
+			continue
+		}
+
+		snapsup, snapst, err := snapSetupAndState(t)
+		if err != nil {
+			logger.Debugf("internal error: failed to get snap associated with task %s: %v", t.ID(), err)
+			continue
+		}
+
+		// Update refresh failure information for snap revision
+		if snapst.RefreshFailures == nil {
+			// Initialize RefreshFailures
+			snapst.RefreshFailures = &snap.RefreshFailuresInfo{FailureCount: 1}
+		}
+		if snapst.RefreshFailures.ToRevision == snapsup.Revision() {
+			snapst.RefreshFailures.FailureCount++
+		} else {
+			snapst.RefreshFailures.ToRevision = snapsup.Revision()
+		}
+		snapst.RefreshFailures.LastFailureTime = timeNow()
+		Set(t.State(), snapsup.InstanceName(), snapst)
+
+		delay := computeSnapRefreshDelay(snapst.RefreshFailures)
+		logger.Noticef("snap %q auto-refresh to revision %s has failed, next auto-refresh attempt will be in %v hours", snapsup.InstanceName(), snapsup.Revision(), delay.Hours())
+		failedSnapNames = append(failedSnapNames, snapsup.InstanceName())
+	}
+
+	if len(failedSnapNames) == 0 {
+		return
+	}
+
+	// Attach failed snaps to change api data. This intended to guide
+	// agents on devices that manage their own refresh cycle.
+	var data map[string]interface{}
+	err := chg.Get("api-data", &data)
+	if err != nil && !errors.Is(err, state.ErrNoState) {
+		logger.Debugf("internal error: failed to get api-data for change %s: %v", chg.ID(), err)
+		return
+	}
+	if len(data) == 0 {
+		data = make(map[string]interface{})
+	}
+	// XXX: Should this be set regardless the refresh failed before/after unlink-current-snap?
+	data["refresh-failed"] = failedSnapNames
+	chg.Set("api-data", data)
+}
+
+// snapRefreshDelay maps from failure count to time to snap refresh delay capped at 2 weeks.
+//
+// Note: Those are heuristic values listed in the SD183 spec.
+var snapRefreshDelay = []time.Duration{
+	0,                        // FailureCount == 0 -> No delay
+	8 * time.Hour,            // FailureCount == 1 -> 8 hours delay
+	12 * time.Hour,           // FailureCount == 2 -> 12 hours delay
+	24 * time.Hour,           // FailureCount == 3 -> 1 day delay
+	2 * 24 * time.Hour,       // FailureCount == 4 -> 2 days delay
+	4 * 24 * time.Hour,       // FailureCount == 5 -> 4 days delay
+	7 * 24 * time.Hour,       // FailureCount == 6 -> 1 week delay
+	1.5 * 7 * 24 * time.Hour, // FailureCount == 7 -> 1.5 weeks delay
+	2 * 7 * 24 * time.Hour,   // FailureCount == 8 -> 2 weeks delay
+}
+
+func computeSnapRefreshDelay(refreshFailures *snap.RefreshFailuresInfo) time.Duration {
+	if refreshFailures == nil {
+		return 0
+	}
+	failureCount := refreshFailures.FailureCount
+	if failureCount > len(snapRefreshDelay)-1 {
+		// Cap failure count to max delay according to snapRefreshDelay
+		failureCount = len(snapRefreshDelay) - 1
+	}
+	return snapRefreshDelay[failureCount]
+}
+
+// shouldSkipSnapRefresh checks if a snap refresh to a target revision should be skipped or not.
+//
+// This helper implements a backoff algorithm that prevents failed upgrade loops
+// by introducing backoff delay based on snapst.RefreshFailures and snapRefreshDelay.
+func shouldSkipSnapRefresh(st *state.State, snapst *SnapState, targetRevision snap.Revision, opts Options) bool {
+	if !opts.Flags.IsAutoRefresh || snapst.RefreshFailures == nil {
+		// Don't skip if not an auto-refresh or if snap has no history of failed refreshes.
+		return false
+	}
+
+	if snapst.RefreshFailures.ToRevision != targetRevision {
+		// Snap has new revision not known to fail, let's reset RefreshFailures
+		// and continue refresh.
+		snapst.RefreshFailures = nil
+		Set(st, snapst.InstanceName(), snapst)
+		return false
+	}
+
+	// Here we are certain that the attempted target revision refresh is known to fail.
+	// Let's compute delay according to RefreshFailures.
+	delay := computeSnapRefreshDelay(snapst.RefreshFailures)
+	lastFailureTime := snapst.RefreshFailures.LastFailureTime
+	// TODO: implement more aggressive backoff for snaps that failed after reboot
+	if lastFailureTime.Add(delay).After(timeNow()) {
+		// Backoff delay duration since last failure has passed, let's continue refresh
+		return true
+	}
+
+	return false
+}
diff --git a/overlord/snapstate/snapmgr.go b/overlord/snapstate/snapmgr.go
index 7b5719f3e6d..13efae7a2e4 100644
--- a/overlord/snapstate/snapmgr.go
+++ b/overlord/snapstate/snapmgr.go
@@ -376,6 +376,9 @@ type SnapState struct {
 	// their security profiles set up but are not active.
 	// It is managed by ifacestate.
 	PendingSecurity *PendingSecurityState `json:"pending-security,omitempty"`
+
+	// RefreshFailures tracks information about snap failed refreshes.
+	RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"`
 }
 
 // PendingSecurityState holds information about snaps that have
@@ -875,9 +878,12 @@ func (m *SnapManager) StartUp() error {
 		return fmt.Errorf("failed to generate cookies: %q", err)
 	}
 
-	// register handler that records a refresh-inhibit notice when
-	// the set of inhibited snaps is changed.
-	m.changeCallbackID = m.state.AddChangeStatusChangedHandler(processInhibitedAutoRefresh)
+	m.changeCallbackID = m.state.AddChangeStatusChangedHandler(func(chg *state.Change, old, new state.Status) {
+		// This handler records a refresh-inhibit notice when the set of inhibited snaps is changed.
+		processInhibitedAutoRefresh(chg, old, new)
+		// This handler implements marks failed snaps auto-refresh attempts for backoff.
+		processFailedAutoRefresh(chg, old, new)
+	})
 
 	if CheckExpectedRestart(m.state) == ErrUnexpectedRuntimeRestart {
 		logger.Noticef("detected a restart at runtime without a corresponding snapd change")
diff --git a/overlord/snapstate/snapstate.go b/overlord/snapstate/snapstate.go
index 6a9ada9b7d6..0df9c687e00 100644
--- a/overlord/snapstate/snapstate.go
+++ b/overlord/snapstate/snapstate.go
@@ -2116,6 +2116,10 @@ func doUpdate(st *state.State, requested []string, updates []update, opts Option
 			continue
 		}
 
+		if shouldSkipSnapRefresh(st, &up.SnapState, up.Setup.Revision(), opts) {
+			continue
+		}
+
 		// if any snaps actually get a revision change, we need to do a
 		// re-refresh check
 		needsRerefreshCheck = true
diff --git a/overlord/snapstate/snapstate_test.go b/overlord/snapstate/snapstate_test.go
index e8e95ebd2e7..9d0d071edc0 100644
--- a/overlord/snapstate/snapstate_test.go
+++ b/overlord/snapstate/snapstate_test.go
@@ -173,6 +173,7 @@ func (s *snapmgrBaseTest) SetUpTest(c *C) {
 		fakeBackend:         s.fakeBackend,
 		state:               s.state,
 		downloadError:       make(map[string]error),
+		refreshRevnos:       make(map[string]snap.Revision),
 	}
 
 	// make tests work consistently also in containers
@@ -9343,6 +9344,109 @@ func (s *snapmgrTestSuite) TestSaveRefreshCandidatesOnAutoRefresh(c *C) {
 	c.Check(cands["some-other-snap"], NotNil)
 }
 
+func (s *snapmgrTestSuite) TestBackoffOnAutoRefresh(c *C) {
+	s.state.Lock()
+	defer s.state.Unlock()
+
+	badRevison := snap.R(12)
+	badSnapst := &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: "app",
+		RefreshFailures: &snap.RefreshFailuresInfo{
+			ToRevision:      badRevison,
+			FailureCount:    1,
+			LastFailureTime: time.Now(),
+		},
+	}
+	snapstate.Set(s.state, "some-snap", badSnapst)
+	snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: "app",
+	})
+
+	s.fakeStore.refreshRevnos["some-snap-id"] = badRevison
+	names, tss, err := snapstate.AutoRefresh(context.Background(), s.state)
+	c.Assert(err, IsNil)
+	c.Assert(tss, NotNil)
+	// some-snap auto-refresh skipped
+	c.Check(names, DeepEquals, []string{"some-other-snap"})
+
+	// Failure delay is capped at 2 weeks
+	badSnapst.RefreshFailures.FailureCount = 100
+	badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2*7*24 - 1) * time.Hour)
+	snapstate.Set(s.state, "some-snap", badSnapst)
+	names, _, err = snapstate.AutoRefresh(context.Background(), s.state)
+	c.Assert(err, IsNil)
+	// some-snap auto-refresh skipped
+	c.Check(names, DeepEquals, []string{"some-other-snap"})
+	// But backoff delay is capped at two weeks
+	badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2 * 7 * 24) * time.Hour)
+	snapstate.Set(s.state, "some-snap", badSnapst)
+	names, _, err = snapstate.AutoRefresh(context.Background(), s.state)
+	c.Assert(err, IsNil)
+	c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"})
+}
+
+func (s *snapmgrTestSuite) TestBackoffOnAutoRefreshWithNewRevision(c *C) {
+	s.state.Lock()
+	defer s.state.Unlock()
+
+	badRevison := snap.R(12)
+	badSnapst := &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: "app",
+		RefreshFailures: &snap.RefreshFailuresInfo{
+			ToRevision:      badRevison,
+			FailureCount:    3,
+			LastFailureTime: time.Now(),
+		},
+	}
+	snapstate.Set(s.state, "some-snap", badSnapst)
+	snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: "app",
+	})
+
+	s.fakeStore.refreshRevnos["some-snap-id"] = badRevison
+	names, tss, err := snapstate.AutoRefresh(context.Background(), s.state)
+	c.Assert(err, IsNil)
+	c.Assert(tss, NotNil)
+	// some-snap auto-refresh skipped
+	c.Check(names, DeepEquals, []string{"some-other-snap"})
+
+	// Check that new revision resets RefreshFailures
+	s.fakeStore.refreshRevnos["some-snap-id"] = snap.R(13)
+	// First make sure RefreshFailures is not nil
+	var snapst snapstate.SnapState
+	snapstate.Get(s.state, "some-snap", &snapst)
+	c.Assert(snapst.RefreshFailures.FailureCount, Equals, 3)
+	// Trigger new auto-refresh (with new revision from fakestore)
+	names, tss, err = snapstate.AutoRefresh(context.Background(), s.state)
+	c.Assert(err, IsNil)
+	c.Assert(tss, NotNil)
+	// some-snap auto-refresh not skipped
+	c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"})
+	// And RefreshFailures is reset
+	snapstate.Get(s.state, "some-snap", &snapst)
+	c.Assert(snapst.RefreshFailures, IsNil)
+}
+
 type customStore struct {
 	*fakeStore
 
diff --git a/overlord/snapstate/snapstate_update_test.go b/overlord/snapstate/snapstate_update_test.go
index b8bb1c2877a..51cc2d655f2 100644
--- a/overlord/snapstate/snapstate_update_test.go
+++ b/overlord/snapstate/snapstate_update_test.go
@@ -10693,6 +10693,187 @@ func (s *snapmgrTestSuite) TestAutoRefreshCreatePreDownload(c *C) {
 	c.Assert(tasks[0].Summary(), testutil.Contains, "Pre-download snap \"some-snap\" (2) from channel")
 }
 
+func (s *snapmgrTestSuite) TestAutoRefreshRecordsFailures(c *C) {
+	// Setup test snaps
+	s.state.Lock()
+	defer s.state.Unlock()
+	snapstate.Set(s.state, "some-snap", &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: string(snap.TypeApp),
+	})
+	snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{
+		Active: true,
+		Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
+			{RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)},
+		}),
+		Current:  snap.R(1),
+		SnapType: string(snap.TypeApp),
+	})
+
+	snapstate.CanAutoRefresh = func(*state.State) (bool, error) { return true, nil }
+
+	buf, restore := logger.MockLogger()
+	defer restore()
+
+	badSnapRevision := snap.R(12)
+	s.fakeBackend.linkSnapFailTrigger = filepath.Join(dirs.SnapMountDir, "/some-snap/"+badSnapRevision.String())
+	goodSnapRevision := snap.R(12)
+	// Specify retrieved revisions for snaps from fakestore
+	s.fakeStore.refreshRevnos["some-snap-id"] = badSnapRevision
+	s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision
+
+	getRefreshFailures := func(name string) *snap.RefreshFailuresInfo {
+		var snapst snapstate.SnapState
+		c.Assert(snapstate.Get(s.state, name, &snapst), IsNil)
+		return snapst.RefreshFailures
+	}
+	getRefreshFailed := func(chg *state.Change) []interface{} {
+		var apiData map[string]interface{}
+		c.Assert(chg.Get("api-data", &apiData), IsNil)
+		if apiData["refresh-failed"] == nil {
+			return nil
+		}
+		refreshFailed := apiData["refresh-failed"].([]interface{})
+		sort.Slice(refreshFailed, func(i, j int) bool {
+			return refreshFailed[i].(string) < refreshFailed[j].(string)
+		})
+		return refreshFailed
+	}
+	getRevision := func(name string) snap.Revision {
+		var snapst snapstate.SnapState
+		c.Assert(snapstate.Get(s.state, name, &snapst), IsNil)
+		return snapst.Current
+	}
+
+	forceAutoRefresh := func() {
+		s.state.Unlock()
+		// Fake nextRefresh to now.
+		s.snapmgr.MockNextRefresh(time.Now())
+		// Fake that the retryRefreshDelay is over
+		restore := snapstate.MockRefreshRetryDelay(1 * time.Millisecond)
+		defer restore()
+		time.Sleep(10 * time.Millisecond)
+		// Trigger autorefresh.Ensure().
+		err := s.snapmgr.Ensure()
+		if errors.Is(err, advisor.ErrNotSupported) {
+			c.Skip("bolt is not supported")
+		}
+		c.Assert(err, IsNil)
+		s.state.Lock()
+		s.settle(c)
+	}
+
+	// Auto-refresh errors are not recorded until after unlink-current-snap as
+	// this indicates (with high probability) a real issue with the snap revision.
+	s.fakeBackend.maybeInjectErr = func(fo *fakeOp) error {
+		if fo.op == "unlink-snap" && strings.Contains(fo.path, "some-snap") {
+			return errors.New("error exactly at unlink-current-snap")
+		}
+		return nil
+	}
+	buf.Reset()
+	forceAutoRefresh()
+	chgs := s.state.Changes()
+	c.Assert(chgs, HasLen, 1)
+	c.Check(chgs[0].Kind(), Equals, "auto-refresh")
+	c.Check(chgs[0].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make current revision for snap \"some-snap\" unavailable \\(error exactly at unlink-current-snap\\)")
+	c.Check(chgs[0].Status(), Equals, state.ErrorStatus)
+	c.Check(buf.String(), Not(testutil.Contains), `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`)
+	c.Check(getRefreshFailed(chgs[0]), IsNil)
+	c.Check(getRefreshFailures("some-snap"), IsNil) // Refresh failures only count after unlink-current-snap
+	c.Check(getRevision("some-snap"), Equals, snap.R(1))
+	c.Check(getRefreshFailures("some-other-snap"), IsNil)
+	c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision)
+
+	// Auto-refresh errors are recorded after unlink-current-snap
+	goodSnapRevision.N++
+	s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision
+	s.fakeBackend.maybeInjectErr = nil
+	buf.Reset()
+	forceAutoRefresh()
+	chgs = s.state.Changes()
+	sort.Slice(chgs, func(i, j int) bool {
+		return chgs[i].SpawnTime().Before(chgs[j].SpawnTime())
+	})
+	c.Assert(chgs, HasLen, 2)
+	c.Check(chgs[1].Kind(), Equals, "auto-refresh")
+	c.Check(chgs[1].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make snap \"some-snap\" \\(12\\) available to the system \\(fail\\)")
+	c.Check(chgs[1].Status(), Equals, state.ErrorStatus)
+	c.Check(buf.String(), testutil.Contains, `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`)
+	c.Check(getRefreshFailed(chgs[1]), DeepEquals, []interface{}{"some-snap"})
+	c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision)
+	c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 1)
+	c.Check(getRevision("some-snap"), Equals, snap.R(1))
+	c.Check(getRefreshFailures("some-other-snap"), IsNil)
+	c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision)
+
+	// Bad snap refresh is skipped due to backoff delay, but other refresh continue normally
+	goodSnapRevision.N++
+	s.fakeStore.refreshRevnos["some-other-snap-id"] = goodSnapRevision
+	buf.Reset()
+	forceAutoRefresh()
+	chgs = s.state.Changes()
+	sort.Slice(chgs, func(i, j int) bool {
+		return chgs[i].SpawnTime().Before(chgs[j].SpawnTime())
+	})
+	c.Assert(chgs, HasLen, 3)
+	c.Check(chgs[2].Kind(), Equals, "auto-refresh")
+	c.Check(chgs[2].Err(), IsNil)
+	c.Check(chgs[2].Status(), Equals, state.DoneStatus)
+	c.Check(buf.String(), Not(testutil.Contains), `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 8 hours`)
+	c.Check(getRefreshFailed(chgs[2]), IsNil)
+	c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision)
+	c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 1) // Stays at 1
+	c.Check(getRevision("some-snap"), Equals, snap.R(1))
+	c.Check(getRefreshFailures("some-other-snap"), IsNil)
+	c.Check(getRevision("some-other-snap"), Equals, goodSnapRevision)
+
+	// Make sure backoff delay is capped to a reasonable value (~2 weeks)
+	var snapst snapstate.SnapState
+	c.Assert(snapstate.Get(s.state, "some-snap", &snapst), IsNil)
+	snapst.RefreshFailures = &snap.RefreshFailuresInfo{
+		ToRevision:      badSnapRevision,
+		FailureCount:    100,
+		LastFailureTime: time.Now().Add(-2 * 7 * 24 * time.Hour),
+	}
+	snapstate.Set(s.state, "some-snap", &snapst)
+	buf.Reset()
+	forceAutoRefresh()
+	chgs = s.state.Changes()
+	sort.Slice(chgs, func(i, j int) bool {
+		return chgs[i].SpawnTime().Before(chgs[j].SpawnTime())
+	})
+	c.Assert(chgs, HasLen, 4)
+	c.Check(chgs[3].Kind(), Equals, "auto-refresh")
+	c.Check(chgs[3].Err(), ErrorMatches, "cannot perform the following tasks:\n.*- Make snap \"some-snap\" \\(12\\) available to the system \\(fail\\)")
+	c.Check(chgs[3].Status(), Equals, state.ErrorStatus)
+	c.Check(buf.String(), testutil.Contains, `snap "some-snap" auto-refresh to revision 12 has failed, next auto-refresh attempt will be in 336 hours`)
+	c.Check(getRefreshFailed(chgs[3]), DeepEquals, []interface{}{"some-snap"})
+	c.Check(getRefreshFailures("some-snap").ToRevision, Equals, badSnapRevision)
+	c.Check(getRefreshFailures("some-snap").FailureCount, Equals, 101) // Backoff delay passed, increment for new failure
+	c.Check(getRevision("some-snap"), Equals, snap.R(1))
+
+	// New revision resets RefreshFailures
+	s.fakeStore.refreshRevnos["some-snap-id"] = snap.R(13)
+	buf.Reset()
+	forceAutoRefresh()
+	chgs = s.state.Changes()
+	sort.Slice(chgs, func(i, j int) bool {
+		return chgs[i].SpawnTime().Before(chgs[j].SpawnTime())
+	})
+	c.Assert(chgs, HasLen, 5)
+	c.Check(chgs[4].Kind(), Equals, "auto-refresh")
+	c.Check(chgs[4].Err(), IsNil)
+	c.Check(chgs[4].Status(), Equals, state.DoneStatus)
+	c.Check(getRefreshFailed(chgs[4]), IsNil)
+	c.Check(getRefreshFailures("some-snap"), IsNil)
+	c.Check(getRevision("some-snap"), Equals, snap.R(13))
+}
+
 func (s *snapmgrTestSuite) testAutoRefreshRefreshInhibitNoticeRecorded(c *C, markerInterfaceConnected bool, warningFallback bool) {
 	refreshAppsCheckCalled := 0
 	restore := snapstate.MockRefreshAppsCheck(func(si *snap.Info) error {
diff --git a/snap/info.go b/snap/info.go
index 917fb5014d4..d7255781be2 100644
--- a/snap/info.go
+++ b/snap/info.go
@@ -2099,3 +2099,13 @@ func RegistryPlugAttrs(plug *PlugInfo) (account, registry, view string, err erro
 
 	return account, registry, view, nil
 }
+
+// RefreshFailures holds information about snap failed refreshes.
+type RefreshFailuresInfo struct {
+	// ToRevision is the target revision that caused the refresh failure.
+	ToRevision Revision `json:"to-revision,omitempty"`
+	// FailureCount is number of failed refresh attempts for the revision.
+	FailureCount int `json:"failure-count,omitempty"`
+	// LastFailureTime is the time of the last failed refresh attempt for the revision.
+	LastFailureTime time.Time `json:"last-failure-time"`
+}
diff --git a/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh b/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh
index 4bf51b3ecf9..73ab37f07be 100755
--- a/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh
+++ b/tests/main/auto-refresh-backoff/check_auto_refresh_count.sh
@@ -3,4 +3,5 @@
 LAST_CHANGE_ID=$1
 CHANGES_COUNT=$2
 
+#shellcheck disable=SC2086,SC2046
 test $(snap debug api /v2/changes?select=ready | jq "[.result[] | select(.kind == \"auto-refresh\" and (.id|tonumber) > ($LAST_CHANGE_ID|tonumber))] | length") == $CHANGES_COUNT
diff --git a/tests/main/auto-refresh-backoff/task.yaml b/tests/main/auto-refresh-backoff/task.yaml
index 55b2874cf59..12723466201 100644
--- a/tests/main/auto-refresh-backoff/task.yaml
+++ b/tests/main/auto-refresh-backoff/task.yaml
@@ -19,6 +19,7 @@ prepare: |
     fi
 
     # Needed by make-snap-installable, Install before switching to fakestore
+    snap install jq
     snap install remarshal
 
     # Install snaps as baseline since we want to test what happens in refreshes not installs
@@ -121,7 +122,7 @@ execute: |
     # -------- SECOND AUTO REFRESH --------
 
     # Clean old SNAP_TWO snap in fakestore directory and keep bad SNAP_ONE
-    rm "$BLOB_DIR"/"$SNAP_TWO_ID-rev-11".snap
+    rm "$BLOB_DIR/$SNAP_TWO_ID-rev-11.snap"
     echo "Add new revision for $SNAP_TWO while keeping bad revision of $SNAP_ONE"
     add_snap_to_fakestore "$SNAP_TWO_GOOD_PATH" "$SNAP_TWO_ID" 22
 
@@ -160,20 +161,23 @@ execute: |
     snap debug api /v2/changes?select=ready | jq "[.result[] | select(.kind == \"auto-refresh\" and (.id|tonumber) > ($LAST_CHANGE_ID|tonumber))] | sort_by(.id)" > changes.json
 
     # 1st auto-refresh
-    jq '.[0].status' changes.json | MATCH "Error"
-    jq '.[0].data."snap-names"' changes.json | MATCH "$SNAP_ONE"
-    jq '.[0].data."snap-names"' changes.json | MATCH "$SNAP_TWO"
+    jq '.[0].status' < changes.json | MATCH "Error"
+    jq '.[0].data."snap-names"' < changes.json | MATCH "$SNAP_ONE"
+    jq '.[0].data."snap-names"' < changes.json | MATCH "$SNAP_TWO"
+    jq '.[0].data."refresh-failed"' < changes.json | MATCH "$SNAP_ONE"
+    jq '.[0].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO"
 
     # 2nd auto-refresh
-    jq '.[1].status' changes.json | MATCH "Done"
+    jq '.[1].status' < changes.json | MATCH "Done"
     # Broken SNAP_ONE should have been skipped this time
-    jq '.[1].data."snap-names"' changes.json | NOMATCH "$SNAP_ONE"
-    jq '.[1].data."snap-names"' changes.json | MATCH "$SNAP_TWO"
+    jq '.[1].data."snap-names"' < changes.json | NOMATCH "$SNAP_ONE"
+    jq '.[1].data."snap-names"' < changes.json | MATCH "$SNAP_TWO"
+    jq '.[1].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_ONE"
+    jq '.[1].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO"
 
     # 3rd auto-refresh
-    jq '.[2].status' changes.json | MATCH "Done"
-    jq '.[2].data."snap-names"' changes.json | MATCH "$SNAP_ONE"
-    jq '.[2].data."snap-names"' changes.json | MATCH "$SNAP_TWO"
-
-    # TODO: check attached metadata to api about failed refreshes on /v2/snaps and /v2/changes
-    # TODO: check emitted logs about backoff delays
+    jq '.[2].status' < changes.json | MATCH "Done"
+    jq '.[2].data."snap-names"' < changes.json | MATCH "$SNAP_ONE"
+    jq '.[2].data."snap-names"' < changes.json | MATCH "$SNAP_TWO"
+    jq '.[2].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_ONE"
+    jq '.[2].data."refresh-failed"' < changes.json | NOMATCH "$SNAP_TWO"