Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
ZeyadYasser committed Oct 7, 2024
1 parent 83bcdbe commit 5b1a96b
Show file tree
Hide file tree
Showing 13 changed files with 488 additions and 16 deletions.
1 change: 1 addition & 0 deletions client/clientutil/snapinfo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ func (*cmdSuite) TestClientSnapFromSnapInfo(c *C) {
"Hold",
"GatingHold",
"RefreshInhibit",
"RefreshFailures",
"Components",
}
var checker func(string, reflect.Value)
Expand Down
2 changes: 2 additions & 0 deletions client/packages.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ type Snap struct {
GatingHold *time.Time `json:"gating-hold,omitempty"`
// if RefreshInhibit is nil, then there is no pending refresh.
RefreshInhibit *SnapRefreshInhibit `json:"refresh-inhibit,omitempty"`
// RefreshFailures tracks information about snap failed refreshes.
RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"`

// Components is a list of the snap components
Components []Component `json:"components,omitempty"`
Expand Down
10 changes: 10 additions & 0 deletions client/packages_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) {
"private": true,
"devmode": true,
"trymode": true,
"refresh-failures": {
"to-revision": 43,
"failure-count": 5,
"last-failure-time": "2024-10-06T21:31:05Z"
},
"screenshots": [
{"url":"http://example.com/shot1.png", "width":640, "height":480},
{"url":"http://example.com/shot2.png"}
Expand Down Expand Up @@ -356,6 +361,11 @@ func (cs *clientSuite) testClientSnap(c *check.C, refreshInhibited bool) {
Website: "http://example.com/funky",
StoreURL: "https://snapcraft.io/chatroom",
RefreshInhibit: expectedSnapRefreshInhibit,
RefreshFailures: &snap.RefreshFailuresInfo{
ToRevision: snap.R(43),
FailureCount: 5,
LastFailureTime: time.Date(2024, 10, 6, 21, 31, 5, 0, time.UTC),
},
})
}

Expand Down
32 changes: 32 additions & 0 deletions daemon/api_snaps_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,38 @@ func (s *snapsSuite) TestSnapManyInfosSelectRefreshInhibited(c *check.C) {
}
}

func (s *snapsSuite) TestSnapInfoReturnsRefreshFailures(c *check.C) {
s.expectSnapsNameReadAccess()
d := s.daemon(c)
s.mkInstalledInState(c, d, "foo", "bar", "v0", snap.R(5), true, "")

expectedRefreshFailures := &snap.RefreshFailuresInfo{
ToRevision: snap.R(6),
FailureCount: 4,
LastFailureTime: time.Date(2024, time.October, 10, 21, 22, 11, 0, time.UTC),
}

st := d.Overlord().State()
st.Lock()
var snapst snapstate.SnapState
// Update snap state with RefreshFailure.
c.Assert(snapstate.Get(st, "foo", &snapst), check.IsNil)
snapst.RefreshFailures = expectedRefreshFailures
snapstate.Set(st, "foo", &snapst)
st.Unlock()

req, err := http.NewRequest("GET", "/v2/snaps/foo", nil)
c.Assert(err, check.IsNil)

rsp := s.syncReq(c, req, nil)

c.Assert(rsp.Result, check.FitsTypeOf, &client.Snap{})
snapInfo := rsp.Result.(*client.Snap)
c.Assert(snapInfo.RefreshFailures, check.NotNil)

c.Check(snapInfo.RefreshFailures, check.DeepEquals, expectedRefreshFailures)
}

func (s *snapsSuite) TestMapLocalFields(c *check.C) {
media := snap.MediaInfos{
{
Expand Down
1 change: 1 addition & 0 deletions daemon/snap.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ func mapLocal(about aboutSnap, sd clientutil.StatusDecorator) *client.Snap {
result.DevMode = snapst.DevMode
result.TryMode = snapst.TryMode
result.JailMode = snapst.JailMode
result.RefreshFailures = snapst.RefreshFailures
result.MountedFrom = localSnap.MountFile()
if result.TryMode {
// Readlink instead of EvalSymlinks because it's only expected
Expand Down
116 changes: 116 additions & 0 deletions overlord/snapstate/autorefresh.go
Original file line number Diff line number Diff line change
Expand Up @@ -974,3 +974,119 @@ func MockRefreshCandidate(snapSetup *SnapSetup) interface{} {
SnapSetup: *snapSetup,
}
}

func processFailedAutoRefresh(chg *state.Change, _ state.Status, new state.Status) {
if chg.Kind() != "auto-refresh" || new != state.ErrorStatus {
return
}

var failedSnapNames []string
for _, t := range chg.Tasks() {
// We only care about snaps that failed after unlink-current-snap because
// this indicates (with high probability) that something related to the snap
// itself is broken.
if t.Kind() != "unlink-current-snap" || t.Status() != state.UndoneStatus {
continue
}

snapsup, snapst, err := snapSetupAndState(t)
if err != nil {
logger.Debugf("internal error: failed to get snap associated with task %s: %v", t.ID(), err)
continue
}

// Update refresh failure information for snap revision
if snapst.RefreshFailures == nil {
// Initialize RefreshFailures
snapst.RefreshFailures = &snap.RefreshFailuresInfo{FailureCount: 1}
}
if snapst.RefreshFailures.ToRevision == snapsup.Revision() {
snapst.RefreshFailures.FailureCount++
} else {
snapst.RefreshFailures.ToRevision = snapsup.Revision()
}
snapst.RefreshFailures.LastFailureTime = timeNow()
Set(t.State(), snapsup.InstanceName(), snapst)

delay := computeSnapRefreshDelay(snapst.RefreshFailures)
logger.Noticef("snap %q auto-refresh to revision %s has failed, next auto-refresh attempt will be in %v hours", snapsup.InstanceName(), snapsup.Revision(), delay.Hours())
failedSnapNames = append(failedSnapNames, snapsup.InstanceName())
}

if len(failedSnapNames) == 0 {
return
}

// Attach failed snaps to change api data. This intended to guide
// agents on devices that manage their own refresh cycle.
var data map[string]interface{}
err := chg.Get("api-data", &data)
if err != nil && !errors.Is(err, state.ErrNoState) {
logger.Debugf("internal error: failed to get api-data for change %s: %v", chg.ID(), err)
return
}
if len(data) == 0 {
data = make(map[string]interface{})
}
// XXX: Should this be set regardless the refresh failed before/after unlink-current-snap?
data["refresh-failed"] = failedSnapNames
chg.Set("api-data", data)
}

// snapRefreshDelay maps from failure count to time to snap refresh delay capped at 2 weeks.
//
// Note: Those are heuristic values listed in the SD183 spec.
var snapRefreshDelay = []time.Duration{
0, // FailureCount == 0 -> No delay
8 * time.Hour, // FailureCount == 1 -> 8 hours delay
12 * time.Hour, // FailureCount == 2 -> 12 hours delay
24 * time.Hour, // FailureCount == 3 -> 1 day delay
2 * 24 * time.Hour, // FailureCount == 4 -> 2 days delay
4 * 24 * time.Hour, // FailureCount == 5 -> 4 days delay
7 * 24 * time.Hour, // FailureCount == 6 -> 1 week delay
1.5 * 7 * 24 * time.Hour, // FailureCount == 7 -> 1.5 weeks delay
2 * 7 * 24 * time.Hour, // FailureCount == 8 -> 2 weeks delay
}

func computeSnapRefreshDelay(refreshFailures *snap.RefreshFailuresInfo) time.Duration {
if refreshFailures == nil {
return 0
}
failureCount := refreshFailures.FailureCount
if failureCount > len(snapRefreshDelay)-1 {
// Cap failure count to max delay according to snapRefreshDelay
failureCount = len(snapRefreshDelay) - 1
}
return snapRefreshDelay[failureCount]
}

// shouldSkipSnapRefresh checks if a snap refresh to a target revision should be skipped or not.
//
// This helper implements a backoff algorithm that prevents failed upgrade loops
// by introducing backoff delay based on snapst.RefreshFailures and snapRefreshDelay.
func shouldSkipSnapRefresh(st *state.State, snapst *SnapState, targetRevision snap.Revision, opts Options) bool {
if !opts.Flags.IsAutoRefresh || snapst.RefreshFailures == nil {
// Don't skip if not an auto-refresh or if snap has no history of failed refreshes.
return false
}

if snapst.RefreshFailures.ToRevision != targetRevision {
// Snap has new revision not known to fail, let's reset RefreshFailures
// and continue refresh.
snapst.RefreshFailures = nil
Set(st, snapst.InstanceName(), snapst)
return false
}

// Here we are certain that the attempted target revision refresh is known to fail.
// Let's compute delay according to RefreshFailures.
delay := computeSnapRefreshDelay(snapst.RefreshFailures)
lastFailureTime := snapst.RefreshFailures.LastFailureTime
// TODO: implement more aggressive backoff for snaps that failed after reboot
if lastFailureTime.Add(delay).After(timeNow()) {
// Backoff delay duration since last failure has passed, let's continue refresh
return true
}

return false
}
12 changes: 9 additions & 3 deletions overlord/snapstate/snapmgr.go
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,9 @@ type SnapState struct {
// their security profiles set up but are not active.
// It is managed by ifacestate.
PendingSecurity *PendingSecurityState `json:"pending-security,omitempty"`

// RefreshFailures tracks information about snap failed refreshes.
RefreshFailures *snap.RefreshFailuresInfo `json:"refresh-failures,omitempty"`
}

// PendingSecurityState holds information about snaps that have
Expand Down Expand Up @@ -875,9 +878,12 @@ func (m *SnapManager) StartUp() error {
return fmt.Errorf("failed to generate cookies: %q", err)
}

// register handler that records a refresh-inhibit notice when
// the set of inhibited snaps is changed.
m.changeCallbackID = m.state.AddChangeStatusChangedHandler(processInhibitedAutoRefresh)
m.changeCallbackID = m.state.AddChangeStatusChangedHandler(func(chg *state.Change, old, new state.Status) {
// This handler records a refresh-inhibit notice when the set of inhibited snaps is changed.
processInhibitedAutoRefresh(chg, old, new)
// This handler implements marks failed snaps auto-refresh attempts for backoff.
processFailedAutoRefresh(chg, old, new)
})

if CheckExpectedRestart(m.state) == ErrUnexpectedRuntimeRestart {
logger.Noticef("detected a restart at runtime without a corresponding snapd change")
Expand Down
4 changes: 4 additions & 0 deletions overlord/snapstate/snapstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -2116,6 +2116,10 @@ func doUpdate(st *state.State, requested []string, updates []update, opts Option
continue
}

if shouldSkipSnapRefresh(st, &up.SnapState, up.Setup.Revision(), opts) {
continue
}

// if any snaps actually get a revision change, we need to do a
// re-refresh check
needsRerefreshCheck = true
Expand Down
104 changes: 104 additions & 0 deletions overlord/snapstate/snapstate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ func (s *snapmgrBaseTest) SetUpTest(c *C) {
fakeBackend: s.fakeBackend,
state: s.state,
downloadError: make(map[string]error),
refreshRevnos: make(map[string]snap.Revision),
}

// make tests work consistently also in containers
Expand Down Expand Up @@ -9343,6 +9344,109 @@ func (s *snapmgrTestSuite) TestSaveRefreshCandidatesOnAutoRefresh(c *C) {
c.Check(cands["some-other-snap"], NotNil)
}

func (s *snapmgrTestSuite) TestBackoffOnAutoRefresh(c *C) {
s.state.Lock()
defer s.state.Unlock()

badRevison := snap.R(12)
badSnapst := &snapstate.SnapState{
Active: true,
Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
{RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)},
}),
Current: snap.R(1),
SnapType: "app",
RefreshFailures: &snap.RefreshFailuresInfo{
ToRevision: badRevison,
FailureCount: 1,
LastFailureTime: time.Now(),
},
}
snapstate.Set(s.state, "some-snap", badSnapst)
snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{
Active: true,
Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
{RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)},
}),
Current: snap.R(1),
SnapType: "app",
})

s.fakeStore.refreshRevnos["some-snap-id"] = badRevison
names, tss, err := snapstate.AutoRefresh(context.Background(), s.state)
c.Assert(err, IsNil)
c.Assert(tss, NotNil)
// some-snap auto-refresh skipped
c.Check(names, DeepEquals, []string{"some-other-snap"})

// Failure delay is capped at 2 weeks
badSnapst.RefreshFailures.FailureCount = 100
badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2*7*24 - 1) * time.Hour)
snapstate.Set(s.state, "some-snap", badSnapst)
names, _, err = snapstate.AutoRefresh(context.Background(), s.state)
c.Assert(err, IsNil)
// some-snap auto-refresh skipped
c.Check(names, DeepEquals, []string{"some-other-snap"})
// But backoff delay is capped at two weeks
badSnapst.RefreshFailures.LastFailureTime = time.Now().Add(-(2 * 7 * 24) * time.Hour)
snapstate.Set(s.state, "some-snap", badSnapst)
names, _, err = snapstate.AutoRefresh(context.Background(), s.state)
c.Assert(err, IsNil)
c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"})
}

func (s *snapmgrTestSuite) TestBackoffOnAutoRefreshWithNewRevision(c *C) {
s.state.Lock()
defer s.state.Unlock()

badRevison := snap.R(12)
badSnapst := &snapstate.SnapState{
Active: true,
Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
{RealName: "some-snap", SnapID: "some-snap-id", Revision: snap.R(1)},
}),
Current: snap.R(1),
SnapType: "app",
RefreshFailures: &snap.RefreshFailuresInfo{
ToRevision: badRevison,
FailureCount: 3,
LastFailureTime: time.Now(),
},
}
snapstate.Set(s.state, "some-snap", badSnapst)
snapstate.Set(s.state, "some-other-snap", &snapstate.SnapState{
Active: true,
Sequence: snapstatetest.NewSequenceFromSnapSideInfos([]*snap.SideInfo{
{RealName: "some-other-snap", SnapID: "some-other-snap-id", Revision: snap.R(1)},
}),
Current: snap.R(1),
SnapType: "app",
})

s.fakeStore.refreshRevnos["some-snap-id"] = badRevison
names, tss, err := snapstate.AutoRefresh(context.Background(), s.state)
c.Assert(err, IsNil)
c.Assert(tss, NotNil)
// some-snap auto-refresh skipped
c.Check(names, DeepEquals, []string{"some-other-snap"})

// Check that new revision resets RefreshFailures
s.fakeStore.refreshRevnos["some-snap-id"] = snap.R(13)
// First make sure RefreshFailures is not nil
var snapst snapstate.SnapState
snapstate.Get(s.state, "some-snap", &snapst)
c.Assert(snapst.RefreshFailures.FailureCount, Equals, 3)
// Trigger new auto-refresh (with new revision from fakestore)
names, tss, err = snapstate.AutoRefresh(context.Background(), s.state)
c.Assert(err, IsNil)
c.Assert(tss, NotNil)
// some-snap auto-refresh not skipped
c.Check(names, DeepEquals, []string{"some-other-snap", "some-snap"})
// And RefreshFailures is reset
snapstate.Get(s.state, "some-snap", &snapst)
c.Assert(snapst.RefreshFailures, IsNil)
}

type customStore struct {
*fakeStore

Expand Down
Loading

0 comments on commit 5b1a96b

Please sign in to comment.