Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'upstream/release-7.0' into slack-vitess…
Browse files Browse the repository at this point in the history
…-2020.08.19.r0
  • Loading branch information
ameetkotian committed Sep 10, 2020
2 parents a8260ae + 60b65c6 commit 258485e
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 116 deletions.
2 changes: 0 additions & 2 deletions go/cmd/vtbackup/vtbackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,6 @@ func takeBackup(ctx context.Context, topoServer *topo.Server, backupStorage back
return fmt.Errorf("no backup found; not starting up empty since -initial_backup flag was not enabled")
}
restorePos = mysql.Position{}
case mysqlctl.ErrExistingDB:
return fmt.Errorf("can't run vtbackup because data directory is not empty")
default:
return fmt.Errorf("can't restore from backup: %v", err)
}
Expand Down
2 changes: 2 additions & 0 deletions go/test/endtoend/backup/vtbackup/backup_only_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,8 @@ func firstBackupTest(t *testing.T, tabletType string) {
// check that the restored replica has the right local_metadata
result, err := replica2.VttabletProcess.QueryTabletWithDB("select * from local_metadata", "_vt")
require.Nil(t, err)
require.NotNil(t, result)
require.NotEmpty(t, result.Rows)
assert.Equal(t, replica2.Alias, result.Rows[0][1].ToString(), "Alias")
assert.Equal(t, "ks.0", result.Rows[1][1].ToString(), "ClusterAlias")
assert.Equal(t, cell, result.Rows[2][1].ToString(), "DataCenter")
Expand Down
42 changes: 15 additions & 27 deletions go/vt/mysqlctl/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,6 @@ var (
// but none of them are complete.
ErrNoCompleteBackup = errors.New("backup(s) found but none are complete")

// ErrExistingDB is returned when there's already an active DB.
ErrExistingDB = errors.New("skipping restore due to existing database")

// backupStorageHook contains the hook name to use to process
// backup files. If not set, we will not process the files. It is
// only used at backup time. Then it is put in the manifest,
Expand Down Expand Up @@ -218,34 +215,25 @@ func removeExistingFiles(cnf *Mycnf) error {
return nil
}

// ShouldRestore reports whether a restore action should be performed for
// the given parameters.
//
// A restore is always warranted when the caller asked to delete existing
// data first (params.DeleteBeforeRestore) or when RestoreWasInterrupted
// reports true for params.Cnf. Otherwise the answer is whatever checkNoDB
// returns for params.DbName once mysqld has become reachable.
//
// A non-nil error means the decision could not be made; in that case the
// caller should not attempt a restore.
func ShouldRestore(ctx context.Context, params RestoreParams) (bool, error) {
	// An explicit delete-before-restore request short-circuits every other check.
	if params.DeleteBeforeRestore {
		return true, nil
	}
	// So does an earlier restore that was interrupted.
	if RestoreWasInterrupted(params.Cnf) {
		return true, nil
	}

	params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState)

	// mysqld may have been launched in parallel with us, so wait until it is
	// ready. If the wait fails we must not attempt a restore.
	waitErr := params.Mysqld.Wait(ctx, params.Cnf)
	if waitErr != nil {
		return false, waitErr
	}

	// Restore only if no database with tables is already present.
	return checkNoDB(ctx, params.Mysqld, params.DbName)
}

// Restore is the main entry point for backup restore. If there is no
// appropriate backup on the BackupStorage, Restore logs an error
// and returns ErrNoBackup. Any other error is returned.
func Restore(ctx context.Context, params RestoreParams) (*BackupManifest, error) {

if !params.DeleteBeforeRestore {
params.Logger.Infof("Restore: Checking if a restore is in progress")
if !RestoreWasInterrupted(params.Cnf) {
params.Logger.Infof("Restore: No %v file found, checking no existing data is present", RestoreState)
// Wait for mysqld to be ready, in case it was launched in parallel with us.
if err := params.Mysqld.Wait(ctx, params.Cnf); err != nil {
return nil, err
}

ok, err := checkNoDB(ctx, params.Mysqld, params.DbName)
if err != nil {
return nil, err
}
if !ok {
params.Logger.Infof("Auto-restore is enabled, but mysqld already contains data. Assuming vttablet was just restarted.")
if err = PopulateMetadataTables(params.Mysqld, params.LocalMetadata, params.DbName); err == nil {
err = ErrExistingDB
}
return nil, err
}
}
}

// find the right backup handle: most recent one, with a MANIFEST
params.Logger.Infof("Restore: looking for a suitable backup to restore")
bs, err := backupstorage.GetBackupStorage()
Expand Down
28 changes: 19 additions & 9 deletions go/vt/vttablet/tabletmanager/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,9 @@ func (tm *TabletManager) RestoreData(ctx context.Context, logger logutil.Logger,
}

func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool) error {

tablet := tm.Tablet()
originalType := tablet.Type
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil {
return err
}

// Try to restore. Depending on the reason for failure, we may be ok.
// If we're not ok, return an error and the tm will log.Fatalf,
// causing the process to be restarted and the restore retried.
Expand Down Expand Up @@ -117,6 +114,24 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
StartTime: logutil.ProtoToTime(keyspaceInfo.SnapshotTime),
}

// Check whether we're going to restore before changing to RESTORE type,
// so we keep our MasterTermStartTime (if any) if we aren't actually restoring.
ok, err := mysqlctl.ShouldRestore(ctx, params)
if err != nil {
return err
}
if !ok {
params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.")
return mysqlctl.PopulateMetadataTables(params.Mysqld, params.LocalMetadata, params.DbName)
}
// We should not become master after restore, because that would incorrectly
// start a new master term, and it's likely our data dir will be out of date.
if originalType == topodatapb.TabletType_MASTER {
originalType = tm.baseTabletType
}
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE); err != nil {
return err
}
// Loop until a backup exists, unless we were told to give up immediately.
var backupManifest *mysqlctl.BackupManifest
for {
Expand Down Expand Up @@ -161,11 +176,6 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
}
case mysqlctl.ErrNoBackup:
// No-op, starting with empty database.
case mysqlctl.ErrExistingDB:
// No-op, assuming we've just restarted. Note the
// replication reporter may restart replication at the
// next health check if it thinks it should. We do not
// alter replication here.
default:
// If anything failed, we should reset the original tablet type
if err := tm.tmState.ChangeTabletType(ctx, originalType); err != nil {
Expand Down
15 changes: 13 additions & 2 deletions go/vt/vttablet/tabletmanager/tm_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ func (tm *TabletManager) Start(tablet *topodatapb.Tablet, healthCheckInterval ti
return nil
}

tm.tmState.Open(tm.BatchCtx)
tm.tmState.Open()
return nil
}

Expand Down Expand Up @@ -455,6 +455,7 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo
case topo.IsErrType(err, topo.NoNode):
// There's no existing tablet record, so we can assume
// no one has left us a message to step down.
log.Infof("Shard master alias matches, but there is no existing tablet record. Switching to master with 'Now' as time")
tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) {
tablet.Type = topodatapb.TabletType_MASTER
// Update the master term start time (current value is 0) because we
Expand All @@ -464,12 +465,19 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo
})
case err == nil:
if oldTablet.Type == topodatapb.TabletType_MASTER {
log.Infof("Shard master alias matches, and existing tablet agrees. Switching to master with tablet's master term start time: %v", oldTablet.MasterTermStartTime)
// We're marked as master in the shard record,
// and our existing tablet record agrees.
tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) {
tablet.Type = topodatapb.TabletType_MASTER
tablet.MasterTermStartTime = oldTablet.MasterTermStartTime
})
} else {
log.Warningf("Shard master alias matches, but existing tablet is not master. Switching from %v to master with the shard's master term start time: %v", oldTablet.Type, si.MasterTermStartTime)
tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) {
tablet.Type = topodatapb.TabletType_MASTER
tablet.MasterTermStartTime = si.MasterTermStartTime
})
}
default:
return vterrors.Wrap(err, "InitTablet failed to read existing tablet record")
Expand All @@ -486,10 +494,13 @@ func (tm *TabletManager) checkMastership(ctx context.Context, si *topo.ShardInfo
oldMasterTermStartTime := oldTablet.GetMasterTermStartTime()
currentShardTime := si.GetMasterTermStartTime()
if oldMasterTermStartTime.After(currentShardTime) {
log.Infof("Shard master alias does not match, but the tablet's master term start time is newer. Switching to master with tablet's master term start time: %v", oldTablet.MasterTermStartTime)
tm.tmState.UpdateTablet(func(tablet *topodatapb.Tablet) {
tablet.Type = topodatapb.TabletType_MASTER
tablet.MasterTermStartTime = oldTablet.MasterTermStartTime
})
} else {
log.Infof("Existing tablet type is master, but the shard record has a different master with a newer timestamp. Remaining a replica")
}
}
default:
Expand Down Expand Up @@ -588,7 +599,7 @@ func (tm *TabletManager) handleRestore(ctx context.Context) (bool, error) {
if *restoreFromBackup {
go func() {
// Open the state manager after restore is done.
defer tm.tmState.Open(ctx)
defer tm.tmState.Open()

// restoreFromBackup will just be a regular action
// (same as if it was triggered remotely)
Expand Down
94 changes: 91 additions & 3 deletions go/vt/vttablet/tabletmanager/tm_init_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,19 +218,23 @@ func TestCheckMastership(t *testing.T) {
// 2. Update shard's master to our alias, then try to init again.
// (This simulates the case where the MasterAlias in the shard record says
// that we are the master but the tablet record says otherwise. In that case,
// we assume we are not the MASTER.)
// we become master by inheriting the shard record's timestamp.)
now := time.Now()
_, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error {
si.MasterAlias = alias
si.MasterTermStartTime = logutil.TimeToProto(now)
// Reassign to now for easier comparison.
now = si.GetMasterTermStartTime()
return nil
})
require.NoError(t, err)
err = tm.Start(tablet, 0)
require.NoError(t, err)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)
assert.Equal(t, topodatapb.TabletType_MASTER, ti.Type)
ter0 := ti.GetMasterTermStartTime()
assert.True(t, ter0.IsZero())
assert.Equal(t, now, ter0)
tm.Stop()

// 3. Delete the tablet record. The shard record still says that we are the
Expand Down Expand Up @@ -291,6 +295,25 @@ func TestCheckMastership(t *testing.T) {
ter4 := ti.GetMasterTermStartTime()
assert.Equal(t, ter1, ter4)
tm.Stop()

// 7. If the shard record shows a different master with a newer
// timestamp, we remain replica.
_, err = ts.UpdateShardFields(ctx, "ks", "0", func(si *topo.ShardInfo) error {
si.MasterAlias = otherAlias
si.MasterTermStartTime = logutil.TimeToProto(ter4.Add(10 * time.Second))
return nil
})
require.NoError(t, err)
tablet.Type = topodatapb.TabletType_REPLICA
tablet.MasterTermStartTime = nil
err = tm.Start(tablet, 0)
require.NoError(t, err)
ti, err = ts.GetTablet(ctx, alias)
require.NoError(t, err)
assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)
ter5 := ti.GetMasterTermStartTime()
assert.True(t, ter5.IsZero())
tm.Stop()
}

func TestStartCheckMysql(t *testing.T) {
Expand Down Expand Up @@ -409,6 +432,71 @@ func TestStartDoesNotUpdateReplicationDataForTabletInWrongShard(t *testing.T) {
assert.Equal(t, 0, len(tablets))
}

// TestCheckTabletTypeResets checks two things: a tablet whose topo record
// was left in RESTORE type is reset to REPLICA when the tablet manager
// starts, and the type published for display stays in sync with the
// tablet manager's internal tablet type after each step.
func TestCheckTabletTypeResets(t *testing.T) {
	// Shorten the keyspace rebuild retry interval for this test and put the
	// previous value back when the test finishes.
	origRetryInterval := rebuildKeyspaceRetryInterval
	defer func() { rebuildKeyspaceRetryInterval = origRetryInterval }()
	rebuildKeyspaceRetryInterval = 10 * time.Millisecond

	cell := "cell1"
	ctx := context.Background()
	ts := memorytopo.NewServer(cell)
	alias := &topodatapb.TabletAlias{Cell: cell, Uid: 1}

	// Step 1: bring the tablet up as REPLICA, which creates the respective
	// topology records, then stop it again.
	tm := newTestTM(t, ts, 1, "ks", "0")
	tablet := tm.Tablet()
	ensureSrvKeyspace(t, ts, cell, "ks")
	ti, err := ts.GetTablet(ctx, alias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)
	tm.Stop()

	// Step 2: overwrite the tablet record's type with RESTORE and restart.
	_, err = ts.UpdateTabletFields(ctx, alias, func(rec *topodatapb.Tablet) error {
		rec.Type = topodatapb.TabletType_RESTORE
		return nil
	})
	require.NoError(t, err)
	require.NoError(t, tm.Start(tablet, 0))
	assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)
	ti, err = ts.GetTablet(ctx, alias)
	require.NoError(t, err)
	// The record must have been changed back to the init tablet type.
	assert.Equal(t, topodatapb.TabletType_REPLICA, ti.Type)

	// Step 3: point the shard's master alias at this tablet and initialize
	// again. This simulates the shard record saying we are the master while
	// the tablet record says otherwise; we then become master by inheriting
	// the shard record's timestamp.
	now := time.Now()
	_, err = ts.UpdateShardFields(ctx, "ks", "0", func(shard *topo.ShardInfo) error {
		shard.MasterAlias = alias
		shard.MasterTermStartTime = logutil.TimeToProto(now)
		// Read the value back so the later comparison uses the time as the
		// shard record reports it.
		now = shard.GetMasterTermStartTime()
		return nil
	})
	require.NoError(t, err)

	si, err := tm.createKeyspaceShard(ctx)
	require.NoError(t, err)
	require.NoError(t, tm.checkMastership(ctx, si))
	assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)

	require.NoError(t, tm.initTablet(ctx))
	assert.Equal(t, tm.tmState.tablet.Type, tm.tmState.displayState.tablet.Type)

	ti, err = ts.GetTablet(ctx, alias)
	require.NoError(t, err)
	assert.Equal(t, topodatapb.TabletType_MASTER, ti.Type)
	assert.Equal(t, now, ti.GetMasterTermStartTime())
	tm.Stop()
}

func newTestTM(t *testing.T, ts *topo.Server, uid int, keyspace, shard string) *TabletManager {
t.Helper()
ctx := context.Background()
Expand Down
11 changes: 7 additions & 4 deletions go/vt/vttablet/tabletmanager/tm_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,26 +68,28 @@ type tmState struct {
}

// newTMState builds the state holder for tm around the given tablet record.
//
// The returned state keeps the caller's tablet pointer as its working copy
// and stores a separate clone of it in displayState. Its context is derived
// from tm.BatchCtx, and the matching cancel function is stored next to it.
func newTMState(tm *TabletManager, tablet *topodatapb.Tablet) *tmState {
	stateCtx, stopFn := context.WithCancel(tm.BatchCtx)

	state := &tmState{
		tm:     tm,
		tablet: tablet,
		ctx:    stateCtx,
		cancel: stopFn,
	}
	// The display copy is cloned so it does not share memory with the
	// working tablet record.
	state.displayState.tablet = proto.Clone(tablet).(*topodatapb.Tablet)
	return state
}

func (ts *tmState) Open(ctx context.Context) {
func (ts *tmState) Open() {
ts.mu.Lock()
defer ts.mu.Unlock()
if ts.isOpen {
return
}

ts.ctx, ts.cancel = context.WithCancel(ctx)
ts.isOpen = true
ts.updateLocked(ts.ctx)
ts.publishStateLocked(ctx)
ts.publishStateLocked(ts.ctx)
}

func (ts *tmState) Close() {
Expand Down Expand Up @@ -192,18 +194,19 @@ func (ts *tmState) UpdateTablet(update func(tablet *topodatapb.Tablet)) {
ts.mu.Lock()
defer ts.mu.Unlock()
update(ts.tablet)
ts.publishForDisplay()
}

func (ts *tmState) updateLocked(ctx context.Context) {
span, ctx := trace.NewSpan(ctx, "tmState.update")
defer span.Finish()
ts.publishForDisplay()

if !ts.isOpen {
return
}

terTime := logutil.ProtoToTime(ts.tablet.MasterTermStartTime)
ts.publishForDisplay()

// Disable TabletServer first so the nonserving state gets advertised
// before other services are shutdown.
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletmanager/tm_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func TestStateOpenClose(t *testing.T) {
savedCtx := tm.tmState.ctx
tm.tmState.mu.Unlock()

tm.tmState.Open(context.Background())
tm.tmState.Open()

tm.tmState.mu.Lock()
assert.Equal(t, savedCtx, tm.tmState.ctx)
Expand Down
Loading

0 comments on commit 258485e

Please sign in to comment.