Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
106502: batcheval: delete flaky integration test r=aliher1911 a=aliher1911

TestLeaseCommandLearnerReplica is fundamentally flaky as commit can't retry all underlying replication failures and acceptable failure causes are not exposed on the higher levels where test operates.

What test is supposed to verify that failures to propose with invalid lease are not causing pipelined writes to fail but are retried. More details described in original PR cockroachdb#35261 where this test is introduced.
In absence of direct controls of that, test is trying to move leases between replicas, but this action could cause different failures to happen which can't be filtered out. For example:
`failed to repropose 0b9a11f4bef0e3d3 at idx 16 with new lease index: [NotLeaseHolderError] ‹reproposal failed due to closed timestamp›`
and 
`stale proposal: command was proposed under lease #2 but is being applied under lease: ...`
While closed timestamp could be mitigated by changing settings, second one is opaque to the caller and hard to filter out.

Release note: None
Fixes cockroachdb#104716

Co-authored-by: Oleg Afanasyev <[email protected]>
  • Loading branch information
craig[bot] and aliher1911 committed Jul 12, 2023
2 parents 6a6ebcb + 2e21d2e commit b8a6b06
Showing 1 changed file with 0 additions and 92 deletions.
92 changes: 0 additions & 92 deletions pkg/kv/kvserver/batcheval/cmd_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,112 +15,20 @@ import (
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/readsummary"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/readsummary/rspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/stretchr/testify/require"
)

// TestLeaseTransferWithPipelinedWrite verifies that pipelined writes
// do not cause retry errors to be leaked to clients when the error
// can be handled internally. Pipelining dissociates a write from its
// caller, so the retries of internally-generated errors (specifically
// out-of-order lease indexes) must be retried below that level.
//
// This issue was observed in practice to affect the first insert
// after table creation with high probability.
func TestLeaseTransferWithPipelinedWrite(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()

tc := serverutils.StartNewTestCluster(t, 3, base.TestClusterArgs{})
defer tc.Stopper().Stop(ctx)

db := tc.ServerConn(0)

for iter := 0; iter < 100; iter++ {
log.Infof(ctx, "iter %d", iter)
if _, err := db.ExecContext(ctx, "drop table if exists test"); err != nil {
t.Fatal(err)
}
if _, err := db.ExecContext(ctx, "create table test (a int, b int, primary key (a, b))"); err != nil {
t.Fatal(err)
}

workerErrCh := make(chan error, 1)
go func() {
workerErrCh <- func() error {
for i := 0; i < 1; i++ {
tx, err := db.BeginTx(ctx, nil)
if err != nil {
return err
}
defer func() {
if tx != nil {
if err := tx.Rollback(); err != nil {
log.Warningf(ctx, "error rolling back: %+v", err)
}
}
}()
// Run two inserts in a transaction to ensure that we have
// pipelined writes that cannot be retried at the SQL layer
// due to the first-statement rule.
if _, err := tx.ExecContext(ctx, "INSERT INTO test (a, b) VALUES ($1, $2)", i, 1); err != nil {
return err
}
if _, err := tx.ExecContext(ctx, "INSERT INTO test (a, b) VALUES ($1, $2)", i, 2); err != nil {
return err
}
if err := tx.Commit(); err != nil {
return err
}
tx = nil
}
return nil
}()
}()

// TODO(bdarnell): This test reliably reproduced the issue when
// introduced, because table creation causes splits and repeated
// table creation leads to lease transfers due to rebalancing.
// This is a subtle thing to rely on and the test might become
// more reliable if we ran more iterations in the worker goroutine
// and added a second goroutine to explicitly transfer leases back
// and forth.

select {
case <-time.After(15 * time.Second):
// TODO(bdarnell): The test seems flaky under stress with a 5s
// timeout. Why? I'm giving it a high timeout since hanging
// isn't a failure mode we're particularly concerned about here,
// but it shouldn't be taking this long even with stress.
t.Fatal("timed out")
case err := <-workerErrCh:
if err != nil {
// We allow the transaction to run into an aborted error due to a lease
// transfer when it attempts to create its transaction record. This it
// outside of the focus of this test.
okErr := testutils.IsError(err, kvpb.ABORT_REASON_NEW_LEASE_PREVENTS_TXN.String())
if !okErr {
t.Fatalf("worker failed: %+v", err)
}
}
}
}
}

func TestLeaseCommandLearnerReplica(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down

0 comments on commit b8a6b06

Please sign in to comment.