Skip to content

Commit

Permalink
Merge pull request #18749 from shyamjvs/fix-partial-txn-risk
Browse files Browse the repository at this point in the history
Fix risk of a partial write txn being applied
  • Loading branch information
serathius authored Oct 24, 2024
2 parents aae197c + 8a0fd66 commit 38c27a4
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 7 deletions.
9 changes: 5 additions & 4 deletions server/etcdserver/txn/txn.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,10 +307,11 @@ func txn(ctx context.Context, lg *zap.Logger, txnWrite mvcc.TxnWrite, rt *pb.Txn
_, err := executeTxn(ctx, lg, txnWrite, rt, txnPath, txnResp)
if err != nil {
if isWrite {
// end txn to release locks before panic
txnWrite.End()
// When txn with write operations starts it has to be successful
// We don't have a way to recover state in case of write failure
// CAUTION: When a txn performing write operations starts, we always expect it to be successful.
// If a write failure is seen we SHOULD NOT try to recover the server, but crash with a panic to make the failure explicit.
// Trying to silently recover (e.g by ignoring the failed txn or calling txn.End() early) poses serious risks:
// - violation of transaction atomicity if some write operations have been partially executed
// - data inconsistency across different etcd members if they applied the txn asymmetrically
lg.Panic("unexpected error during txn with writes", zap.Error(err))
} else {
lg.Error("unexpected error during readonly txn", zap.Error(err))
Expand Down
32 changes: 29 additions & 3 deletions server/etcdserver/txn/txn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ package txn

import (
"context"
"crypto/sha256"
"io"
"os"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -336,9 +339,8 @@ func TestReadonlyTxnError(t *testing.T) {
}
}

func TestWriteTxnPanic(t *testing.T) {
b, _ := betesting.NewDefaultTmpBackend(t)
defer betesting.Close(t, b)
func TestWriteTxnPanicWithoutApply(t *testing.T) {
b, bePath := betesting.NewDefaultTmpBackend(t)
s := mvcc.NewStore(zaptest.NewLogger(t), b, &lease.FakeLessor{}, mvcc.StoreConfig{})
defer s.Close()

Expand Down Expand Up @@ -367,7 +369,17 @@ func TestWriteTxnPanic(t *testing.T) {
},
}

// compute DB file hash before applying the txn
dbHashBefore, err := computeFileHash(bePath)
require.NoErrorf(t, err, "failed to compute DB file hash before txn")

// we verify the following properties below:
// 1. server panics after a write txn aply fails (invariant: server should never try to move on from a failed write)
// 2. no writes from the txn are applied to the backend (invariant: failed write should have no side-effect on DB state besides panic)
assert.Panicsf(t, func() { Txn(ctx, zaptest.NewLogger(t), txn, false, s, &lease.FakeLessor{}) }, "Expected panic in Txn with writes")
dbHashAfter, err := computeFileHash(bePath)
require.NoErrorf(t, err, "failed to compute DB file hash after txn")
require.Equalf(t, dbHashBefore, dbHashAfter, "mismatch in DB hash before and after failed write txn")
}

func TestCheckTxnAuth(t *testing.T) {
Expand Down Expand Up @@ -566,6 +578,20 @@ func setupAuth(t *testing.T, be backend.Backend) auth.AuthStore {
return as
}

func computeFileHash(filePath string) (string, error) {
file, err := os.Open(filePath)
if err != nil {
return "", err
}
defer file.Close()

h := sha256.New()
if _, err := io.Copy(h, file); err != nil {
return "", err
}
return string(h.Sum(nil)), nil
}

// CheckTxnAuth variables setup.
var (
inRangeCompare = &pb.Compare{
Expand Down

0 comments on commit 38c27a4

Please sign in to comment.