Skip to content

Commit 6e6b615

Browse files
authored
Merge pull request #16995 from ahrtr/3.4_clusterId_20231122
[3.4] etcdserver: add cluster id check for hashKVHandler
2 parents fe68345 + c750e01 commit 6e6b615

9 files changed

Lines changed: 283 additions & 14 deletions

File tree

etcdserver/api/rafthttp/http.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ var (
5151
RaftSnapshotPrefix = path.Join(RaftPrefix, "snapshot")
5252

5353
errIncompatibleVersion = errors.New("incompatible version")
54-
errClusterIDMismatch = errors.New("cluster ID mismatch")
54+
ErrClusterIDMismatch = errors.New("cluster ID mismatch")
5555
)
5656

5757
type peerGetter interface {
@@ -558,7 +558,7 @@ func checkClusterCompatibilityFromHeader(lg *zap.Logger, localID types.ID, heade
558558
} else {
559559
plog.Errorf("request cluster ID mismatch (got %s want %s)", gcid, cid)
560560
}
561-
return errClusterIDMismatch
561+
return ErrClusterIDMismatch
562562
}
563563
return nil
564564
}

etcdserver/api/rafthttp/stream.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -677,21 +677,21 @@ func (cr *streamReader) dial(t streamType) (io.ReadCloser, error) {
677677
}
678678
return nil, errIncompatibleVersion
679679

680-
case errClusterIDMismatch.Error():
680+
case ErrClusterIDMismatch.Error():
681681
if cr.lg != nil {
682682
cr.lg.Warn(
683683
"request sent was ignored by remote peer due to cluster ID mismatch",
684684
zap.String("remote-peer-id", cr.peerID.String()),
685685
zap.String("remote-peer-cluster-id", resp.Header.Get("X-Etcd-Cluster-ID")),
686686
zap.String("local-member-id", cr.tr.ID.String()),
687687
zap.String("local-member-cluster-id", cr.tr.ClusterID.String()),
688-
zap.Error(errClusterIDMismatch),
688+
zap.Error(ErrClusterIDMismatch),
689689
)
690690
} else {
691691
plog.Errorf("request sent was ignored (cluster ID mismatch: peer[%s]=%s, local=%s)",
692692
cr.peerID, resp.Header.Get("X-Etcd-Cluster-ID"), cr.tr.ClusterID)
693693
}
694-
return nil, errClusterIDMismatch
694+
return nil, ErrClusterIDMismatch
695695

696696
default:
697697
return nil, fmt.Errorf("unhandled error %q when precondition failed", string(b))

etcdserver/api/rafthttp/util.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ func checkPostResponse(resp *http.Response, body []byte, req *http.Request, to t
8686
case errIncompatibleVersion.Error():
8787
plog.Errorf("request sent was ignored by peer %s (server version incompatible)", to)
8888
return errIncompatibleVersion
89-
case errClusterIDMismatch.Error():
89+
case ErrClusterIDMismatch.Error():
9090
plog.Errorf("request sent was ignored (cluster ID mismatch: remote[%s]=%s, local=%s)",
9191
to, resp.Header.Get("X-Etcd-Cluster-ID"), req.Header.Get("X-Etcd-Cluster-ID"))
92-
return errClusterIDMismatch
92+
return ErrClusterIDMismatch
9393
default:
9494
return fmt.Errorf("unhandled error %q when precondition failed", string(body))
9595
}

etcdserver/api/v3rpc/rpctypes/error.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ var (
4545
ErrGRPCMemberNotLearner = status.New(codes.FailedPrecondition, "etcdserver: can only promote a learner member").Err()
4646
ErrGRPCLearnerNotReady = status.New(codes.FailedPrecondition, "etcdserver: can only promote a learner member which is in sync with leader").Err()
4747
ErrGRPCTooManyLearners = status.New(codes.FailedPrecondition, "etcdserver: too many learner members in cluster").Err()
48+
ErrGRPCClusterIdMismatch = status.New(codes.FailedPrecondition, "etcdserver: cluster ID mismatch").Err()
4849

4950
ErrGRPCRequestTooLarge = status.New(codes.InvalidArgument, "etcdserver: request is too large").Err()
5051
ErrGRPCRequestTooManyRequests = status.New(codes.ResourceExhausted, "etcdserver: too many requests").Err()
@@ -105,6 +106,7 @@ var (
105106
ErrorDesc(ErrGRPCMemberNotLearner): ErrGRPCMemberNotLearner,
106107
ErrorDesc(ErrGRPCLearnerNotReady): ErrGRPCLearnerNotReady,
107108
ErrorDesc(ErrGRPCTooManyLearners): ErrGRPCTooManyLearners,
109+
ErrorDesc(ErrGRPCClusterIdMismatch): ErrGRPCClusterIdMismatch,
108110

109111
ErrorDesc(ErrGRPCRequestTooLarge): ErrGRPCRequestTooLarge,
110112
ErrorDesc(ErrGRPCRequestTooManyRequests): ErrGRPCRequestTooManyRequests,
@@ -186,6 +188,7 @@ var (
186188
ErrInvalidAuthToken = Error(ErrGRPCInvalidAuthToken)
187189
ErrAuthOldRevision = Error(ErrGRPCAuthOldRevision)
188190
ErrInvalidAuthMgmt = Error(ErrGRPCInvalidAuthMgmt)
191+
ErrClusterIdMismatch = Error(ErrGRPCClusterIdMismatch)
189192

190193
ErrNoLeader = Error(ErrGRPCNoLeader)
191194
ErrNotLeader = Error(ErrGRPCNotLeader)

etcdserver/corrupt.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"strings"
2525
"time"
2626

27+
"go.etcd.io/etcd/etcdserver/api/rafthttp"
2728
"go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes"
2829
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
2930
"go.etcd.io/etcd/mvcc"
@@ -126,6 +127,19 @@ func (s *EtcdServer) CheckInitialHashKV() error {
126127
} else {
127128
plog.Warningf("%s cannot check the hash of peer(%q) at revision %d: local node is lagging behind(%q)", s.ID(), p.eps, rev, p.err.Error())
128129
}
130+
case rpctypes.ErrClusterIdMismatch:
131+
if lg != nil {
132+
lg.Warn(
133+
"cluster ID mismatch",
134+
zap.String("local-member-id", s.ID().String()),
135+
zap.Int64("local-member-revision", rev),
136+
zap.Int64("local-member-compact-revision", crev),
137+
zap.Uint32("local-member-hash", h),
138+
zap.String("remote-peer-id", p.id.String()),
139+
zap.Strings("remote-peer-endpoints", p.eps),
140+
zap.Error(err),
141+
)
142+
}
129143
}
130144
}
131145
}
@@ -353,7 +367,7 @@ func (s *EtcdServer) getPeerHashKVs(rev int64) []*peerHashKVResp {
353367
ctx, cancel := context.WithTimeout(context.Background(), s.Cfg.ReqTimeout())
354368

355369
var resp *pb.HashKVResponse
356-
resp, lastErr = s.getPeerHashKVHTTP(ctx, ep, rev)
370+
resp, lastErr = s.getPeerHashKVHTTP(ctx, s.cluster.ID(), ep, rev)
357371
cancel()
358372
if lastErr == nil {
359373
resps = append(resps, &peerHashKVResp{peerInfo: p, resp: resp, err: nil})
@@ -440,6 +454,10 @@ func (h *hashKVHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
440454
http.Error(w, "bad path", http.StatusBadRequest)
441455
return
442456
}
457+
if gcid := r.Header.Get("X-Etcd-Cluster-ID"); gcid != "" && gcid != h.server.cluster.ID().String() {
458+
http.Error(w, rafthttp.ErrClusterIDMismatch.Error(), http.StatusPreconditionFailed)
459+
return
460+
}
443461

444462
defer r.Body.Close()
445463
b, err := ioutil.ReadAll(r.Body)
@@ -478,7 +496,7 @@ func (h *hashKVHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
478496
}
479497

480498
// getPeerHashKVHTTP fetch hash of kv store at the given rev via http call to the given url
481-
func (s *EtcdServer) getPeerHashKVHTTP(ctx context.Context, url string, rev int64) (*pb.HashKVResponse, error) {
499+
func (s *EtcdServer) getPeerHashKVHTTP(ctx context.Context, cid types.ID, url string, rev int64) (*pb.HashKVResponse, error) {
482500
cc := &http.Client{Transport: s.peerRt}
483501
hashReq := &pb.HashKVRequest{Revision: rev}
484502
hashReqBytes, err := json.Marshal(hashReq)
@@ -492,6 +510,7 @@ func (s *EtcdServer) getPeerHashKVHTTP(ctx context.Context, url string, rev int6
492510
}
493511
req = req.WithContext(ctx)
494512
req.Header.Set("Content-Type", "application/json")
513+
req.Header.Set("X-Etcd-Cluster-ID", cid.String())
495514
req.Cancel = ctx.Done()
496515

497516
resp, err := cc.Do(req)
@@ -511,6 +530,10 @@ func (s *EtcdServer) getPeerHashKVHTTP(ctx context.Context, url string, rev int6
511530
if strings.Contains(string(b), mvcc.ErrFutureRev.Error()) {
512531
return nil, rpctypes.ErrFutureRev
513532
}
533+
} else if resp.StatusCode == http.StatusPreconditionFailed {
534+
if strings.Contains(string(b), rafthttp.ErrClusterIDMismatch.Error()) {
535+
return nil, rpctypes.ErrClusterIdMismatch
536+
}
514537
}
515538
if resp.StatusCode != http.StatusOK {
516539
return nil, fmt.Errorf("unknown error: %s", string(b))

etcdserver/corrupt_test.go

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// Copyright 2023 The etcd Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package etcdserver
16+
17+
import (
18+
"bytes"
19+
"encoding/json"
20+
"io"
21+
"net/http"
22+
"net/http/httptest"
23+
"strconv"
24+
"strings"
25+
"testing"
26+
27+
"github.com/stretchr/testify/assert"
28+
"go.uber.org/zap"
29+
30+
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
31+
"go.etcd.io/etcd/lease"
32+
"go.etcd.io/etcd/mvcc"
33+
betesting "go.etcd.io/etcd/mvcc/backend"
34+
"go.etcd.io/etcd/pkg/types"
35+
)
36+
37+
func TestHashKVHandler(t *testing.T) {
38+
var remoteClusterID = 111195
39+
var localClusterID = 111196
40+
var revision = 1
41+
42+
etcdSrv := &EtcdServer{}
43+
etcdSrv.cluster = newTestCluster(nil)
44+
etcdSrv.cluster.SetID(types.ID(localClusterID), types.ID(localClusterID))
45+
be, _ := betesting.NewDefaultTmpBackend()
46+
defer func() {
47+
assert.NoError(t, be.Close())
48+
}()
49+
etcdSrv.kv = mvcc.New(zap.NewNop(), be, &lease.FakeLessor{}, nil, nil, mvcc.StoreConfig{})
50+
ph := &hashKVHandler{
51+
lg: zap.NewNop(),
52+
server: etcdSrv,
53+
}
54+
srv := httptest.NewServer(ph)
55+
defer srv.Close()
56+
57+
tests := []struct {
58+
name string
59+
remoteClusterID int
60+
wcode int
61+
wKeyWords string
62+
}{
63+
{
64+
name: "HashKV returns 200 if cluster hash matches",
65+
remoteClusterID: localClusterID,
66+
wcode: http.StatusOK,
67+
wKeyWords: "",
68+
},
69+
{
70+
name: "HashKV returns 400 if cluster hash doesn't matche",
71+
remoteClusterID: remoteClusterID,
72+
wcode: http.StatusPreconditionFailed,
73+
wKeyWords: "cluster ID mismatch",
74+
},
75+
}
76+
for i, tt := range tests {
77+
t.Run(tt.name, func(t *testing.T) {
78+
hashReq := &pb.HashKVRequest{Revision: int64(revision)}
79+
hashReqBytes, err := json.Marshal(hashReq)
80+
if err != nil {
81+
t.Fatalf("failed to marshal request: %v", err)
82+
}
83+
req, err := http.NewRequest(http.MethodGet, srv.URL+PeerHashKVPath, bytes.NewReader(hashReqBytes))
84+
if err != nil {
85+
t.Fatalf("failed to create request: %v", err)
86+
}
87+
req.Header.Set("X-Etcd-Cluster-ID", strconv.FormatUint(uint64(tt.remoteClusterID), 16))
88+
89+
resp, err := http.DefaultClient.Do(req)
90+
if err != nil {
91+
t.Fatalf("failed to get http response: %v", err)
92+
}
93+
body, err := io.ReadAll(resp.Body)
94+
resp.Body.Close()
95+
if err != nil {
96+
t.Fatalf("unexpected io.ReadAll error: %v", err)
97+
}
98+
if resp.StatusCode != tt.wcode {
99+
t.Fatalf("#%d: code = %d, want %d", i, resp.StatusCode, tt.wcode)
100+
}
101+
if resp.StatusCode != http.StatusOK {
102+
if !strings.Contains(string(body), tt.wKeyWords) {
103+
t.Errorf("#%d: body: %s, want body to contain keywords: %s", i, string(body), tt.wKeyWords)
104+
}
105+
return
106+
}
107+
108+
hashKVResponse := pb.HashKVResponse{}
109+
err = json.Unmarshal(body, &hashKVResponse)
110+
if err != nil {
111+
t.Fatalf("unmarshal response error: %v", err)
112+
}
113+
hashValue, _, _, err := etcdSrv.KV().HashByRev(int64(revision))
114+
if err != nil {
115+
t.Fatalf("etcd server hash failed: %v", err)
116+
}
117+
if hashKVResponse.Hash != hashValue {
118+
t.Fatalf("hash value inconsistent: %d != %d", hashKVResponse.Hash, hashValue)
119+
}
120+
})
121+
}
122+
}

tests/e2e/cluster_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ type etcdProcessClusterConfig struct {
128128
noStrictReconfig bool
129129
enableV2 bool
130130
initialCorruptCheck bool
131+
corruptCheckTime time.Duration
131132
authTokenOpts string
132133

133134
MaxConcurrentStreams uint32 // default is math.MaxUint32
@@ -141,6 +142,17 @@ type etcdProcessClusterConfig struct {
141142
// newEtcdProcessCluster launches a new cluster from etcd processes, returning
142143
// a new etcdProcessCluster once all nodes are ready to accept client requests.
143144
func newEtcdProcessCluster(cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
145+
epc, err := initEtcdProcessCluster(cfg)
146+
if err != nil {
147+
return nil, err
148+
}
149+
150+
return startEtcdProcessCluster(epc, cfg)
151+
}
152+
153+
// `initEtcdProcessCluster` initializes a new cluster based on the given config.
154+
// It doesn't start the cluster.
155+
func initEtcdProcessCluster(cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
144156
etcdCfgs := cfg.etcdServerProcessConfigs()
145157
epc := &etcdProcessCluster{
146158
cfg: cfg,
@@ -158,6 +170,11 @@ func newEtcdProcessCluster(cfg *etcdProcessClusterConfig) (*etcdProcessCluster,
158170
epc.procs[i] = proc
159171
}
160172

173+
return epc, nil
174+
}
175+
176+
// `startEtcdProcessCluster` launches a new cluster from etcd processes.
177+
func startEtcdProcessCluster(epc *etcdProcessCluster, cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
161178
if err := epc.Start(); err != nil {
162179
return nil, err
163180
}
@@ -262,6 +279,9 @@ func (cfg *etcdProcessClusterConfig) etcdServerProcessConfigs() []*etcdServerPro
262279
if cfg.initialCorruptCheck {
263280
args = append(args, "--experimental-initial-corrupt-check")
264281
}
282+
if cfg.corruptCheckTime != 0 {
283+
args = append(args, "--experimental-corrupt-check-time", cfg.corruptCheckTime.String())
284+
}
265285
var murl string
266286
if cfg.metricsURLScheme != "" {
267287
murl = (&url.URL{

0 commit comments

Comments
 (0)