Skip to content

Commit 994ce3f

Browse files
committed
OCPBUGS-42810: actively move bootstrap member lead
This PR will actively try to move the leadership away from the bootstrap member to another member. Signed-off-by: Thomas Jungblut <[email protected]>
1 parent 43a24c5 commit 994ce3f

File tree

4 files changed

+245
-70
lines changed

4 files changed

+245
-70
lines changed

pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,29 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
150150
if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil {
151151
return err
152152
}
153+
154+
moved, err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember)
155+
if err != nil {
156+
return err
157+
}
158+
159+
// if we have just moved it, we will skip this sync iteration to backoff the controller - the next resync will happen after a minute anyway
160+
if moved {
161+
klog.Warningf("Leader just moved, waiting for next resync to remove bootstrap member [%x]", bootstrapID)
162+
c.eventRecorder.Eventf("Bootstrap member was leader and was moved away", "bootstrap member [%x] should no longer be leader", bootstrapID)
163+
_, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
164+
Type: "EtcdLeaderMovedAwayFromBootstrap",
165+
Status: operatorv1.ConditionTrue,
166+
Reason: "BootstrapIsLeader",
167+
Message: "bootstrap was leader and has been moved",
168+
}))
169+
if updateErr != nil {
170+
return fmt.Errorf("error while updating EtcdLeaderMovedAwayFromBootstrap: %w", updateErr)
171+
}
172+
173+
return nil
174+
}
175+
153176
klog.Warningf("Removing bootstrap member [%x]", bootstrapID)
154177
c.eventRecorder.Eventf("Removing bootstrap member", "attempting to remove bootstrap member [%x]", bootstrapID)
155178

@@ -230,3 +253,22 @@ func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context
230253
return false, hasBootstrap, bootstrapMember, nil
231254
}
232255
}
256+
257+
func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) (bool, error) {
258+
members, err := c.etcdClient.MemberList(ctx)
259+
if err != nil {
260+
return false, fmt.Errorf("could not list while ensuring bootstrap is not the leader: %w", err)
261+
}
262+
263+
leader, err := ceohelpers.FindLeader(ctx, c.etcdClient, members)
264+
if err != nil || leader == nil {
265+
return false, fmt.Errorf("could not find leader: %w", err)
266+
}
267+
268+
if bootstrapMember.ID != leader.ID {
269+
return false, nil
270+
}
271+
272+
klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0])
273+
return ceohelpers.MoveLeaderToAnotherMember(ctx, c.etcdClient, leader, members)
274+
}

pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"testing"
77

8+
clientv3 "go.etcd.io/etcd/client/v3"
89
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
910
"k8s.io/apimachinery/pkg/runtime"
1011

@@ -56,6 +57,13 @@ var (
5657
Message: "enough members found",
5758
}
5859

60+
conditionBootstrapLeaderMoved = operatorv1.OperatorCondition{
61+
Type: "EtcdLeaderMovedAwayFromBootstrap",
62+
Status: "True",
63+
Reason: "BootstrapIsLeader",
64+
Message: "bootstrap was leader and has been moved",
65+
}
66+
5967
conditionEtcdMemberRemoved = operatorv1.OperatorCondition{
6068
Type: "EtcdBootstrapMemberRemoved",
6169
Status: "True",
@@ -281,6 +289,7 @@ func TestRemoveBootstrap(t *testing.T) {
281289
tests := map[string]struct {
282290
safeToRemove bool
283291
hasBootstrap bool
292+
bootstrapIsLead bool
284293
bootstrapId uint64
285294
expectedConditions []operatorv1.OperatorCondition
286295
indexerObjs []interface{}
@@ -363,6 +372,21 @@ func TestRemoveBootstrap(t *testing.T) {
363372
},
364373
expEvents: 2,
365374
},
375+
376+
"safe, has bootstrap, moving leader, not removing the bootstrap member": {
377+
safeToRemove: true,
378+
hasBootstrap: true,
379+
bootstrapIsLead: true,
380+
bootstrapId: 1,
381+
expectedConditions: []operatorv1.OperatorCondition{
382+
conditionEnoughEtcdMembers,
383+
conditionBootstrapLeaderMoved,
384+
},
385+
indexerObjs: []interface{}{
386+
bootstrapComplete,
387+
},
388+
expEvents: 0,
389+
},
366390
}
367391

368392
for name, test := range tests {
@@ -377,8 +401,22 @@ func TestRemoveBootstrap(t *testing.T) {
377401
fakeConfigmapLister := corev1listers.NewConfigMapLister(indexer)
378402
fakeInfraLister := configv1listers.NewInfrastructureLister(indexer)
379403
fakeStaticPodClient := v1helpers.NewFakeStaticPodOperatorClient(&operatorv1.StaticPodOperatorSpec{}, &operatorv1.StaticPodOperatorStatus{}, nil, nil)
380-
fakeEtcdClient, err := etcdcli.NewFakeEtcdClient([]*etcdserverpb.Member{u.FakeEtcdBootstrapMember(1)})
404+
405+
leaderOpt := etcdcli.WithLeader(2)
406+
if test.bootstrapIsLead {
407+
leaderOpt = etcdcli.WithLeader(1)
408+
}
409+
fakeEtcdClient, err := etcdcli.NewFakeEtcdClient(
410+
[]*etcdserverpb.Member{
411+
u.FakeEtcdBootstrapMember(1),
412+
u.FakeEtcdMemberWithoutServer(2),
413+
},
414+
etcdcli.WithFakeStatus([]*clientv3.StatusResponse{
415+
{Header: &etcdserverpb.ResponseHeader{MemberId: 1}, IsLearner: false},
416+
{Header: &etcdserverpb.ResponseHeader{MemberId: 2}, IsLearner: false},
417+
}), leaderOpt)
381418
require.NoError(t, err)
419+
382420
fakeKubeClient := fake.NewClientset([]runtime.Object{}...)
383421
fakeRecorder := events.NewRecorder(fakeKubeClient.CoreV1().Events(operatorclient.OperatorNamespace),
384422
"test-bootstrap-teardown-controller", &corev1.ObjectReference{}, clock.RealClock{})

pkg/operator/ceohelpers/leader.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
88
"go.etcd.io/etcd/api/v3/etcdserverpb"
9+
"k8s.io/klog/v2"
910
)
1011

1112
func FindLeader(ctx context.Context, client etcdcli.MemberStatusChecker, memberList []*etcdserverpb.Member) (*etcdserverpb.Member, error) {
@@ -26,3 +27,26 @@ func FindLeader(ctx context.Context, client etcdcli.MemberStatusChecker, memberL
2627
}
2728
return mx, nil
2829
}
30+
31+
func MoveLeaderToAnotherMember(ctx context.Context, client etcdcli.LeaderMover, leader *etcdserverpb.Member,
32+
memberList []*etcdserverpb.Member) (bool, error) {
33+
var otherMember *etcdserverpb.Member
34+
for _, member := range memberList {
35+
if member.ID != leader.ID {
36+
otherMember = member
37+
break
38+
}
39+
}
40+
41+
if otherMember == nil {
42+
return false, fmt.Errorf("no follower member found for the members: %v", memberList)
43+
}
44+
45+
err := client.MoveLeader(ctx, otherMember.ID)
46+
if err != nil {
47+
return false, err
48+
}
49+
50+
klog.Warningf("Moving lead from member [%x] (%s) to [%x] (%s) succesfully!", leader.ID, leader.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])
51+
return true, nil
52+
}

pkg/operator/ceohelpers/leader_test.go

Lines changed: 140 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@ func (m *mockMemberStatusChecker) MemberStatus(ctx context.Context, member *etcd
2727
return nil, errors.New("member status not found")
2828
}
2929

30+
// mockLeaderMover implements LeaderMover for testing
31+
type mockLeaderMover struct {
32+
moveLeaderErrors map[uint64]error
33+
}
34+
35+
func (m *mockLeaderMover) MoveLeader(ctx context.Context, toMember uint64) error {
36+
if err, exists := m.moveLeaderErrors[toMember]; exists {
37+
return err
38+
}
39+
return nil
40+
}
41+
3042
func TestFindLeader(t *testing.T) {
3143
ctx := context.Background()
3244

@@ -164,81 +176,140 @@ func TestFindLeader(t *testing.T) {
164176
}
165177
}
166178

167-
func TestFindLeader_ContextCancellation(t *testing.T) {
168-
ctx, cancel := context.WithCancel(context.Background())
169-
cancel() // Cancel immediately
170-
171-
mockClient := &mockMemberStatusChecker{
172-
memberStatuses: map[uint64]*clientv3.StatusResponse{
173-
1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 1},
174-
},
175-
}
176-
177-
memberList := []*etcdserverpb.Member{
178-
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}},
179-
}
180-
181-
// The function should handle context cancellation gracefully
182-
// Since our mock doesn't actually check context, this test verifies
183-
// that the function signature accepts context and passes it through
184-
leader, err := FindLeader(ctx, mockClient, memberList)
185-
require.NoError(t, err)
186-
require.NotNil(t, leader)
187-
assert.Equal(t, uint64(1), leader.ID)
188-
}
189-
190-
func TestFindLeader_EdgeCases(t *testing.T) {
179+
func TestMoveLeaderToAnotherMember(t *testing.T) {
191180
ctx := context.Background()
192181

193-
t.Run("member with no client URLs", func(t *testing.T) {
194-
mockClient := &mockMemberStatusChecker{
195-
memberStatuses: map[uint64]*clientv3.StatusResponse{
196-
1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 1},
182+
tests := []struct {
183+
name string
184+
leader *etcdserverpb.Member
185+
memberList []*etcdserverpb.Member
186+
moveLeaderErrors map[uint64]error
187+
expectedSuccess bool
188+
expectedError string
189+
}{
190+
{
191+
name: "successfully move leader to another member",
192+
leader: &etcdserverpb.Member{
193+
ID: 1,
194+
Name: "etcd-1",
195+
ClientURLs: []string{"https://10.0.0.1:2379"},
196+
PeerURLs: []string{"https://10.0.0.1:2380"},
197197
},
198-
}
199-
200-
memberList := []*etcdserverpb.Member{
201-
{ID: 1, Name: "etcd-1", ClientURLs: []string{}},
202-
}
203-
204-
// This should work fine since our mock doesn't actually use ClientURLs
205-
leader, err := FindLeader(ctx, mockClient, memberList)
206-
require.NoError(t, err)
207-
require.NotNil(t, leader)
208-
assert.Equal(t, uint64(1), leader.ID)
209-
})
210-
211-
t.Run("member with nil client URLs", func(t *testing.T) {
212-
mockClient := &mockMemberStatusChecker{
213-
memberStatuses: map[uint64]*clientv3.StatusResponse{
214-
1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 1},
198+
memberList: []*etcdserverpb.Member{
199+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
200+
{ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}},
201+
{ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}},
215202
},
216-
}
217-
218-
memberList := []*etcdserverpb.Member{
219-
{ID: 1, Name: "etcd-1", ClientURLs: nil},
220-
}
203+
expectedSuccess: true,
204+
},
205+
{
206+
name: "successfully move leader when leader is in middle of list",
207+
leader: &etcdserverpb.Member{
208+
ID: 2,
209+
Name: "etcd-2",
210+
ClientURLs: []string{"https://10.0.0.2:2379"},
211+
PeerURLs: []string{"https://10.0.0.2:2380"},
212+
},
213+
memberList: []*etcdserverpb.Member{
214+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
215+
{ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}},
216+
{ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}},
217+
},
218+
expectedSuccess: true,
219+
},
220+
{
221+
name: "successfully move leader when leader is last in list",
222+
leader: &etcdserverpb.Member{
223+
ID: 3,
224+
Name: "etcd-3",
225+
ClientURLs: []string{"https://10.0.0.3:2379"},
226+
PeerURLs: []string{"https://10.0.0.3:2380"},
227+
},
228+
memberList: []*etcdserverpb.Member{
229+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
230+
{ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}},
231+
{ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}},
232+
},
233+
expectedSuccess: true,
234+
},
235+
{
236+
name: "move leader fails with error",
237+
leader: &etcdserverpb.Member{
238+
ID: 1,
239+
Name: "etcd-1",
240+
ClientURLs: []string{"https://10.0.0.1:2379"},
241+
PeerURLs: []string{"https://10.0.0.1:2380"},
242+
},
243+
memberList: []*etcdserverpb.Member{
244+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
245+
{ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}},
246+
},
247+
moveLeaderErrors: map[uint64]error{
248+
2: errors.New("move leader failed: connection timeout"),
249+
},
250+
expectedSuccess: false,
251+
expectedError: "move leader failed: connection timeout",
252+
},
253+
{
254+
name: "no follower member found - single member cluster",
255+
leader: &etcdserverpb.Member{
256+
ID: 1,
257+
Name: "etcd-1",
258+
ClientURLs: []string{"https://10.0.0.1:2379"},
259+
PeerURLs: []string{"https://10.0.0.1:2380"},
260+
},
261+
memberList: []*etcdserverpb.Member{
262+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
263+
},
264+
expectedSuccess: false,
265+
expectedError: "no follower member found for the members: [ID:1 name:\"etcd-1\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ]",
266+
},
267+
{
268+
name: "no follower member found - all members have same ID",
269+
leader: &etcdserverpb.Member{
270+
ID: 1,
271+
Name: "etcd-1",
272+
ClientURLs: []string{"https://10.0.0.1:2379"},
273+
PeerURLs: []string{"https://10.0.0.1:2380"},
274+
},
275+
memberList: []*etcdserverpb.Member{
276+
{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
277+
{ID: 1, Name: "etcd-1-copy", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}},
278+
},
279+
expectedSuccess: false,
280+
expectedError: "no follower member found for the members: [ID:1 name:\"etcd-1\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ID:1 name:\"etcd-1-copy\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ]",
281+
},
282+
{
283+
name: "empty member list",
284+
leader: &etcdserverpb.Member{
285+
ID: 1,
286+
Name: "etcd-1",
287+
ClientURLs: []string{"https://10.0.0.1:2379"},
288+
PeerURLs: []string{"https://10.0.0.1:2380"},
289+
},
290+
memberList: []*etcdserverpb.Member{},
291+
expectedSuccess: false,
292+
expectedError: "no follower member found for the members: []",
293+
},
294+
}
221295

222-
leader, err := FindLeader(ctx, mockClient, memberList)
223-
require.NoError(t, err)
224-
require.NotNil(t, leader)
225-
assert.Equal(t, uint64(1), leader.ID)
226-
})
296+
for _, tt := range tests {
297+
t.Run(tt.name, func(t *testing.T) {
298+
mockClient := &mockLeaderMover{
299+
moveLeaderErrors: tt.moveLeaderErrors,
300+
}
227301

228-
t.Run("member with zero ID", func(t *testing.T) {
229-
mockClient := &mockMemberStatusChecker{
230-
memberStatuses: map[uint64]*clientv3.StatusResponse{
231-
0: {Header: &etcdserverpb.ResponseHeader{MemberId: 0}, Leader: 0},
232-
},
233-
}
302+
success, err := MoveLeaderToAnotherMember(ctx, mockClient, tt.leader, tt.memberList)
234303

235-
memberList := []*etcdserverpb.Member{
236-
{ID: 0, Name: "etcd-0", ClientURLs: []string{"https://10.0.0.0:2379"}},
237-
}
304+
if tt.expectedError != "" {
305+
require.Error(t, err)
306+
assert.Contains(t, err.Error(), tt.expectedError)
307+
assert.False(t, success)
308+
return
309+
}
238310

239-
leader, err := FindLeader(ctx, mockClient, memberList)
240-
require.NoError(t, err)
241-
require.NotNil(t, leader)
242-
assert.Equal(t, uint64(0), leader.ID)
243-
})
311+
require.NoError(t, err)
312+
assert.Equal(t, tt.expectedSuccess, success)
313+
})
314+
}
244315
}

0 commit comments

Comments
 (0)