Skip to content

Commit 0c031aa

Browse files
committed
OCPBUGS-42810: actively move bootstrap member lead
This PR will actively try to move the leadership away from the bootstrap member to another healthy member. Signed-off-by: Thomas Jungblut <[email protected]>
1 parent 9a2aa20 commit 0c031aa

File tree

5 files changed

+217
-27
lines changed

5 files changed

+217
-27
lines changed

pkg/etcdcli/etcdcli.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,20 @@ func (g *etcdClientGetter) MemberRemove(ctx context.Context, memberID uint64) er
259259
return err
260260
}
261261

262+
func (g *etcdClientGetter) MoveLeader(ctx context.Context, toMember uint64) error {
263+
cli, err := g.clientPool.Get()
264+
if err != nil {
265+
return err
266+
}
267+
268+
defer g.clientPool.Return(cli)
269+
270+
ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout)
271+
defer cancel()
272+
_, err = cli.MoveLeader(ctx, toMember)
273+
return err
274+
}
275+
262276
func (g *etcdClientGetter) MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) {
263277
cli, err := g.clientPool.Get()
264278
if err != nil {

pkg/etcdcli/helpers.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,14 @@ func (f *fakeEtcdClient) MemberRemove(ctx context.Context, memberID uint64) erro
105105
return nil
106106
}
107107

108+
func (f *fakeEtcdClient) MoveLeader(ctx context.Context, toMember uint64) error {
109+
for _, status := range f.opts.status {
110+
status.Leader = toMember
111+
}
112+
113+
return nil
114+
}
115+
108116
func (f *fakeEtcdClient) MemberHealth(ctx context.Context) (memberHealth, error) {
109117
var healthy, unhealthy int
110118
var memberHealth memberHealth
@@ -251,6 +259,14 @@ func WithFakeStatus(status []*clientv3.StatusResponse) FakeClientOption {
251259
}
252260
}
253261

262+
func WithLeader(leader uint64) FakeClientOption {
263+
return func(fo *FakeClientOptions) {
264+
for _, status := range fo.status {
265+
status.Leader = leader
266+
}
267+
}
268+
}
269+
254270
// WithFakeDefragErrors configures each call to Defrag to consume one error from the given slice
255271
func WithFakeDefragErrors(errors []error) FakeClientOption {
256272
return func(fo *FakeClientOptions) {

pkg/etcdcli/interfaces.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ type EtcdClient interface {
2525
HealthyMemberLister
2626
UnhealthyMemberLister
2727
MemberStatusChecker
28+
LeaderMover
2829
Status
2930

3031
GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error)
@@ -64,6 +65,10 @@ type MemberRemover interface {
6465
MemberRemove(ctx context.Context, memberID uint64) error
6566
}
6667

68+
type LeaderMover interface {
69+
MoveLeader(ctx context.Context, toMember uint64) error
70+
}
71+
6772
type MemberLister interface {
6873
// MemberList lists all members in a cluster
6974
MemberList(ctx context.Context) ([]*etcdserverpb.Member, error)

pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go

Lines changed: 93 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package bootstrapteardown
33
import (
44
"context"
55
"fmt"
6+
"go.etcd.io/etcd/api/v3/etcdserverpb"
67
"time"
78

89
operatorv1 "github.com/openshift/api/operator/v1"
@@ -62,12 +63,29 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
6263
return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err)
6364
}
6465
// checks the actual etcd cluster membership API if etcd-bootstrap exists
65-
safeToRemoveBootstrap, hasBootstrap, bootstrapID, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
66+
safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy)
6667
if err != nil {
6768
return fmt.Errorf("error while canRemoveEtcdBootstrap: %w", err)
6869
}
6970

70-
err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapID)
71+
if hasBootstrap {
72+
if err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember); err != nil {
73+
klog.Errorf("error while ensuring bootstrap is not leader: %v", err)
74+
}
75+
}
76+
77+
// TODO(thomas): it seems on SNO, this is not enough, we might have a non-working apiserver at this point in time
78+
revisionStable, err := ceohelpers.IsRevisionStable(c.operatorClient)
79+
if err != nil {
80+
return fmt.Errorf("BootstrapTeardownController failed to determine stability of revisions: %w", err)
81+
}
82+
83+
if !revisionStable {
84+
klog.Infof("BootstrapTeardownController is waiting for stable etcd revision before removing the bootstrap member")
85+
return nil
86+
}
87+
88+
err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapMember)
7189
if err != nil {
7290
_, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
7391
Type: "BootstrapTeardownDegraded",
@@ -90,13 +108,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
90108
return updateErr
91109
}
92110

93-
func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapID uint64) error {
111+
func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapMember *etcdserverpb.Member) error {
112+
bootstrapID := uint64(0)
113+
bootstrapUrl := "unknown"
114+
if bootstrapMember != nil {
115+
bootstrapID = bootstrapMember.ID
116+
bootstrapUrl = bootstrapMember.GetClientURLs()[0]
117+
}
118+
94119
if !hasBootstrap {
95120
klog.V(4).Infof("no bootstrap anymore setting removal status")
96121
// this is to ensure the status is always set correctly, even if the status update below failed
97-
updateErr := setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
122+
updateErr := setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
98123
if updateErr != nil {
99-
return fmt.Errorf("error while setSuccessfulBoostrapRemovalStatus: %w", updateErr)
124+
return fmt.Errorf("error while setSuccessfulBootstrapRemovalStatus: %w", updateErr)
100125
}
101126

102127
// if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping.
@@ -141,20 +166,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
141166
if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil {
142167
return err
143168
}
144-
klog.Warningf("Removing bootstrap member [%x]", bootstrapID)
169+
170+
klog.Warningf("Removing bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)
145171

146172
// this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back.
147173
if err := c.etcdClient.MemberRemove(ctx, bootstrapID); err != nil {
148-
return fmt.Errorf("error while removing bootstrap member [%x]: %w", bootstrapID, err)
174+
return fmt.Errorf("error while removing bootstrap member [%x] (%s): %w", bootstrapID, bootstrapUrl, err)
149175
}
150176

151-
klog.Infof("Successfully removed bootstrap member [%x]", bootstrapID)
177+
klog.Infof("Successfully removed bootstrap member [%x] (%s)", bootstrapID, bootstrapUrl)
152178
// below might fail, since the member removal can cause some downtime for raft to settle on a quorum
153179
// it's important that everything below is properly retried above during normal controller reconciliation
154-
return setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient)
180+
return setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient)
155181
}
156182

157-
func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
183+
func setSuccessfulBootstrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error {
158184
_, _, updateErr := v1helpers.UpdateStatus(ctx, client, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{
159185
Type: "EtcdBootstrapMemberRemoved",
160186
Status: operatorv1.ConditionTrue,
@@ -165,57 +191,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St
165191
}
166192

167193
// canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error
168-
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, uint64, error) {
194+
func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, *etcdserverpb.Member, error) {
169195
members, err := c.etcdClient.MemberList(ctx)
170196
if err != nil {
171-
return false, false, 0, err
197+
return false, false, nil, err
172198
}
173199

174200
var hasBootstrap bool
175-
var bootstrapMemberID uint64
201+
var bootstrapMember *etcdserverpb.Member
176202
for _, member := range members {
177203
if member.Name == "etcd-bootstrap" {
178204
hasBootstrap = true
179-
bootstrapMemberID = member.ID
205+
bootstrapMember = member
180206
break
181207
}
182208
}
183209
if !hasBootstrap {
184-
return false, hasBootstrap, bootstrapMemberID, nil
210+
return false, hasBootstrap, bootstrapMember, nil
185211
}
186212

187213
// First, enforce the main HA invariants in terms of member counts.
188214
switch scalingStrategy {
189215
case ceohelpers.HAScalingStrategy:
190216
if len(members) < 4 {
191-
return false, hasBootstrap, bootstrapMemberID, nil
217+
return false, hasBootstrap, bootstrapMember, nil
192218
}
193219
case ceohelpers.DelayedHAScalingStrategy:
194220
if len(members) < 3 {
195-
return false, hasBootstrap, bootstrapMemberID, nil
221+
return false, hasBootstrap, bootstrapMember, nil
196222
}
197223
case ceohelpers.UnsafeScalingStrategy:
198224
if len(members) < 2 {
199-
return false, hasBootstrap, bootstrapMemberID, nil
225+
return false, hasBootstrap, bootstrapMember, nil
200226
}
201227
}
202228

203229
// Next, given member counts are satisfied, check member health.
204230
unhealthyMembers, err := c.etcdClient.UnhealthyMembers(ctx)
205231
if err != nil {
206-
return false, hasBootstrap, bootstrapMemberID, nil
232+
return false, hasBootstrap, bootstrapMember, nil
207233
}
208234

209235
// the etcd-bootstrap member is allowed to be unhealthy and can still be removed
210236
switch {
211237
case len(unhealthyMembers) == 0:
212-
return true, hasBootstrap, bootstrapMemberID, nil
238+
return true, hasBootstrap, bootstrapMember, nil
213239
case len(unhealthyMembers) > 1:
214-
return false, hasBootstrap, bootstrapMemberID, nil
240+
return false, hasBootstrap, bootstrapMember, nil
215241
default:
216242
if unhealthyMembers[0].Name == "etcd-bootstrap" {
217-
return true, true, unhealthyMembers[0].ID, nil
243+
return true, true, bootstrapMember, nil
244+
}
245+
return false, hasBootstrap, bootstrapMember, nil
246+
}
247+
}
248+
249+
func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) error {
250+
if bootstrapMember == nil {
251+
return fmt.Errorf("bootstrap member was not provided")
252+
}
253+
status, err := c.etcdClient.Status(ctx, bootstrapMember.ClientURLs[0])
254+
if err != nil {
255+
return fmt.Errorf("could not find bootstrap member status: %w", err)
256+
}
257+
258+
if bootstrapMember.ID != status.Leader {
259+
return nil
260+
}
261+
262+
klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0])
263+
264+
memberHealth, err := c.etcdClient.MemberHealth(ctx)
265+
if err != nil {
266+
return fmt.Errorf("could not find member health: %w", err)
267+
}
268+
269+
var otherMember *etcdserverpb.Member
270+
// we can pick any other healthy voting member as the target to move to
271+
for _, m := range memberHealth.GetHealthyMembers() {
272+
if m.ID != bootstrapMember.ID && !m.IsLearner {
273+
otherMember = m
274+
break
218275
}
219-
return false, hasBootstrap, bootstrapMemberID, nil
220276
}
277+
278+
if otherMember == nil {
279+
return fmt.Errorf("could not find other healthy member to move leader")
280+
}
281+
282+
klog.Warningf("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])
283+
err = c.etcdClient.MoveLeader(ctx, otherMember.ID)
284+
if err != nil {
285+
return err
286+
}
287+
288+
klog.Warningf("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0])
289+
290+
return nil
221291
}

pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go

Lines changed: 89 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package bootstrapteardown
33
import (
44
"context"
55
"fmt"
6+
clientv3 "go.etcd.io/etcd/client/v3"
67
"testing"
78

89
operatorv1 "github.com/openshift/api/operator/v1"
@@ -172,11 +173,95 @@ func TestCanRemoveEtcdBootstrap(t *testing.T) {
172173
etcdClient: fakeEtcdClient,
173174
}
174175

175-
safeToRemoveBootstrap, hasBootstrap, bootstrapId, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy)
176+
safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy)
176177
require.NoError(t, err)
177178
require.Equal(t, test.safeToRemove, safeToRemoveBootstrap, "safe to remove")
178179
require.Equal(t, test.hasBootstrap, hasBootstrap, "has bootstrap")
179-
require.Equal(t, test.bootstrapId, bootstrapId, "bootstrap id")
180+
if hasBootstrap {
181+
require.Equal(t, test.bootstrapId, bootstrapMember.ID, "bootstrap member")
182+
} else {
183+
require.Nil(t, bootstrapMember)
184+
}
185+
})
186+
}
187+
}
188+
189+
func TestMoveLeaderFromEtcdBootstrap(t *testing.T) {
190+
tests := map[string]struct {
191+
etcdMembers []*etcdserverpb.Member
192+
clientFakeOpts []etcdcli.FakeClientOption
193+
movedToLeader uint64
194+
expectedErr error
195+
}{
196+
"happy path no bootstrap member": {
197+
etcdMembers: u.DefaultEtcdMembers(),
198+
expectedErr: fmt.Errorf("bootstrap member was not provided"),
199+
},
200+
"happy path bootstrap not leader": {
201+
etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)),
202+
clientFakeOpts: []etcdcli.FakeClientOption{
203+
etcdcli.WithFakeStatus([]*clientv3.StatusResponse{
204+
{Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}),
205+
etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 3, Healthy: 1}),
206+
etcdcli.WithLeader(1),
207+
},
208+
expectedErr: nil,
209+
},
210+
"no healthy destination": {
211+
etcdMembers: append(u.DefaultEtcdMembers(), u.FakeEtcdBootstrapMember(3)),
212+
clientFakeOpts: []etcdcli.FakeClientOption{
213+
etcdcli.WithFakeStatus([]*clientv3.StatusResponse{
214+
{Header: &etcdserverpb.ResponseHeader{MemberId: 0}},
215+
{Header: &etcdserverpb.ResponseHeader{MemberId: 1}},
216+
{Header: &etcdserverpb.ResponseHeader{MemberId: 2}},
217+
{Header: &etcdserverpb.ResponseHeader{MemberId: 3}}}),
218+
etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 4, Healthy: 0}),
219+
etcdcli.WithLeader(3),
220+
},
221+
expectedErr: fmt.Errorf("could not find other healthy member to move leader"),
222+
},
223+
"moving leader": {
224+
etcdMembers: []*etcdserverpb.Member{
225+
u.FakeEtcdBootstrapMember(0),
226+
u.FakeEtcdMemberWithoutServer(1),
227+
},
228+
clientFakeOpts: []etcdcli.FakeClientOption{etcdcli.WithFakeStatus(
229+
[]*clientv3.StatusResponse{
230+
{Header: &etcdserverpb.ResponseHeader{MemberId: 0}},
231+
{Header: &etcdserverpb.ResponseHeader{MemberId: 1}}}),
232+
etcdcli.WithLeader(0)},
233+
movedToLeader: 1,
234+
expectedErr: nil,
235+
},
236+
}
237+
238+
for name, test := range tests {
239+
t.Run(name, func(t *testing.T) {
240+
if test.clientFakeOpts == nil {
241+
test.clientFakeOpts = []etcdcli.FakeClientOption{etcdcli.WithFakeClusterHealth(&etcdcli.FakeMemberHealth{Unhealthy: 0})}
242+
}
243+
fakeEtcdClient, err := etcdcli.NewFakeEtcdClient(test.etcdMembers, test.clientFakeOpts...)
244+
require.NoError(t, err)
245+
246+
c := &BootstrapTeardownController{
247+
etcdClient: fakeEtcdClient,
248+
}
249+
250+
var bootstrapMember *etcdserverpb.Member
251+
for _, m := range test.etcdMembers {
252+
if m.Name == "etcd-bootstrap" {
253+
bootstrapMember = m
254+
break
255+
}
256+
}
257+
258+
err = c.ensureBootstrapIsNotLeader(context.TODO(), bootstrapMember)
259+
require.Equal(t, test.expectedErr, err)
260+
if test.movedToLeader != 0 {
261+
status, err := fakeEtcdClient.Status(context.TODO(), bootstrapMember.GetClientURLs()[0])
262+
require.NoError(t, err)
263+
require.Equal(t, test.movedToLeader, status.Leader)
264+
}
180265
})
181266
}
182267
}
@@ -246,7 +331,7 @@ func TestRemoveBootstrap(t *testing.T) {
246331
indexerObjs: []interface{}{
247332
bootstrapComplete,
248333
},
249-
expectedErr: fmt.Errorf("error while removing bootstrap member [%x]: %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")),
334+
expectedErr: fmt.Errorf("error while removing bootstrap member [%x] (https://10.0.0.1:2907): %w", 0, fmt.Errorf("member with the given ID: 0 doesn't exist")),
250335
},
251336
"safe, has bootstrap, complete process, successful removal": {
252337
safeToRemove: true,
@@ -285,7 +370,7 @@ func TestRemoveBootstrap(t *testing.T) {
285370
fakeInfraLister,
286371
}
287372

288-
err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, test.bootstrapId)
373+
err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, u.FakeEtcdMemberWithoutServer(int(test.bootstrapId)))
289374
require.Equal(t, test.expectedErr, err)
290375

291376
_, status, _, err := fakeStaticPodClient.GetStaticPodOperatorState()

0 commit comments

Comments
 (0)