diff --git a/pkg/etcdcli/etcdcli.go b/pkg/etcdcli/etcdcli.go index ba640547e..80a535b66 100644 --- a/pkg/etcdcli/etcdcli.go +++ b/pkg/etcdcli/etcdcli.go @@ -264,6 +264,20 @@ func (g *etcdClientGetter) MemberRemove(ctx context.Context, memberID uint64) er return err } +func (g *etcdClientGetter) MoveLeader(ctx context.Context, toMember uint64) error { + cli, err := g.clientPool.Get() + if err != nil { + return err + } + + defer g.clientPool.Return(cli) + + ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout) + defer cancel() + _, err = cli.MoveLeader(ctx, toMember) + return err +} + func (g *etcdClientGetter) MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) { cli, err := g.clientPool.Get() if err != nil { @@ -443,26 +457,20 @@ func (g *etcdClientGetter) IsMemberHealthy(ctx context.Context, member *etcdserv return false, nil } -func (g *etcdClientGetter) MemberStatus(ctx context.Context, member *etcdserverpb.Member) string { +func (g *etcdClientGetter) MemberStatus(ctx context.Context, member *etcdserverpb.Member) (*clientv3.StatusResponse, error) { cli, err := g.clientPool.Get() if err != nil { - klog.Errorf("error getting etcd client: %#v", err) - return EtcdMemberStatusUnknown + return nil, err } defer g.clientPool.Return(cli) - if len(member.ClientURLs) == 0 && member.Name == "" { - return EtcdMemberStatusNotStarted + if len(member.ClientURLs) == 0 { + return nil, fmt.Errorf("member has no etcd clientURLs") } ctx, cancel := context.WithTimeout(ctx, DefaultClientTimeout) defer cancel() - _, err = cli.Status(ctx, member.ClientURLs[0]) - if err != nil { - klog.Errorf("error getting etcd member %s status: %#v", member.Name, err) - return EtcdMemberStatusUnhealthy - } - return EtcdMemberStatusAvailable + return cli.Status(ctx, member.ClientURLs[0]) } // Defragment creates a new uncached clientv3 to the given member url and calls clientv3.Client.Defragment. diff --git a/pkg/etcdcli/helpers.go b/pkg/etcdcli/helpers.go index e16f06877..6f50227da 100644 --- a/pkg/etcdcli/helpers.go +++ b/pkg/etcdcli/helpers.go @@ -105,6 +105,14 @@ func (f *fakeEtcdClient) MemberRemove(ctx context.Context, memberID uint64) erro return nil } +func (f *fakeEtcdClient) MoveLeader(ctx context.Context, toMember uint64) error { + for _, status := range f.opts.status { + status.Leader = toMember + } + + return nil +} + func (f *fakeEtcdClient) MemberHealth(ctx context.Context) (memberHealth, error) { var healthy, unhealthy int var memberHealth memberHealth @@ -162,8 +170,18 @@ func (f *fakeEtcdClient) HealthyVotingMembers(ctx context.Context) ([]*etcdserve return filterVotingMembers(members), nil } -func (f *fakeEtcdClient) MemberStatus(ctx context.Context, member *etcdserverpb.Member) string { - panic("implement me") +func (f *fakeEtcdClient) MemberStatus(ctx context.Context, member *etcdserverpb.Member) (*clientv3.StatusResponse, error) { + // Find the status for this member + for _, status := range f.opts.status { + if status.Header != nil && status.Header.MemberId == member.ID { + return status, nil + } + } + // Return a default status if none found + return &clientv3.StatusResponse{ + Header: &etcdserverpb.ResponseHeader{MemberId: member.ID}, + Leader: member.ID, // Default to self as leader + }, nil } func (f *fakeEtcdClient) GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error) { @@ -251,6 +269,14 @@ func WithFakeStatus(status []*clientv3.StatusResponse) FakeClientOption { } } +func WithLeader(leader uint64) FakeClientOption { + return func(fo *FakeClientOptions) { + for _, status := range fo.status { + status.Leader = leader + } + } +} + // WithFakeDefragErrors configures each call to Defrag to consume one error from the given slice func WithFakeDefragErrors(errors []error) FakeClientOption { return func(fo *FakeClientOptions) { diff --git a/pkg/etcdcli/interfaces.go b/pkg/etcdcli/interfaces.go index 6a31a9c62..a9c660fc2 100644 --- a/pkg/etcdcli/interfaces.go +++ b/pkg/etcdcli/interfaces.go @@ -25,6 +25,7 @@ type EtcdClient interface { HealthyMemberLister UnhealthyMemberLister MemberStatusChecker + LeaderMover Status GetMember(ctx context.Context, name string) (*etcdserverpb.Member, error) @@ -64,6 +65,10 @@ type MemberRemover interface { MemberRemove(ctx context.Context, memberID uint64) error } +type LeaderMover interface { + MoveLeader(ctx context.Context, toMember uint64) error +} + type MemberLister interface { // MemberList lists all members in a cluster MemberList(ctx context.Context) ([]*etcdserverpb.Member, error) @@ -86,5 +91,6 @@ type UnhealthyMemberLister interface { } type MemberStatusChecker interface { - MemberStatus(ctx context.Context, member *etcdserverpb.Member) string + // MemberStatus will return the etcd status response for the given member + MemberStatus(ctx context.Context, member *etcdserverpb.Member) (*clientv3.StatusResponse, error) } diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go index ee0e262fe..d17c4ccef 100644 --- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go +++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller.go @@ -3,9 +3,11 @@ package bootstrapteardown import ( "context" "fmt" - "k8s.io/client-go/tools/cache" "time" + "go.etcd.io/etcd/api/v3/etcdserverpb" + "k8s.io/client-go/tools/cache" + operatorv1 "github.com/openshift/api/operator/v1" configv1listers "github.com/openshift/client-go/config/listers/config/v1" "github.com/openshift/cluster-etcd-operator/pkg/operator/health" @@ -68,12 +70,12 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo return fmt.Errorf("failed to get bootstrap scaling strategy: %w", err) } // checks the actual etcd cluster membership API if etcd-bootstrap exists - safeToRemoveBootstrap, hasBootstrap, bootstrapID, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy) + safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(ctx, scalingStrategy) if err != nil { return fmt.Errorf("error while canRemoveEtcdBootstrap: %w", err) } - err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapID) + err = c.removeBootstrap(timeoutCtx, safeToRemoveBootstrap, hasBootstrap, bootstrapMember) if err != nil { _, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ Type: "BootstrapTeardownDegraded", @@ -96,13 +98,13 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo return updateErr } -func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapID uint64) error { +func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeToRemoveBootstrap bool, hasBootstrap bool, bootstrapMember *etcdserverpb.Member) error { if !hasBootstrap { klog.V(4).Infof("no bootstrap anymore setting removal status") // this is to ensure the status is always set correctly, even if the status update below failed - updateErr := setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient) + updateErr := setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient) if updateErr != nil { - return fmt.Errorf("error while setSuccessfulBoostrapRemovalStatus: %w", updateErr) + return fmt.Errorf("error while setSuccessfulBootstrapRemovalStatus: %w", updateErr) } // if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping. @@ -120,6 +122,7 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT return nil } + bootstrapID := bootstrapMember.ID if !safeToRemoveBootstrap { _, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ Type: "EtcdRunningInCluster", @@ -147,6 +150,29 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT if isBootstrapComplete, err := bootstrap.IsBootstrapComplete(c.configmapLister); !isBootstrapComplete || err != nil { return err } + + moved, err := c.ensureBootstrapIsNotLeader(ctx, bootstrapMember) + if err != nil { + return err + } + + // if we have just moved it, we will skip this sync iteration to backoff the controller - the next resync will happen after a minute anyway + if moved { + klog.Warningf("Leader just moved, waiting for next resync to remove bootstrap member [%x]", bootstrapID) + c.eventRecorder.Eventf("Bootstrap member was leader and was moved away", "bootstrap member [%x] should no longer be leader", bootstrapID) + _, _, updateErr := v1helpers.UpdateStatus(ctx, c.operatorClient, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ + Type: "EtcdLeaderMovedAwayFromBootstrap", + Status: operatorv1.ConditionTrue, + Reason: "BootstrapIsLeader", + Message: "bootstrap was leader and has been moved", + })) + if updateErr != nil { + return fmt.Errorf("error while updating EtcdLeaderMovedAwayFromBootstrap: %w", updateErr) + } + + return nil + } + klog.Warningf("Removing bootstrap member [%x]", bootstrapID) c.eventRecorder.Eventf("Removing bootstrap member", "attempting to remove bootstrap member [%x]", bootstrapID) @@ -159,10 +185,10 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT c.eventRecorder.Eventf("Bootstrap member removed", "successfully removed bootstrap member [%x]", bootstrapID) // below might fail, since the member removal can cause some downtime for raft to settle on a quorum // it's important that everything below is properly retried above during normal controller reconciliation - return setSuccessfulBoostrapRemovalStatus(ctx, c.operatorClient) + return setSuccessfulBootstrapRemovalStatus(ctx, c.operatorClient) } -func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error { +func setSuccessfulBootstrapRemovalStatus(ctx context.Context, client v1helpers.StaticPodOperatorClient) error { _, _, updateErr := v1helpers.UpdateStatus(ctx, client, v1helpers.UpdateConditionFn(operatorv1.OperatorCondition{ Type: "EtcdBootstrapMemberRemoved", Status: operatorv1.ConditionTrue, @@ -173,57 +199,76 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St } // canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error -func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, uint64, error) { +func (c *BootstrapTeardownController) canRemoveEtcdBootstrap(ctx context.Context, scalingStrategy ceohelpers.BootstrapScalingStrategy) (bool, bool, *etcdserverpb.Member, error) { members, err := c.etcdClient.MemberList(ctx) if err != nil { - return false, false, 0, err + return false, false, nil, err } var hasBootstrap bool - var bootstrapMemberID uint64 + var bootstrapMember *etcdserverpb.Member for _, member := range members { if member.Name == "etcd-bootstrap" { hasBootstrap = true - bootstrapMemberID = member.ID + bootstrapMember = member break } } if !hasBootstrap { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } // First, enforce the main HA invariants in terms of member counts. switch scalingStrategy { case ceohelpers.HAScalingStrategy: if len(members) < 4 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } case ceohelpers.DelayedHAScalingStrategy, ceohelpers.TwoNodeScalingStrategy: if len(members) < 3 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } case ceohelpers.UnsafeScalingStrategy, ceohelpers.DelayedTwoNodeScalingStrategy: if len(members) < 2 { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } } // Next, given member counts are satisfied, check member health. unhealthyMembers, err := c.etcdClient.UnhealthyMembers(ctx) if err != nil { - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } // the etcd-bootstrap member is allowed to be unhealthy and can still be removed switch { case len(unhealthyMembers) == 0: - return true, hasBootstrap, bootstrapMemberID, nil + return true, hasBootstrap, bootstrapMember, nil case len(unhealthyMembers) > 1: - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil default: if unhealthyMembers[0].Name == "etcd-bootstrap" { - return true, true, unhealthyMembers[0].ID, nil + return true, true, unhealthyMembers[0], nil } - return false, hasBootstrap, bootstrapMemberID, nil + return false, hasBootstrap, bootstrapMember, nil } } + +func (c *BootstrapTeardownController) ensureBootstrapIsNotLeader(ctx context.Context, bootstrapMember *etcdserverpb.Member) (bool, error) { + members, err := c.etcdClient.MemberList(ctx) + if err != nil { + return false, fmt.Errorf("could not list while ensuring bootstrap is not the leader: %w", err) + } + + leader, err := ceohelpers.FindLeader(ctx, c.etcdClient, members) + if err != nil || leader == nil { + return false, fmt.Errorf("could not find leader: %w", err) + } + + if bootstrapMember.ID != leader.ID { + return false, nil + } + + klog.Warningf("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere...", bootstrapMember.ID, bootstrapMember.GetClientURLs()[0]) + return ceohelpers.MoveLeaderToAnotherMember(ctx, c.etcdClient, leader, members) +} diff --git a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go index a0539d5d8..7a2f18885 100644 --- a/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go +++ b/pkg/operator/bootstrapteardown/bootstrap_teardown_controller_test.go @@ -5,6 +5,7 @@ import ( "fmt" "testing" + clientv3 "go.etcd.io/etcd/client/v3" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -56,6 +57,13 @@ var ( Message: "enough members found", } + conditionBootstrapLeaderMoved = operatorv1.OperatorCondition{ + Type: "EtcdLeaderMovedAwayFromBootstrap", + Status: "True", + Reason: "BootstrapIsLeader", + Message: "bootstrap was leader and has been moved", + } + conditionEtcdMemberRemoved = operatorv1.OperatorCondition{ Type: "EtcdBootstrapMemberRemoved", Status: "True", @@ -262,11 +270,16 @@ func TestCanRemoveEtcdBootstrap(t *testing.T) { etcdClient: fakeEtcdClient, } - safeToRemoveBootstrap, hasBootstrap, bootstrapId, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy) + safeToRemoveBootstrap, hasBootstrap, bootstrapMember, err := c.canRemoveEtcdBootstrap(context.TODO(), test.scalingStrategy) require.NoError(t, err) require.Equal(t, test.safeToRemove, safeToRemoveBootstrap, "safe to remove") require.Equal(t, test.hasBootstrap, hasBootstrap, "has bootstrap") - require.Equal(t, test.bootstrapId, bootstrapId, "bootstrap id") + if test.hasBootstrap { + require.NotNil(t, bootstrapMember, "bootstrap member") + require.Equal(t, test.bootstrapId, bootstrapMember.ID, "bootstrap id") + } else { + require.Nil(t, bootstrapMember, "bootstrap member") + } }) } } @@ -276,6 +289,7 @@ func TestRemoveBootstrap(t *testing.T) { tests := map[string]struct { safeToRemove bool hasBootstrap bool + bootstrapIsLead bool bootstrapId uint64 expectedConditions []operatorv1.OperatorCondition indexerObjs []interface{} @@ -358,6 +372,21 @@ func TestRemoveBootstrap(t *testing.T) { }, expEvents: 2, }, + + "safe, has bootstrap, moving leader, not removing the bootstrap member": { + safeToRemove: true, + hasBootstrap: true, + bootstrapIsLead: true, + bootstrapId: 1, + expectedConditions: []operatorv1.OperatorCondition{ + conditionEnoughEtcdMembers, + conditionBootstrapLeaderMoved, + }, + indexerObjs: []interface{}{ + bootstrapComplete, + }, + expEvents: 0, + }, } for name, test := range tests { @@ -372,8 +401,22 @@ func TestRemoveBootstrap(t *testing.T) { fakeConfigmapLister := corev1listers.NewConfigMapLister(indexer) fakeInfraLister := configv1listers.NewInfrastructureLister(indexer) fakeStaticPodClient := v1helpers.NewFakeStaticPodOperatorClient(&operatorv1.StaticPodOperatorSpec{}, &operatorv1.StaticPodOperatorStatus{}, nil, nil) - fakeEtcdClient, err := etcdcli.NewFakeEtcdClient([]*etcdserverpb.Member{u.FakeEtcdBootstrapMember(1)}) + + leaderOpt := etcdcli.WithLeader(2) + if test.bootstrapIsLead { + leaderOpt = etcdcli.WithLeader(1) + } + fakeEtcdClient, err := etcdcli.NewFakeEtcdClient( + []*etcdserverpb.Member{ + u.FakeEtcdBootstrapMember(1), + u.FakeEtcdMemberWithoutServer(2), + }, + etcdcli.WithFakeStatus([]*clientv3.StatusResponse{ + {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, IsLearner: false}, + {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, IsLearner: false}, + }), leaderOpt) require.NoError(t, err) + fakeKubeClient := fake.NewClientset([]runtime.Object{}...) fakeRecorder := events.NewRecorder(fakeKubeClient.CoreV1().Events(operatorclient.OperatorNamespace), "test-bootstrap-teardown-controller", &corev1.ObjectReference{}, clock.RealClock{}) @@ -387,7 +430,7 @@ func TestRemoveBootstrap(t *testing.T) { fakeRecorder, } - err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, test.bootstrapId) + err = c.removeBootstrap(context.TODO(), test.safeToRemove, test.hasBootstrap, u.FakeEtcdMemberWithoutServer(int(test.bootstrapId))) require.Equal(t, test.expectedErr, err) if test.expEvents > 0 { diff --git a/pkg/operator/bootstraptest/bootstrap_lead_test_controller.go b/pkg/operator/bootstraptest/bootstrap_lead_test_controller.go new file mode 100644 index 000000000..9a0345d64 --- /dev/null +++ b/pkg/operator/bootstraptest/bootstrap_lead_test_controller.go @@ -0,0 +1,66 @@ +package bootstraptest + +import ( + "context" + "time" + + "github.com/openshift/cluster-etcd-operator/pkg/etcdcli" + "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers" + "github.com/openshift/cluster-etcd-operator/pkg/operator/health" + "github.com/openshift/library-go/pkg/controller/factory" + "github.com/openshift/library-go/pkg/operator/events" + "go.etcd.io/etcd/api/v3/etcdserverpb" + "k8s.io/klog/v2" +) + +type BootstrapTestController struct { + etcdClient etcdcli.EtcdClient +} + +func NewBootstrapTestController( + etcdClient etcdcli.EtcdClient, + eventRecorder events.Recorder, +) factory.Controller { + + evc := eventRecorder.WithComponentSuffix("bootstrap-test-controller") + c := &BootstrapTestController{ + etcdClient: etcdClient, + } + + syncer := health.NewDefaultCheckingSyncWrapper(c.sync) + return factory.New().ResyncEvery(2*time.Minute).WithSync(syncer.Sync). + ToController("BootstrapTestController", evc) +} + +func (c *BootstrapTestController) sync(ctx context.Context, _ factory.SyncContext) error { + + members, err := c.etcdClient.MemberList(ctx) + if err != nil { + return err + } + + var hasBootstrap bool + var bootstrapMember *etcdserverpb.Member + for _, member := range members { + if member.Name == "etcd-bootstrap" { + hasBootstrap = true + bootstrapMember = member + break + } + } + + if hasBootstrap { + klog.Warningf("TEST_ONLY moving leader to the bootstrap member") + moved, err := ceohelpers.MoveLeaderToAnotherMember(ctx, c.etcdClient, bootstrapMember, members) + if err != nil { + return err + } + if moved { + klog.Warningf("TEST_ONLY successfully moved to the bootstrap member") + } else { + klog.Warningf("TEST_ONLY failed to move to the bootstrap member") + } + } + + return nil +} diff --git a/pkg/operator/ceohelpers/leader.go b/pkg/operator/ceohelpers/leader.go new file mode 100644 index 000000000..f583b4c8b --- /dev/null +++ b/pkg/operator/ceohelpers/leader.go @@ -0,0 +1,52 @@ +package ceohelpers + +import ( + "context" + "fmt" + + "github.com/openshift/cluster-etcd-operator/pkg/etcdcli" + "go.etcd.io/etcd/api/v3/etcdserverpb" + "k8s.io/klog/v2" +) + +func FindLeader(ctx context.Context, client etcdcli.MemberStatusChecker, memberList []*etcdserverpb.Member) (*etcdserverpb.Member, error) { + var leaderId uint64 + var mx *etcdserverpb.Member + for _, member := range memberList { + status, err := client.MemberStatus(ctx, member) + if err != nil { + return nil, fmt.Errorf("failed to get member status while finding leader: %w", err) + } + if leaderId != 0 && leaderId != status.Leader { + return nil, fmt.Errorf("inconsistent leader reported by different members: [%x] vs. [%x]", leaderId, status.Leader) + } + leaderId = status.Leader + if member.ID == leaderId { + mx = member + } + } + return mx, nil +} + +func MoveLeaderToAnotherMember(ctx context.Context, client etcdcli.LeaderMover, leader *etcdserverpb.Member, + memberList []*etcdserverpb.Member) (bool, error) { + var otherMember *etcdserverpb.Member + for _, member := range memberList { + if member.ID != leader.ID { + otherMember = member + break + } + } + + if otherMember == nil { + return false, fmt.Errorf("no follower member found for leadership transfer: %v", memberList) + } + + err := client.MoveLeader(ctx, otherMember.ID) + if err != nil { + return false, err + } + + klog.Warningf("Moved lead from member [%x] (%s) to [%x] (%s) successfully!", leader.ID, leader.GetClientURLs()[0], otherMember.ID, otherMember.GetClientURLs()[0]) + return true, nil +} diff --git a/pkg/operator/ceohelpers/leader_test.go b/pkg/operator/ceohelpers/leader_test.go new file mode 100644 index 000000000..0f799db14 --- /dev/null +++ b/pkg/operator/ceohelpers/leader_test.go @@ -0,0 +1,315 @@ +package ceohelpers + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.etcd.io/etcd/api/v3/etcdserverpb" + clientv3 "go.etcd.io/etcd/client/v3" +) + +// mockMemberStatusChecker implements MemberStatusChecker for testing +type mockMemberStatusChecker struct { + memberStatuses map[uint64]*clientv3.StatusResponse + errors map[uint64]error +} + +func (m *mockMemberStatusChecker) MemberStatus(ctx context.Context, member *etcdserverpb.Member) (*clientv3.StatusResponse, error) { + if err, exists := m.errors[member.ID]; exists { + return nil, err + } + if status, exists := m.memberStatuses[member.ID]; exists { + return status, nil + } + return nil, errors.New("member status not found") +} + +// mockLeaderMover implements LeaderMover for testing +type mockLeaderMover struct { + moveLeaderErrors map[uint64]error +} + +func (m *mockLeaderMover) MoveLeader(ctx context.Context, toMember uint64) error { + if err, exists := m.moveLeaderErrors[toMember]; exists { + return err + } + return nil +} + +func TestFindLeader(t *testing.T) { + ctx := context.Background() + + tests := []struct { + name string + memberList []*etcdserverpb.Member + memberStatuses map[uint64]*clientv3.StatusResponse + errors map[uint64]error + expectedLeader *etcdserverpb.Member + expectedError string + }{ + { + name: "successfully find leader", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 2}, + 2: {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, Leader: 2}, + 3: {Header: &etcdserverpb.ResponseHeader{MemberId: 3}, Leader: 2}, + }, + expectedLeader: &etcdserverpb.Member{ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + }, + { + name: "leader is first member", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 1}, + 2: {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, Leader: 1}, + }, + expectedLeader: &etcdserverpb.Member{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + }, + { + name: "leader is last member", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 3}, + 2: {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, Leader: 3}, + 3: {Header: &etcdserverpb.ResponseHeader{MemberId: 3}, Leader: 3}, + }, + expectedLeader: &etcdserverpb.Member{ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}}, + }, + { + name: "single member cluster", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 1}, + }, + expectedLeader: &etcdserverpb.Member{ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + }, + { + name: "member status error", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + }, + errors: map[uint64]error{ + 1: errors.New("connection failed"), + }, + expectedError: "failed to get member status while finding leader: connection failed", + }, + { + name: "inconsistent leader reported", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 2}, + 2: {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, Leader: 2}, + 3: {Header: &etcdserverpb.ResponseHeader{MemberId: 3}, Leader: 3}, + }, + expectedError: "inconsistent leader reported by different members: [2] vs. [3]", + }, + { + name: "leader not in member list", + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}}, + }, + memberStatuses: map[uint64]*clientv3.StatusResponse{ + 1: {Header: &etcdserverpb.ResponseHeader{MemberId: 1}, Leader: 3}, + 2: {Header: &etcdserverpb.ResponseHeader{MemberId: 2}, Leader: 3}, + }, + expectedLeader: nil, // Leader ID 3 is not in the member list + }, + { + name: "empty member list", + memberList: []*etcdserverpb.Member{}, + expectedLeader: nil, + }, + { + name: "nil member list", + memberList: nil, + expectedLeader: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockClient := &mockMemberStatusChecker{ + memberStatuses: tt.memberStatuses, + errors: tt.errors, + } + + leader, err := FindLeader(ctx, mockClient, tt.memberList) + + if tt.expectedError != "" { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.expectedError) + return + } + + require.NoError(t, err) + if tt.expectedLeader == nil { + assert.Nil(t, leader) + } else { + require.NotNil(t, leader) + assert.Equal(t, tt.expectedLeader.ID, leader.ID) + assert.Equal(t, tt.expectedLeader.Name, leader.Name) + } + }) + } +} + +func TestMoveLeaderToAnotherMember(t *testing.T) { + ctx := context.Background() + + tests := []struct { + name string + leader *etcdserverpb.Member + memberList []*etcdserverpb.Member + moveLeaderErrors map[uint64]error + expectedSuccess bool + expectedError string + }{ + { + name: "successfully move leader to another member", + leader: &etcdserverpb.Member{ + ID: 1, + Name: "etcd-1", + ClientURLs: []string{"https://10.0.0.1:2379"}, + PeerURLs: []string{"https://10.0.0.1:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}}, + }, + expectedSuccess: true, + }, + { + name: "successfully move leader when leader is in middle of list", + leader: &etcdserverpb.Member{ + ID: 2, + Name: "etcd-2", + ClientURLs: []string{"https://10.0.0.2:2379"}, + PeerURLs: []string{"https://10.0.0.2:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}}, + }, + expectedSuccess: true, + }, + { + name: "successfully move leader when leader is last in list", + leader: &etcdserverpb.Member{ + ID: 3, + Name: "etcd-3", + ClientURLs: []string{"https://10.0.0.3:2379"}, + PeerURLs: []string{"https://10.0.0.3:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}}, + {ID: 3, Name: "etcd-3", ClientURLs: []string{"https://10.0.0.3:2379"}, PeerURLs: []string{"https://10.0.0.3:2380"}}, + }, + expectedSuccess: true, + }, + { + name: "move leader fails with error", + leader: &etcdserverpb.Member{ + ID: 1, + Name: "etcd-1", + ClientURLs: []string{"https://10.0.0.1:2379"}, + PeerURLs: []string{"https://10.0.0.1:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + {ID: 2, Name: "etcd-2", ClientURLs: []string{"https://10.0.0.2:2379"}, PeerURLs: []string{"https://10.0.0.2:2380"}}, + }, + moveLeaderErrors: map[uint64]error{ + 2: errors.New("move leader failed: connection timeout"), + }, + expectedSuccess: false, + expectedError: "move leader failed: connection timeout", + }, + { + name: "no follower member found - single member cluster", + leader: &etcdserverpb.Member{ + ID: 1, + Name: "etcd-1", + ClientURLs: []string{"https://10.0.0.1:2379"}, + PeerURLs: []string{"https://10.0.0.1:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + }, + expectedSuccess: false, + expectedError: "no follower member found for the members: [ID:1 name:\"etcd-1\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ]", + }, + { + name: "no follower member found - all members have same ID", + leader: &etcdserverpb.Member{ + ID: 1, + Name: "etcd-1", + ClientURLs: []string{"https://10.0.0.1:2379"}, + PeerURLs: []string{"https://10.0.0.1:2380"}, + }, + memberList: []*etcdserverpb.Member{ + {ID: 1, Name: "etcd-1", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + {ID: 1, Name: "etcd-1-copy", ClientURLs: []string{"https://10.0.0.1:2379"}, PeerURLs: []string{"https://10.0.0.1:2380"}}, + }, + expectedSuccess: false, + expectedError: "no follower member found for the members: [ID:1 name:\"etcd-1\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ID:1 name:\"etcd-1-copy\" peerURLs:\"https://10.0.0.1:2380\" clientURLs:\"https://10.0.0.1:2379\" ]", + }, + { + name: "empty member list", + leader: &etcdserverpb.Member{ + ID: 1, + Name: "etcd-1", + ClientURLs: []string{"https://10.0.0.1:2379"}, + PeerURLs: []string{"https://10.0.0.1:2380"}, + }, + memberList: []*etcdserverpb.Member{}, + expectedSuccess: false, + expectedError: "no follower member found for the members: []", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mockClient := &mockLeaderMover{ + moveLeaderErrors: tt.moveLeaderErrors, + } + + success, err := MoveLeaderToAnotherMember(ctx, mockClient, tt.leader, tt.memberList) + + if tt.expectedError != "" { + require.Error(t, err) + assert.Contains(t, err.Error(), tt.expectedError) + assert.False(t, success) + return + } + + require.NoError(t, err) + assert.Equal(t, tt.expectedSuccess, success) + }) + } +} diff --git a/pkg/operator/starter.go b/pkg/operator/starter.go index 650aef31c..44503d700 100644 --- a/pkg/operator/starter.go +++ b/pkg/operator/starter.go @@ -19,6 +19,7 @@ import ( operatorversionedclient "github.com/openshift/client-go/operator/clientset/versioned" operatorversionedclientv1alpha1 "github.com/openshift/client-go/operator/clientset/versioned/typed/operator/v1alpha1" operatorv1informers "github.com/openshift/client-go/operator/informers/externalversions" + "github.com/openshift/cluster-etcd-operator/pkg/operator/bootstraptest" "github.com/openshift/library-go/pkg/controller/controllercmd" "github.com/openshift/library-go/pkg/controller/factory" "github.com/openshift/library-go/pkg/operator/configobserver/featuregates" @@ -467,6 +468,9 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle configInformers.Config().V1().Infrastructures().Lister(), ) + // TODO(thomas): TEST ONLY, REMOVE AGAIN + bootstrapTestController := bootstraptest.NewBootstrapTestController(etcdClient, controllerContext.EventRecorder) + scriptController := scriptcontroller.NewScriptControllerController( AlivenessChecker, operatorClient, @@ -640,6 +644,8 @@ func RunOperator(ctx context.Context, controllerContext *controllercmd.Controlle go clusterMemberController.Run(ctx, 1) go etcdMembersController.Run(ctx, 1) go bootstrapTeardownController.Run(ctx, 1) + // TODO(thomas): remove again + go bootstrapTestController.Run(ctx, 1) go unsupportedConfigOverridesController.Run(ctx, 1) go scriptController.Run(ctx, 1) go defragController.Run(ctx, 1)