@@ -3,6 +3,7 @@ package bootstrapteardown
3
3
import (
4
4
"context"
5
5
"fmt"
6
+ "go.etcd.io/etcd/api/v3/etcdserverpb"
6
7
"time"
7
8
8
9
operatorv1 "github.com/openshift/api/operator/v1"
@@ -62,12 +63,29 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
62
63
return fmt .Errorf ("failed to get bootstrap scaling strategy: %w" , err )
63
64
}
64
65
// checks the actual etcd cluster membership API if etcd-bootstrap exists
65
- safeToRemoveBootstrap , hasBootstrap , bootstrapID , err := c .canRemoveEtcdBootstrap (ctx , scalingStrategy )
66
+ safeToRemoveBootstrap , hasBootstrap , bootstrapMember , err := c .canRemoveEtcdBootstrap (ctx , scalingStrategy )
66
67
if err != nil {
67
68
return fmt .Errorf ("error while canRemoveEtcdBootstrap: %w" , err )
68
69
}
69
70
70
- err = c .removeBootstrap (timeoutCtx , safeToRemoveBootstrap , hasBootstrap , bootstrapID )
71
+ if hasBootstrap {
72
+ if err := c .ensureBootstrapIsNotLeader (ctx , bootstrapMember ); err != nil {
73
+ klog .Errorf ("error while ensuring bootstrap is not leader: %v" , err )
74
+ }
75
+ }
76
+
77
+ // TODO(thomas): it seems on SNO, this is not enough, we might have a non-working apiserver at this point in time
78
+ revisionStable , err := ceohelpers .IsRevisionStable (c .operatorClient )
79
+ if err != nil {
80
+ return fmt .Errorf ("BootstrapTeardownController failed to determine stability of revisions: %w" , err )
81
+ }
82
+
83
+ if ! revisionStable {
84
+ klog .Infof ("BootstrapTeardownController is waiting for stable etcd revision before removing the bootstrap member" )
85
+ return nil
86
+ }
87
+
88
+ err = c .removeBootstrap (timeoutCtx , safeToRemoveBootstrap , hasBootstrap , bootstrapMember )
71
89
if err != nil {
72
90
_ , _ , updateErr := v1helpers .UpdateStatus (ctx , c .operatorClient , v1helpers .UpdateConditionFn (operatorv1.OperatorCondition {
73
91
Type : "BootstrapTeardownDegraded" ,
@@ -90,13 +108,20 @@ func (c *BootstrapTeardownController) sync(ctx context.Context, _ factory.SyncCo
90
108
return updateErr
91
109
}
92
110
93
- func (c * BootstrapTeardownController ) removeBootstrap (ctx context.Context , safeToRemoveBootstrap bool , hasBootstrap bool , bootstrapID uint64 ) error {
111
+ func (c * BootstrapTeardownController ) removeBootstrap (ctx context.Context , safeToRemoveBootstrap bool , hasBootstrap bool , bootstrapMember * etcdserverpb.Member ) error {
112
+ bootstrapID := uint64 (0 )
113
+ bootstrapUrl := "unknown"
114
+ if bootstrapMember != nil {
115
+ bootstrapID = bootstrapMember .ID
116
+ bootstrapUrl = bootstrapMember .GetClientURLs ()[0 ]
117
+ }
118
+
94
119
if ! hasBootstrap {
95
120
klog .V (4 ).Infof ("no bootstrap anymore setting removal status" )
96
121
// this is to ensure the status is always set correctly, even if the status update below failed
97
- updateErr := setSuccessfulBoostrapRemovalStatus (ctx , c .operatorClient )
122
+ updateErr := setSuccessfulBootstrapRemovalStatus (ctx , c .operatorClient )
98
123
if updateErr != nil {
99
- return fmt .Errorf ("error while setSuccessfulBoostrapRemovalStatus : %w" , updateErr )
124
+ return fmt .Errorf ("error while setSuccessfulBootstrapRemovalStatus : %w" , updateErr )
100
125
}
101
126
102
127
// if the bootstrap isn't present, then clearly we're available enough to terminate. This avoids any risk of flapping.
@@ -141,20 +166,21 @@ func (c *BootstrapTeardownController) removeBootstrap(ctx context.Context, safeT
141
166
if isBootstrapComplete , err := bootstrap .IsBootstrapComplete (c .configmapLister ); ! isBootstrapComplete || err != nil {
142
167
return err
143
168
}
144
- klog .Warningf ("Removing bootstrap member [%x]" , bootstrapID )
169
+
170
+ klog .Warningf ("Removing bootstrap member [%x] (%s)" , bootstrapID , bootstrapUrl )
145
171
146
172
// this is ugly until bootkube is updated, but we want to be sure that bootkube has time to be waiting to watch the condition coming back.
147
173
if err := c .etcdClient .MemberRemove (ctx , bootstrapID ); err != nil {
148
- return fmt .Errorf ("error while removing bootstrap member [%x]: %w" , bootstrapID , err )
174
+ return fmt .Errorf ("error while removing bootstrap member [%x] (%s) : %w" , bootstrapID , bootstrapUrl , err )
149
175
}
150
176
151
- klog .Infof ("Successfully removed bootstrap member [%x]" , bootstrapID )
177
+ klog .Infof ("Successfully removed bootstrap member [%x] (%s) " , bootstrapID , bootstrapUrl )
152
178
// below might fail, since the member removal can cause some downtime for raft to settle on a quorum
153
179
// it's important that everything below is properly retried above during normal controller reconciliation
154
- return setSuccessfulBoostrapRemovalStatus (ctx , c .operatorClient )
180
+ return setSuccessfulBootstrapRemovalStatus (ctx , c .operatorClient )
155
181
}
156
182
157
- func setSuccessfulBoostrapRemovalStatus (ctx context.Context , client v1helpers.StaticPodOperatorClient ) error {
183
+ func setSuccessfulBootstrapRemovalStatus (ctx context.Context , client v1helpers.StaticPodOperatorClient ) error {
158
184
_ , _ , updateErr := v1helpers .UpdateStatus (ctx , client , v1helpers .UpdateConditionFn (operatorv1.OperatorCondition {
159
185
Type : "EtcdBootstrapMemberRemoved" ,
160
186
Status : operatorv1 .ConditionTrue ,
@@ -165,57 +191,101 @@ func setSuccessfulBoostrapRemovalStatus(ctx context.Context, client v1helpers.St
165
191
}
166
192
167
193
// canRemoveEtcdBootstrap returns whether it is safe to remove bootstrap, whether bootstrap is in the list, and an error
168
- func (c * BootstrapTeardownController ) canRemoveEtcdBootstrap (ctx context.Context , scalingStrategy ceohelpers.BootstrapScalingStrategy ) (bool , bool , uint64 , error ) {
194
+ func (c * BootstrapTeardownController ) canRemoveEtcdBootstrap (ctx context.Context , scalingStrategy ceohelpers.BootstrapScalingStrategy ) (bool , bool , * etcdserverpb. Member , error ) {
169
195
members , err := c .etcdClient .MemberList (ctx )
170
196
if err != nil {
171
- return false , false , 0 , err
197
+ return false , false , nil , err
172
198
}
173
199
174
200
var hasBootstrap bool
175
- var bootstrapMemberID uint64
201
+ var bootstrapMember * etcdserverpb. Member
176
202
for _ , member := range members {
177
203
if member .Name == "etcd-bootstrap" {
178
204
hasBootstrap = true
179
- bootstrapMemberID = member . ID
205
+ bootstrapMember = member
180
206
break
181
207
}
182
208
}
183
209
if ! hasBootstrap {
184
- return false , hasBootstrap , bootstrapMemberID , nil
210
+ return false , hasBootstrap , bootstrapMember , nil
185
211
}
186
212
187
213
// First, enforce the main HA invariants in terms of member counts.
188
214
switch scalingStrategy {
189
215
case ceohelpers .HAScalingStrategy :
190
216
if len (members ) < 4 {
191
- return false , hasBootstrap , bootstrapMemberID , nil
217
+ return false , hasBootstrap , bootstrapMember , nil
192
218
}
193
219
case ceohelpers .DelayedHAScalingStrategy :
194
220
if len (members ) < 3 {
195
- return false , hasBootstrap , bootstrapMemberID , nil
221
+ return false , hasBootstrap , bootstrapMember , nil
196
222
}
197
223
case ceohelpers .UnsafeScalingStrategy :
198
224
if len (members ) < 2 {
199
- return false , hasBootstrap , bootstrapMemberID , nil
225
+ return false , hasBootstrap , bootstrapMember , nil
200
226
}
201
227
}
202
228
203
229
// Next, given member counts are satisfied, check member health.
204
230
unhealthyMembers , err := c .etcdClient .UnhealthyMembers (ctx )
205
231
if err != nil {
206
- return false , hasBootstrap , bootstrapMemberID , nil
232
+ return false , hasBootstrap , bootstrapMember , nil
207
233
}
208
234
209
235
// the etcd-bootstrap member is allowed to be unhealthy and can still be removed
210
236
switch {
211
237
case len (unhealthyMembers ) == 0 :
212
- return true , hasBootstrap , bootstrapMemberID , nil
238
+ return true , hasBootstrap , bootstrapMember , nil
213
239
case len (unhealthyMembers ) > 1 :
214
- return false , hasBootstrap , bootstrapMemberID , nil
240
+ return false , hasBootstrap , bootstrapMember , nil
215
241
default :
216
242
if unhealthyMembers [0 ].Name == "etcd-bootstrap" {
217
- return true , true , unhealthyMembers [0 ].ID , nil
243
+ return true , true , bootstrapMember , nil
244
+ }
245
+ return false , hasBootstrap , bootstrapMember , nil
246
+ }
247
+ }
248
+
249
+ func (c * BootstrapTeardownController ) ensureBootstrapIsNotLeader (ctx context.Context , bootstrapMember * etcdserverpb.Member ) error {
250
+ if bootstrapMember == nil {
251
+ return fmt .Errorf ("bootstrap member was not provided" )
252
+ }
253
+ status , err := c .etcdClient .Status (ctx , bootstrapMember .ClientURLs [0 ])
254
+ if err != nil {
255
+ return fmt .Errorf ("could not find bootstrap member status: %w" , err )
256
+ }
257
+
258
+ if bootstrapMember .ID != status .Leader {
259
+ return nil
260
+ }
261
+
262
+ klog .Warningf ("Bootstrap member [%x] (%s) detected as leader, trying to move elsewhere..." , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ])
263
+
264
+ memberHealth , err := c .etcdClient .MemberHealth (ctx )
265
+ if err != nil {
266
+ return fmt .Errorf ("could not find member health: %w" , err )
267
+ }
268
+
269
+ var otherMember * etcdserverpb.Member
270
+ // we can pick any other healthy voting member as the target to move to
271
+ for _ , m := range memberHealth .GetHealthyMembers () {
272
+ if m .ID != bootstrapMember .ID && ! m .IsLearner {
273
+ otherMember = m
274
+ break
218
275
}
219
- return false , hasBootstrap , bootstrapMemberID , nil
220
276
}
277
+
278
+ if otherMember == nil {
279
+ return fmt .Errorf ("could not find other healthy member to move leader" )
280
+ }
281
+
282
+ klog .Warningf ("Moving lead from bootstrap member [%x] (%s) detected as leader to [%x] (%s)" , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ], otherMember .ID , otherMember .GetClientURLs ()[0 ])
283
+ err = c .etcdClient .MoveLeader (ctx , otherMember .ID )
284
+ if err != nil {
285
+ return err
286
+ }
287
+
288
+ klog .Warningf ("Moving lead from bootstrap member [%x] (%s) to [%x] (%s) succesfully!" , bootstrapMember .ID , bootstrapMember .GetClientURLs ()[0 ], otherMember .ID , otherMember .GetClientURLs ()[0 ])
289
+
290
+ return nil
221
291
}
0 commit comments