
Commit d0aa54f

TNF: Wait for 2 cp nodes
When the cluster is installed with the assisted installer, the auth jobs are created while only one control plane node is available. Wait with the job creation until both nodes exist, ensure the jobs are created only once, and double-check the node count inside the jobs.

Signed-off-by: Marc Sluiter <[email protected]>
1 parent afc5cf8 commit d0aa54f
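
The core of the change, shown here as a minimal, self-contained sketch against a fake clientset: a node-add handler lists the nodes from the informer cache and starts the job controllers exactly once, only after both nodes are present. The informer wiring, the node names, and the printed "starting job controllers" stand-in are illustrative assumptions, not the operator code.

```go
package main

import (
	"context"
	"fmt"
	"sync"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/client-go/tools/cache"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	client := fake.NewSimpleClientset()
	factory := informers.NewSharedInformerFactory(client, 0)
	nodeInformer := factory.Core().V1().Nodes().Informer()
	nodeLister := factory.Core().V1().Nodes().Lister()

	var once sync.Once
	nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			// List from the informer cache; on the first AddFunc call the
			// cache may already hold both nodes.
			nodes, err := nodeLister.List(labels.Everything())
			if err != nil || len(nodes) != 2 {
				return // keep waiting for the second control plane node
			}
			// Start the job controllers exactly once, even though AddFunc
			// fires for every node.
			once.Do(func() {
				fmt.Printf("starting job controllers for %s and %s\n",
					nodes[0].Name, nodes[1].Name)
			})
		},
	})

	factory.Start(ctx.Done())
	cache.WaitForCacheSync(ctx.Done(), nodeInformer.HasSynced)

	// Simulate the two control plane nodes appearing one after the other.
	for _, name := range []string{"cp-0", "cp-1"} {
		node := &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: name}}
		_, _ = client.CoreV1().Nodes().Create(ctx, node, metav1.CreateOptions{})
	}
	time.Sleep(time.Second) // give the informer time to deliver both events
}
```
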

File tree: 3 files changed, 89 additions and 7 deletions

- pkg/tnf/operator/starter.go
- pkg/tnf/pkg/config/cluster.go
- pkg/tnf/pkg/config/cluster_test.go

pkg/tnf/operator/starter.go

Lines changed: 31 additions & 6 deletions
```diff
@@ -4,6 +4,7 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"sync"
 
 	operatorv1 "github.com/openshift/api/operator/v1"
 	configv1informers "github.com/openshift/client-go/config/informers/externalversions"
@@ -16,8 +17,10 @@ import (
 	"github.com/openshift/library-go/pkg/operator/v1helpers"
 	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/kubernetes"
+	corev1listers "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/klog/v2"
 
@@ -55,25 +58,47 @@ func HandleDualReplicaClusters(
 	runExternalEtcdSupportController(ctx, controllerContext, operatorClient, envVarGetter, kubeInformersForNamespaces, configInformers, networkInformer, controlPlaneNodeInformer, kubeClient)
 	runTnfResourceController(ctx, controllerContext, kubeClient, dynamicClient, operatorClient, kubeInformersForNamespaces)
 
-	// we need node names for assigning auth jobs to specific nodes
+	controlPlaneNodeLister := corev1listers.NewNodeLister(controlPlaneNodeInformer.GetIndexer())
+
+	// we need node names for assigning auth and after-setup jobs to specific nodes
+	var once sync.Once
 	klog.Infof("watching for nodes...")
 	_, err := controlPlaneNodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			node, ok := obj.(*corev1.Node)
 			if !ok {
-				klog.Warningf("failed to convert node to Node %+v", obj)
+				klog.Warningf("failed to convert added object to Node %+v", obj)
+				return
+			}
+			klog.Infof("node added: %s", node.GetName())
+
+			// ensure we have both control plane nodes before creating jobs
+			nodeList, err := controlPlaneNodeLister.List(labels.Everything())
+			if err != nil {
+				klog.Errorf("failed to list control plane nodes while waiting to create TNF jobs: %v", err)
+				return
 			}
-			runTnfAuthJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
-			runTnfAfterSetupJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+			if len(nodeList) != 2 {
+				klog.Info("not starting TNF jobs yet, waiting for 2 control plane nodes to exist")
+				return
+			}
+			// we can have 2 nodes on the first call of AddFunc already, ensure we create job controllers once only
+			once.Do(func() {
+				klog.Infof("found 2 control plane nodes (%q, %q), creating TNF jobs", nodeList[0].GetName(), nodeList[1].GetName())
+				// the order of job creation does not matter, the jobs wait on each other as needed
+				for _, node := range nodeList {
+					runTnfAuthJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+					runTnfAfterSetupJobController(ctx, node.GetName(), controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+				}
+				runTnfSetupJobController(ctx, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
+			})
 		},
 	})
 	if err != nil {
 		klog.Errorf("failed to add eventhandler to control plane informer: %v", err)
 		return false, err
 	}
 
-	runTnfSetupJobController(ctx, controllerContext, operatorClient, kubeClient, kubeInformersForNamespaces)
-
 	return true, nil
 }
```
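
The sync.Once matters because AddFunc fires once per node, and the informer cache can already contain both control plane nodes by the time the handler runs for the first one; without the guard, the length check would pass on both invocations and the job controllers would be started twice. runTnfSetupJobController also moves inside the once.Do block, so the setup job is created behind the same gate; as the comment in the diff notes, the creation order does not matter because the jobs wait on each other as needed.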

pkg/tnf/pkg/config/cluster.go

Lines changed: 7 additions & 1 deletion
```diff
@@ -2,6 +2,7 @@ package config
 
 import (
 	"context"
+	"fmt"
 	"sort"
 
 	corev1 "k8s.io/api/core/v1"
@@ -24,10 +25,15 @@ func GetClusterConfig(ctx context.Context, kubeClient kubernetes.Interface) (ClusterConfig, error) {
 	clusterCfg := ClusterConfig{}
 
 	// Get nodes
-	nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{
+		LabelSelector: "node-role.kubernetes.io/master",
+	})
 	if err != nil {
 		return clusterCfg, err
 	}
+	if len(nodes.Items) != 2 {
+		return clusterCfg, fmt.Errorf("expected 2 nodes, got %d", len(nodes.Items))
+	}
 
 	sort.Slice(nodes.Items, func(i, j int) bool {
 		return nodes.Items[i].Name < nodes.Items[j].Name
```
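
A hypothetical caller of GetClusterConfig, sketching the new behaviour: nodes without the master role label are filtered out by the selector, and anything other than exactly two matching nodes is an error. The import path of the config package and the node names are assumptions, and the example relies on the fake client applying label selectors (as the commit's own tests do).

```go
package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"

	// assumed import path for the package changed in this commit
	"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/config"
)

func main() {
	ctx := context.Background()

	// One labelled control plane node plus one worker: only one node matches
	// the selector, so GetClusterConfig is expected to return an error.
	client := fake.NewSimpleClientset(
		&corev1.Node{ObjectMeta: metav1.ObjectMeta{
			Name:   "cp-0",
			Labels: map[string]string{"node-role.kubernetes.io/master": ""},
		}},
		&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "worker-0"}},
	)

	_, err := config.GetClusterConfig(ctx, client)
	fmt.Println(err) // expected 2 nodes, got 1
}
```
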

pkg/tnf/pkg/config/cluster_test.go

Lines changed: 51 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@ type args struct {
 }
 
 func TestGetClusterConfig(t *testing.T) {
+
 	tests := []struct {
 		name    string
 		args    args
@@ -29,6 +30,9 @@ func TestGetClusterConfig(t *testing.T) {
 			{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "test1",
+					Labels: map[string]string{
+						"node-role.kubernetes.io/master": "",
+					},
 				},
 				Status: corev1.NodeStatus{
 					Addresses: []corev1.NodeAddress{
@@ -41,6 +45,9 @@ func TestGetClusterConfig(t *testing.T) {
 			{
 				ObjectMeta: metav1.ObjectMeta{
 					Name: "test2",
+					Labels: map[string]string{
+						"node-role.kubernetes.io/master": "",
+					},
 				},
 				Status: corev1.NodeStatus{
 					Addresses: []corev1.NodeAddress{
@@ -58,6 +65,44 @@ func TestGetClusterConfig(t *testing.T) {
 			},
 			wantErr: false,
 		},
+		{
+			name: "one node only should fail",
+			args: getArgs(t, []*corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
+					},
+				},
+			}),
+			want:    ClusterConfig{},
+			wantErr: true,
+		},
+		{
+			name: "one control plane node only should fail",
+			args: getArgs(t, []*corev1.Node{
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test1",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/master": "",
+						},
+					},
+				},
+				{
+					ObjectMeta: metav1.ObjectMeta{
+						Name: "test2",
+						Labels: map[string]string{
+							"node-role.kubernetes.io/no-master": "",
+						},
+					},
+				},
+			}),
+			want:    ClusterConfig{},
+			wantErr: true,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -69,6 +114,12 @@ func TestGetClusterConfig(t *testing.T) {
 			if !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("GetClusterConfig() got = %v, want %v", got, tt.want)
 			}
+			// delete nodes
+			c := tt.args.kubeClient
+			nodes, _ := c.CoreV1().Nodes().List(tt.args.ctx, metav1.ListOptions{})
+			for _, node := range nodes.Items {
+				c.CoreV1().Nodes().Delete(tt.args.ctx, node.Name, metav1.DeleteOptions{})
+			}
 		})
 	}
 }
```
