Skip to content

Commit fc5954c

Browse files
Merge pull request #1 from DennisPeriquet/revert-28847-revert-28735-1717157796269-fbabcock
update upgrade window check and exception messages
2 parents 2c47e0a + b1e572a commit fc5954c

File tree

2 files changed

+42
-44
lines changed

2 files changed

+42
-44
lines changed

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go

Lines changed: 40 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,20 @@ import (
2626
// exception string if does not think the condition should be fatal.
2727
type exceptionCallback func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) (string, error)
2828

29+
type upgradeWindowHolder struct {
30+
startInterval *monitorapi.Interval
31+
endInterval *monitorapi.Interval
32+
}
33+
2934
func checkAuthenticationExceptions(condition *configv1.ClusterOperatorStatusCondition) bool {
30-
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && (condition.Reason == "APIServices_Error" || condition.Reason == "APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServices_PreconditionNotReady" || condition.Reason == "OAuthServerDeployment_NoDeployment" || condition.Reason == "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "OAuthServerServiceEndpointAccessibleController_EndpointUnavailable" || condition.Reason == "WellKnown_NotReady") {
31-
return true
35+
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
36+
switch condition.Reason {
37+
case "APIServices_Error", "APIServerDeployment_NoDeployment", "APIServerDeployment_NoPod",
38+
"APIServerDeployment_PreconditionNotFulfilled", "APIServices_PreconditionNotReady",
39+
"OAuthServerDeployment_NoDeployment", "OAuthServerRouteEndpointAccessibleController_EndpointUnavailable",
40+
"OAuthServerServiceEndpointAccessibleController_EndpointUnavailable", "WellKnown_NotReady":
41+
return true
42+
}
3243
}
3344
return false
3445
}
@@ -45,17 +56,8 @@ func testStableSystemOperatorStateTransitions(events monitorapi.Intervals, clien
4556
}
4657
}
4758

48-
isSingleNode, err := isSingleNodeCheck(clientConfig)
49-
if err != nil {
50-
logrus.Warnf("Error checking for Single Node configuration on stable system (unable to make exception): %v", err)
51-
isSingleNode = false
52-
}
53-
5459
// For the non-upgrade case, if any operator has Available=False, fail the test.
5560
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
56-
if isSingleNode {
57-
return "Operators are allowed to go degraded on single-node for now", nil
58-
}
5961
if operator == "authentication" {
6062
if checkAuthenticationExceptions(condition) {
6163
return "https://issues.redhat.com/browse/OCPBUGS-20056", nil
@@ -86,11 +88,7 @@ func isSingleNodeCheck(clientConfig *rest.Config) (bool, error) {
8688
// UpgradeComplete and UpgradeFailed events end upgrade windows; if there was not an already started upgrade window,
8789
// we ignore the event.
8890
// If we don't find any upgrade ending point, we assume the ending point is at the end of the test.
89-
func isInUpgradeWindow(eventList monitorapi.Intervals, eventInterval monitorapi.Interval) bool {
90-
type upgradeWindowHolder struct {
91-
startInterval *monitorapi.Interval
92-
endInterval *monitorapi.Interval
93-
}
91+
func getUpgradeWindows(eventList monitorapi.Intervals) []*upgradeWindowHolder {
9492

9593
var upgradeWindows []*upgradeWindowHolder
9694
var currentWindow *upgradeWindowHolder
@@ -151,6 +149,10 @@ func isInUpgradeWindow(eventList monitorapi.Intervals, eventInterval monitorapi.
151149
}
152150
}
153151

152+
return upgradeWindows
153+
}
154+
155+
func isInUpgradeWindow(upgradeWindows []*upgradeWindowHolder, eventInterval monitorapi.Interval) bool {
154156
for _, upgradeWindow := range upgradeWindows {
155157
if eventInterval.From.After(upgradeWindow.startInterval.From) {
156158
if upgradeWindow.endInterval == nil || eventInterval.To.Before(upgradeWindow.endInterval.To) {
@@ -163,6 +165,13 @@ func isInUpgradeWindow(eventList monitorapi.Intervals, eventInterval monitorapi.
163165
}
164166

165167
func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
168+
upgradeWindows := getUpgradeWindows(events)
169+
isSingleNode, err := isSingleNodeCheck(clientConfig)
170+
if err != nil {
171+
logrus.Warnf("Error checking for Single Node configuration on upgrade (unable to make exception): %v", err)
172+
isSingleNode = false
173+
}
174+
166175
except := func(operator string, condition *configv1.ClusterOperatorStatusCondition, eventInterval monitorapi.Interval, clientConfig *rest.Config) (string, error) {
167176
if condition.Status == configv1.ConditionTrue {
168177
if condition.Type == configv1.OperatorAvailable {
@@ -178,35 +187,21 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
178187
return "We are not worried about Degraded=True blips for update tests yet.", nil
179188
}
180189

181-
var availableEqualsFalseAllowed bool
182-
if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
183-
availableEqualsFalseAllowed = isInUpgradeWindow(events, eventInterval) && eventInterval.To.Sub(eventInterval.From) < 10*time.Minute
184-
}
185-
186-
isSingleNode, err := isSingleNodeCheck(clientConfig)
187-
if err != nil {
188-
logrus.Warnf("Error checking for Single Node configuration on upgrade (unable to make exception): %v", err)
189-
isSingleNode = false
190-
}
191-
192-
// We'll add an exception for single node for now.
193-
if !availableEqualsFalseAllowed {
194-
195-
if isSingleNode {
196-
// We'll honor exceptions for single node configuration.
197-
logrus.Infof("Operator %s is in Available=False state, but we give single node clusters an exception", operator)
198-
199-
} else if operator == "authentication" {
200-
// We'll honor exceptions for authentication operator because it is affected by etcd performance issues.
201-
logrus.Info("Operator authentication is in Available=False state, but we give an exception")
202-
203-
} else if operator == "image-registry" {
204-
// For now, we'll honor exceptions for image-registry operator as it's affected by tests that
205-
// cause replicas to go down (e.g., tests that taint two nodes).
206-
logrus.Info("Operator image-registry is in Available=False state, but we give an exception")
207-
} else {
190+
// we know the Status is not true and the Type is not degraded at this point indicating we are available=false
191+
withinUpgradeWindowBuffer := isInUpgradeWindow(upgradeWindows, eventInterval) && eventInterval.To.Sub(eventInterval.From) < 10*time.Minute
192+
if !withinUpgradeWindowBuffer {
193+
switch operator {
194+
// there are some known cases for authentication and image-registry that occur outside of upgrade window, so we will pass through and check for exceptions
195+
case "authentication", "image-registry":
196+
logrus.Infof("Operator %s is in Available=False state outside of upgrade window, but we will check for exceptions", operator)
197+
default:
208198
return "", nil
209199
}
200+
} else {
201+
// SingleNode is expected to go Available=False for most / all operators during upgrade
202+
if isSingleNode {
203+
return fmt.Sprintf("Operator %s is in Available=False state running in single replica control plane, expected availability transition during upgrade", operator), nil
204+
}
210205
}
211206

212207
switch operator {
@@ -250,6 +245,8 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
250245
return "https://issues.redhat.com/browse/OCPBUGS-23744", nil
251246
}
252247
case "image-registry":
248+
// this won't handle the replicaCount==2 serial test where both pods are on nodes that get tainted.
249+
// need to consider how we detect that or modify the job to set replicaCount==3
253250
if replicaCount, _ := checkReplicas("openshift-image-registry", operator, clientConfig); replicaCount == 1 {
254251
return "https://issues.redhat.com/browse/OCPBUGS-22382", nil
255252
}

pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ func Test_isInUpgradeWindow(t *testing.T) {
186186
}
187187
for _, tt := range tests {
188188
t.Run(tt.name, func(t *testing.T) {
189-
got := isInUpgradeWindow(tt.args.eventList, tt.args.eventInterval)
189+
upgradeWindows := getUpgradeWindows(tt.args.eventList)
190+
got := isInUpgradeWindow(upgradeWindows, tt.args.eventInterval)
190191
assert.Equal(t, tt.want, got)
191192
})
192193
}

0 commit comments

Comments
 (0)