@@ -655,14 +655,42 @@ func (c *liveStateCache) getSyncedCluster(server *appv1.Cluster) (clustercache.C
655
655
656
656
err = clusterCache .EnsureSynced ()
657
657
if err != nil {
658
- if isConversionWebhookError (err ) {
659
- log .WithField ("cluster" , server .Server ).Warnf ("Conversion webhook error during cluster sync, cluster cache may be incomplete: %v" , err )
658
+ if isClusterCacheError (err ) {
659
+ log .WithField ("cluster" , server .Server ).Warnf ("Cluster cache sync error detected, cluster cache may be incomplete: %v" , err )
660
+
660
661
// Extract the GVK from the error and track it as a failed GVK
661
- gvkStr := extractGVKFromConversionWebhookError (err )
662
+ gvkStr := extractGVKFromCacheError (err )
662
663
if gvkStr != "" {
663
664
c .trackFailedResourceGVK (server .Server , gvkStr )
665
+
666
+ // Also mark the cluster as tainted
667
+ errorType := "unknown"
668
+ errStr := err .Error ()
669
+ if strings .Contains (errStr , "conversion webhook" ) {
670
+ errorType = "conversion_webhook"
671
+ } else if strings .Contains (errStr , "Expired: too old resource version" ) {
672
+ errorType = "pagination_token_expired"
673
+ } else if strings .Contains (errStr , "connection" ) {
674
+ errorType = "connection_issue"
675
+ }
676
+
677
+ markClusterTainted (server .Server , errStr , gvkStr , errorType )
678
+ }
679
+
680
+ // Mark the cluster as tainted
681
+ errorType := "unknown"
682
+ errStr := err .Error ()
683
+ if strings .Contains (errStr , "conversion webhook" ) {
684
+ errorType = "conversion_webhook"
685
+ } else if strings .Contains (errStr , "Expired: too old resource version" ) {
686
+ errorType = "pagination_token_expired"
687
+ } else if strings .Contains (errStr , "connection" ) {
688
+ errorType = "connection_issue"
664
689
}
665
- // For conversion webhook errors, we return the cluster cache without an error
690
+
691
+ markClusterTainted (server .Server , errStr , gvkStr , errorType )
692
+
693
+ // For known cache errors, we return the cluster cache without an error
666
694
// but log the issue. This allows applications to continue with
667
695
// whatever state is available rather than failing completely.
668
696
// We're handling partial state in each of the methods that use getSyncedCluster
@@ -1068,18 +1096,40 @@ func (c *liveStateCache) GetClustersInfo() []clustercache.ClusterInfo {
1068
1096
for gvk := range failedGVKs {
1069
1097
failedGVKList = append (failedGVKList , gvk )
1070
1098
}
1099
+ }
1100
+ c .failedGVKLock .RUnlock ()
1101
+
1102
+ // Update cluster info with failed resource GVKs from our tracking
1103
+
1104
+ // Check if this cluster is tainted
1105
+ ClusterTaintLock .RLock ()
1106
+ taints , exists := clusterTaints [server ]
1107
+ ClusterTaintLock .RUnlock ()
1108
+
1109
+ // Update cluster info with taint status and reason
1110
+ if exists && len (taints ) > 0 {
1111
+ info .IsTainted = true
1071
1112
1072
- // If there are failed GVKs, modify SyncError to include this information
1073
- if len (failedGVKList ) > 0 {
1074
- msg := fmt .Sprintf ("Cluster has %d unavailable resource types due to conversion webhook errors" , len (failedGVKList ))
1075
- if info .SyncError == nil {
1076
- info .SyncError = fmt .Errorf (msg )
1077
- } else {
1078
- info .SyncError = fmt .Errorf ("%s; %s" , info .SyncError .Error (), msg )
1113
+ // Add any tainted GVKs that aren't already in the list
1114
+ for gvk := range taints {
1115
+ if ! contains (failedGVKList , gvk ) {
1116
+ info .FailedResourceGVKs = append (info .FailedResourceGVKs , gvk )
1079
1117
}
1080
1118
}
1119
+
1120
+ // Set taint reason
1121
+ info .TaintReason = "Cluster has tainted resource types"
1122
+ }
1123
+
1124
+ // If there are failed GVKs, modify SyncError to include this information
1125
+ if len (failedGVKList ) > 0 {
1126
+ msg := fmt .Sprintf ("Cluster has %d unavailable resource types" , len (failedGVKList ))
1127
+ if info .SyncError == nil {
1128
+ info .SyncError = errors .New (msg )
1129
+ } else {
1130
+ info .SyncError = errors .New (info .SyncError .Error () + "; " + msg )
1131
+ }
1081
1132
}
1082
- c .failedGVKLock .RUnlock ()
1083
1133
1084
1134
res = append (res , info )
1085
1135
}
@@ -1095,7 +1145,42 @@ func (c *liveStateCache) UpdateShard(shard int) bool {
1095
1145
return c .clusterSharding .UpdateShard (shard )
1096
1146
}
1097
1147
1148
// isClusterCacheError reports whether err is one of the known error classes
// that can occur while syncing a cluster cache.
//
// Classification is a case-sensitive substring match against err.Error();
// the error chain is not unwrapped. A nil error is never a cluster cache
// error.
func isClusterCacheError(err error) bool {
	if err == nil {
		return false
	}
	errStr := err.Error()

	// Substrings that identify an error class this function recognizes.
	errorPatterns := []string{
		// Conversion webhook errors
		"conversion webhook",

		// Pagination token expiration errors
		"Expired: too old resource version",

		// General resource list errors
		"failed to list resources",

		// Connection issues
		"connection refused",
		"connection reset by peer",
		"i/o timeout",
	}

	for _, pattern := range errorPatterns {
		if strings.Contains(errStr, pattern) {
			return true
		}
	}

	return false
}
1181
+
1098
1182
// isConversionWebhookError checks if an error is related to conversion webhooks
1183
+ // Kept for backward compatibility
1099
1184
func isConversionWebhookError (err error ) bool {
1100
1185
if err == nil {
1101
1186
return false
@@ -1104,9 +1189,9 @@ func isConversionWebhookError(err error) bool {
1104
1189
return strings .Contains (errStr , "conversion webhook" )
1105
1190
}
1106
1191
1107
- // extractGVKFromConversionWebhookError attempts to extract the GroupVersionKind from a conversion webhook error
1192
+ // extractGVKFromCacheError attempts to extract the GroupVersionKind from various cache errors
1108
1193
// Returns an empty string if no GVK could be extracted
1109
- func extractGVKFromConversionWebhookError (err error ) string {
1194
+ func extractGVKFromCacheError (err error ) string {
1110
1195
if err == nil {
1111
1196
return ""
1112
1197
}
@@ -1152,6 +1237,53 @@ func (c *liveStateCache) trackFailedResourceGVK(server string, gvkStr string) {
1152
1237
}).Infof ("Tracked failed resource GVK due to conversion webhook error" )
1153
1238
}
1154
1239
1240
// Package-level record of clusters whose cache is in a tainted state.
var (
	// clusterTaints maps server URL -> resource GVK -> error type.
	// markClusterTainted, IsClusterTainted and GetTaintedGVKs hold
	// ClusterTaintLock while accessing it.
	clusterTaints = make(map[string]map[string]string)

	// ClusterTaintLock guards clusterTaints. The zero value of
	// sync.RWMutex is an unlocked mutex, so no initializer is needed.
	ClusterTaintLock sync.RWMutex
)
1243
+
1244
+ // markClusterTainted marks a cluster as having tainted cache state
1245
+ func markClusterTainted (server string , reason string , gvk string , errorType string ) {
1246
+ ClusterTaintLock .Lock ()
1247
+ defer ClusterTaintLock .Unlock ()
1248
+
1249
+ // Initialize if not exists
1250
+ _ , exists := clusterTaints [server ]
1251
+ if ! exists {
1252
+ clusterTaints [server ] = make (map [string ]string )
1253
+ }
1254
+
1255
+ // Store the GVK and error type
1256
+ if gvk != "" {
1257
+ clusterTaints [server ][gvk ] = errorType
1258
+ }
1259
+ }
1260
+
1261
+ // IsClusterTainted checks if a cluster is in tainted state
1262
+ func IsClusterTainted (server string ) bool {
1263
+ ClusterTaintLock .RLock ()
1264
+ defer ClusterTaintLock .RUnlock ()
1265
+
1266
+ taints , exists := clusterTaints [server ]
1267
+ return exists && len (taints ) > 0
1268
+ }
1269
+
1270
+ // GetTaintedGVKs returns a list of tainted GVKs for a cluster
1271
+ func GetTaintedGVKs (server string ) []string {
1272
+ ClusterTaintLock .RLock ()
1273
+ defer ClusterTaintLock .RUnlock ()
1274
+
1275
+ taints , exists := clusterTaints [server ]
1276
+ if ! exists {
1277
+ return nil
1278
+ }
1279
+
1280
+ gvks := make ([]string , 0 , len (taints ))
1281
+ for gvk := range taints {
1282
+ gvks = append (gvks , gvk )
1283
+ }
1284
+ return gvks
1285
+ }
1286
+
1155
1287
// isResourceGVKFailed checks if a resource GVK is in the failed resources map
1156
1288
func (c * liveStateCache ) isResourceGVKFailed (server string , gvkStr string ) bool {
1157
1289
c .failedGVKLock .RLock ()
@@ -1176,6 +1308,21 @@ func (c *liveStateCache) clearFailedResourceGVKs(server string) {
1176
1308
delete (c .failedResourceGVKs , server )
1177
1309
}
1178
1310
1311
// contains reports whether str is an element of slice. A nil or empty slice
// contains nothing.
func contains(slice []string, str string) bool {
	for i := range slice {
		if slice[i] == str {
			return true
		}
	}
	return false
}
1320
+
1321
+ // GetClusterTaints returns the map of cluster taints
1322
+ func GetClusterTaints () map [string ]map [string ]string {
1323
+ return clusterTaints
1324
+ }
1325
+
1179
1326
// cleanupExpiredFailedGVKs removes expired entries from the failed GVKs cache
1180
1327
// This should be called periodically to avoid the cache growing too large
1181
1328
func (c * liveStateCache ) cleanupExpiredFailedGVKs () {
0 commit comments