Skip to content

Commit 2435423

Browse files
authored
fix: properly classify codes.Unavailable errors as TransientError (#1457)
- Add logging to handleError to debug error classification
- Classify codes.Unavailable (EOF, connection reset, keepalive timeout) as TransientError
1 parent 446cc2b commit 2435423

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

internal/checks/checks.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,9 @@ func handleError(ctx context.Context, logger zerolog.Logger, backoff Backoffer,
287287
fatalError FatalError
288288
)
289289

290+
// Add logging to see what error we're dealing with
291+
logger.Debug().Err(err).Str("error_type", fmt.Sprintf("%T", err)).Msg("handling error in handleError")
292+
290293
switch {
291294
case err == nil:
292295
return true, nil
@@ -316,8 +319,7 @@ func handleError(ctx context.Context, logger zerolog.Logger, backoff Backoffer,
316319
return true, nil
317320

318321
default:
319-
logger.Warn().
320-
Msg("handling check changes")
322+
logger.Warn().Err(err).Str("error_type", fmt.Sprintf("%T", err)).Msg("unhandled error type, treating as transient")
321323

322324
// TODO(mem): this might be a transient error (e.g. bad connection). We probably need to
323325
// fine-tune gRPC's backoff parameters. We might also need to keep count of the reconnects, and
@@ -354,6 +356,10 @@ func (c *Updater) loop(ctx context.Context) (bool, error) {
354356
// the other end is shutting down
355357
return errTransportClosing
356358

359+
case status.Code() == codes.Unavailable:
360+
// Network errors, connection resets, etc. are transient
361+
return TransientError(fmt.Sprintf("%s: %s", action, status.Message()))
362+
357363
case status.Code() == codes.PermissionDenied:
358364
return errNotAuthorized
359365

0 commit comments

Comments
 (0)