error.rs 3.86 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Error classification for etcd operations.
//!
//! Categorizes etcd errors into reconnectable, expected, or fatal conditions
//! to enable smart retry logic.

use std::fmt;
use tonic::Code;

/// Failure conditions that call for re-establishing the etcd connection.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReconnectableError {
    /// The connection to the etcd server was closed.
    ConnectionClosed,
    /// The operation timed out before completing.
    Timeout,
    /// The etcd service is unavailable (server down or unreachable).
    Unavailable,
    /// The lease could not be found (it may have expired during a disconnect).
    LeaseNotFound,
}

impl fmt::Display for ReconnectableError {
    /// Writes a short, lowercase, human-readable description of the variant.
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the fixed text first, then emit it in one place.
        let description = match self {
            Self::ConnectionClosed => "connection closed",
            Self::Timeout => "operation timed out",
            Self::Unavailable => "service unavailable",
            Self::LeaseNotFound => "lease not found",
        };
        formatter.write_str(description)
    }
}

/// Classification of etcd errors for determining retry strategy.
///
/// This is the return type of `classify_error`; each variant tells the caller
/// which of the three handling paths (reconnect, treat as absent, give up)
/// applies to the error that was classified.
#[derive(Debug)]
pub(crate) enum EtcdErrorClass {
    /// Error should trigger reconnection and retry. Carries the specific
    /// [`ReconnectableError`] reason that was detected.
    Reconnectable(ReconnectableError),
    /// Expected condition (key not found) - not an error. Produced for a gRPC
    /// `NotFound` status whose message does not mention a lease.
    NotFound,
    /// Fatal error that cannot be recovered by reconnecting. Wraps the
    /// underlying error (or a formatted description of the gRPC status).
    Fatal(anyhow::Error),
}

/// Classify an etcd error to determine appropriate handling.
///
/// # Classification Strategy
///
/// - **Reconnectable**: Connection/transport errors that can be fixed by reconnecting
/// - **NotFound**: Key doesn't exist (expected condition for queries)
/// - **Fatal**: All other errors (permissions, invalid request, etc.)
pub(crate) fn classify_error(err: etcd_client::Error) -> EtcdErrorClass {
    // Use structured error matching instead of fragile string matching
    match err {
        etcd_client::Error::GRpcStatus(status) => {
            // Classify based on gRPC status code
            match status.code() {
                Code::NotFound => {
                    // Check if it's a lease not found or key not found
                    let msg = status.message().to_lowercase();
                    if msg.contains("lease") {
                        EtcdErrorClass::Reconnectable(ReconnectableError::LeaseNotFound)
                    } else {
                        // Key not found is expected, not an error
                        EtcdErrorClass::NotFound
                    }
                }
                Code::Unavailable => EtcdErrorClass::Reconnectable(ReconnectableError::Unavailable),
                Code::DeadlineExceeded => {
                    EtcdErrorClass::Reconnectable(ReconnectableError::Timeout)
                }
                Code::Cancelled | Code::Aborted => {
                    // Connection-related cancellations
                    EtcdErrorClass::Reconnectable(ReconnectableError::ConnectionClosed)
                }
                _ => {
                    // All other gRPC errors are fatal
                    EtcdErrorClass::Fatal(anyhow::anyhow!(
                        "gRPC error: {} (code: {:?})",
                        status.message(),
                        status.code()
                    ))
                }
            }
        }
        etcd_client::Error::TransportError(_) => {
            // Transport errors are reconnectable
            EtcdErrorClass::Reconnectable(ReconnectableError::Unavailable)
        }
        etcd_client::Error::IoError(_) => {
            // I/O errors are reconnectable
            EtcdErrorClass::Reconnectable(ReconnectableError::ConnectionClosed)
        }
        _ => {
            // All other errors (LeaseKeepAliveError, etc.) are fatal
            EtcdErrorClass::Fatal(err.into())
        }
    }
}