feat: Request Rejection Frontend metrics (#7644)

Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>

feat: Request Rejection Frontend metrics (#7644)
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
3205e7db · Jacky · GitHub · 91700375 · 3205e7db · 3205e7db
Unverified commit 3205e7db, authored Apr 06, 2026 by Jacky; committed by GitHub Apr 07, 2026
12 changed files
--- a/lib/bindings/python/src/dynamo/prometheus_names.py
+++ b/lib/bindings/python/src/dynamo/prometheus_names.py
@@ -110,6 +110,10 @@ class frontend_service:
    MODEL_MIGRATION_LIMIT = "model_migration_limit"
    # Total number of request migrations due to worker unavailability
    MODEL_MIGRATION_TOTAL = "model_migration_total"
+    # Total number of request cancellations
+    MODEL_CANCELLATION_TOTAL = "model_cancellation_total"
+    # Total number of requests rejected due to resource exhaustion
+    MODEL_REJECTION_TOTAL = "model_rejection_total"
    # Active decode blocks (KV cache blocks) per worker
    # Gauge metric tracking current KV cache block utilization for each worker
    WORKER_ACTIVE_DECODE_BLOCKS = "worker_active_decode_blocks"
@@ -239,25 +243,29 @@ class model_info:


 class name_prefix:
-    """Metric name prefixes used across the metrics system"""
+    """Metric name prefixes used across the metrics system."""

-    # Prefix for all Prometheus metric names.
+    # Prefix for component-scoped metrics, auto-labeled with namespace/endpoint.
    COMPONENT = "dynamo_component"
-    # Prefix for frontend service metrics
+    # Prefix for frontend HTTP service metrics (requests, TTFT, ITL, disconnects).
    FRONTEND = "dynamo_frontend"
-    # Prefix for KV router metrics (used with router_id label)
+    # Prefix for KV router instance metrics (carries `router_id` label).
    ROUTER = "dynamo_router"
-    # Prefix for request-plane (transport-agnostic) metrics at AddressedPushRouter
-    REQUEST_PLANE = "dynamo_request_plane"
-    # Prefix for tokio runtime metrics
-    TOKIO = "dynamo_tokio"
    # Prefix for standalone KV indexer metrics
    KVINDEXER = "dynamo_kvindexer"
-    # Prefix for transport-layer metrics (TCP / NATS)
+    # Prefix for request-plane metrics at AddressedPushRouter.
+    # Transport-agnostic: measures request lifecycle latency and concurrency
+    # (queue → send → roundtrip TTFT, inflight gauge).
+    REQUEST_PLANE = "dynamo_request_plane"
+    # Prefix for transport-layer metrics (TCP / NATS).
+    # Protocol-specific: measures wire-level health (bytes sent/received, error counts).
    TRANSPORT = "dynamo_transport"
    # Prefix for work-handler transport breakdown metrics (backend side)
    WORK_HANDLER = "dynamo_work_handler"
-    # Prefix for routing overhead metrics (raw Prometheus, not component-scoped)
+    # Prefix for tokio runtime metrics (poll times, queue depths, stalls).
+    TOKIO = "dynamo_tokio"
+    # Prefix for per-phase routing overhead latency (hashing, scheduling).
+    # Raw Prometheus, not component-scoped.
    ROUTING_OVERHEAD = "dynamo_routing_overhead"


@@ -401,6 +409,8 @@ class work_handler:
    REQUEST_DURATION_SECONDS = "request_duration_seconds"
    # Total number of errors in work handler processing
    ERRORS_TOTAL = "errors_total"
+    # Total number of requests cancelled by work handler (client stop/kill or disconnect)
+    CANCELLATION_TOTAL = "cancellation_total"
    # Network transit: frontend send to backend receive (wall-clock, cross-process)
    NETWORK_TRANSIT_SECONDS = "network_transit_seconds"
    # Backend processing: handle_payload entry to first response sent

--- a/lib/llm/src/grpc/service/openai.rs
+++ b/lib/llm/src/grpc/service/openai.rs
@@ -55,8 +55,9 @@ pub async fn completion_response_stream(
    // [WIP] from request id.
    let request_id = get_or_create_request_id(request.inner.user.as_deref());
    let streaming = request.inner.stream.unwrap_or(false);
+    let model_name = request.inner.model.clone();
    let cancellation_labels = CancellationLabels {
-        model: request.inner.model.clone(),
+        model: model_name.clone(),
        endpoint: "grpc_completions".to_string(),
        request_type: if streaming { "stream" } else { "unary" }.to_string(),
    };
@@ -101,10 +102,16 @@ pub async fn completion_response_stream(
    let annotations = request.annotations();

    // issue the generate call on the engine
-    let stream = engine
-        .generate(request)
-        .await
-        .map_err(|e| Status::internal(format!("Failed to generate completions: {}", e)))?;
+    let stream = engine.generate(request).await.map_err(|e| {
+        if crate::http::service::metrics::request_was_rejected(e.as_ref()) {
+            state.metrics_clone().inc_rejection(
+                &model_name,
+                crate::http::service::metrics::Endpoint::Completions,
+            );
+            return Status::resource_exhausted(e.to_string());
+        }
+        Status::internal(format!("Failed to generate completions: {}", e))
+    })?;

    // capture the context to cancel the stream if the client disconnects
    let ctx = stream.context();

--- a/lib/llm/src/grpc/service/tensor.rs
+++ b/lib/llm/src/grpc/service/tensor.rs
@@ -60,8 +60,9 @@ pub async fn tensor_response_stream(
 ) -> Result<impl Stream<Item = Annotated<NvCreateTensorResponse>>, Status> {
    // create the context for the request
    let request_id = get_or_create_request_id(request.id.as_deref());
+    let model_name = request.model.clone();
    let cancellation_labels = CancellationLabels {
-        model: request.model.clone(),
+        model: model_name.clone(),
        endpoint: Endpoint::Tensor.to_string(),
        request_type: if streaming { "stream" } else { "unary" }.to_string(),
    };
@@ -103,6 +104,12 @@ pub async fn tensor_response_stream(

    // issue the generate call on the engine
    let stream = engine.generate(request).await.map_err(|e| {
+        if crate::http::service::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model_name, crate::http::service::metrics::Endpoint::Tensor);
+            return Status::resource_exhausted(e.to_string());
+        }
        Status::internal(format!("Failed to generate tensor response stream: {}", e))
    })?;


--- a/lib/llm/src/http/service/anthropic.rs
+++ b/lib/llm/src/http/service/anthropic.rs
@@ -284,6 +284,11 @@ async fn anthropic_messages(
    tracing::trace!("Issuing generate call for Anthropic messages");

    let engine_stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::AnthropicMessages);
+        }
        anthropic_error(
            StatusCode::INTERNAL_SERVER_ERROR,
            "api_error",

--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -27,6 +27,15 @@ use crate::local_model::runtime_config::ModelRuntimeConfig;
 use crate::model_card::ModelDeploymentCard;
 use dynamo_runtime::metrics::prometheus_names::clamp_u64_to_i64;

+use dynamo_runtime::error::ErrorType as DynamoErrorType;
+
+/// Check whether an error chain marks the request as rejected (`ErrorType::ResourceExhausted`).
+pub fn request_was_rejected(err: &(dyn std::error::Error + 'static)) -> bool {
+    const REJECTION: &[DynamoErrorType] = &[DynamoErrorType::ResourceExhausted];
+    const NON_REJECTION: &[DynamoErrorType] = &[];
+    dynamo_runtime::error::match_error_chain(err, REJECTION, NON_REJECTION)
+}
+
 pub use prometheus::Registry;

 use super::RouteDoc;
@@ -257,6 +266,7 @@ pub struct Metrics {
    model_migration_limit: IntGaugeVec,
    model_migration_total: IntCounterVec,
    model_cancellation_total: IntCounterVec,
+    model_rejection_total: IntCounterVec,
 }

 // Inflight tracks requests from HTTP handler start until complete response is finished.
@@ -679,6 +689,15 @@ impl Metrics {
        )
        .unwrap();

+        let model_rejection_total = IntCounterVec::new(
+            Opts::new(
+                frontend_metric_name(frontend_service::MODEL_REJECTION_TOTAL),
+                "Total number of requests rejected due to resource exhaustion",
+            ),
+            &["model", "endpoint"],
+        )
+        .unwrap();
+
        Metrics {
            request_counter,
            inflight_gauge,
@@ -700,6 +719,7 @@ impl Metrics {
            model_migration_limit,
            model_migration_total,
            model_cancellation_total,
+            model_rejection_total,
        }
    }

@@ -805,6 +825,7 @@ impl Metrics {
        registry.register(Box::new(self.model_migration_limit.clone()))?;
        registry.register(Box::new(self.model_migration_total.clone()))?;
        registry.register(Box::new(self.model_cancellation_total.clone()))?;
+        registry.register(Box::new(self.model_rejection_total.clone()))?;

        Ok(())
    }
@@ -902,6 +923,20 @@ impl Metrics {
            .get()
    }

+    /// Increment `model_rejection_total` for the given `model` and `endpoint` label values
+    pub fn inc_rejection(&self, model: &str, endpoint: Endpoint) {
+        self.model_rejection_total
+            .with_label_values(&[model, &endpoint.to_string()])
+            .inc();
+    }
+
+    /// Read the current `model_rejection_total` value for the given `model` and `endpoint` labels
+    pub fn get_rejection_count(&self, model: &str, endpoint: Endpoint) -> u64 {
+        self.model_rejection_total
+            .with_label_values(&[model, &endpoint.to_string()])
+            .get()
+    }
+
    /// Create a new [`InflightGuard`] for the given model and annotate if its a streaming request,
    /// and the kind of endpoint that was hit
    ///

--- a/lib/llm/src/http/service/openai.rs
+++ b/lib/llm/src/http/service/openai.rs
@@ -194,18 +194,12 @@ impl ErrorMessage {
    /// If successful, it will return the [`HttpError`] as an [`ErrorMessage::internal_server_error`]
    /// with the details of the error.
    pub fn from_anyhow(err: anyhow::Error, alt_msg: &str) -> ErrorResponse {
-        // First check for PipelineError::ServiceOverloaded
-        if let Some(pipeline_err) =
-            err.downcast_ref::<dynamo_runtime::pipeline::error::PipelineError>()
-            && matches!(
-                pipeline_err,
-                dynamo_runtime::pipeline::error::PipelineError::ServiceOverloaded(_)
-            )
-        {
+        // Check for ResourceExhausted anywhere in the error chain → HTTP 503
+        if super::metrics::request_was_rejected(err.as_ref()) {
            return (
                StatusCode::SERVICE_UNAVAILABLE,
                Json(ErrorMessage {
-                    message: pipeline_err.to_string(),
+                    message: err.to_string(),
                    error_type: map_error_code_to_error_type(StatusCode::SERVICE_UNAVAILABLE),
                    code: StatusCode::SERVICE_UNAVAILABLE.as_u16(),
                }),
@@ -470,6 +464,11 @@ async fn completions_single(

    // issue the generate call on the engine
    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::Completions);
+        }
        let err_response = ErrorMessage::from_anyhow(e, "Failed to generate completions");
        inflight_guard.mark_error(extract_error_type_from_response(&err_response));
        err_response
@@ -621,6 +620,11 @@ async fn completions_batch(

        // Generate stream for this prompt
        let stream = engine.generate(single_request_context).await.map_err(|e| {
+            if super::metrics::request_was_rejected(e.as_ref()) {
+                state
+                    .metrics_clone()
+                    .inc_rejection(&model, super::metrics::Endpoint::Completions);
+            }
            let err_response = ErrorMessage::from_anyhow(e, "Failed to generate completions");
            inflight_guard.mark_error(extract_error_type_from_response(&err_response));
            err_response
@@ -775,9 +779,15 @@ async fn embeddings(
    })?;

    let mut response_collector = state.metrics_clone().create_response_collector(model);
+    let model_name = model.to_string();

    // issue the generate call on the engine
    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model_name, super::metrics::Endpoint::Embeddings);
+        }
        let err_response = ErrorMessage::from_anyhow(e, "Failed to generate embeddings");
        inflight.mark_error(extract_error_type_from_response(&err_response));
        err_response
@@ -1184,6 +1194,11 @@ async fn chat_completions(

    // issue the generate call on the engine
    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::ChatCompletions);
+        }
        let err_response = ErrorMessage::from_anyhow(e, "Failed to generate completions");
        inflight_guard.mark_error(extract_error_type_from_response(&err_response));
        err_response
@@ -1590,6 +1605,11 @@ async fn responses(

    // issue the generate call on the engine
    let engine_stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::Responses);
+        }
        let err_response = ErrorMessage::from_anyhow(e, "Failed to generate completions");
        inflight_guard.mark_error(extract_error_type_from_response(&err_response));
        err_response
@@ -1972,10 +1992,14 @@ async fn images(
    // Note: This uses ServerStreamingEngine for internal routing/distribution,
    // NOT for client-facing SSE streaming. The stream is immediately folded into
    // a single response below.
-    let stream = engine
-        .generate(request)
-        .await
-        .map_err(|e| ErrorMessage::from_anyhow(e, "Failed to generate images"))?;
+    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::Images);
+        }
+        ErrorMessage::from_anyhow(e, "Failed to generate images")
+    })?;

    // Process stream to collect metrics and drop http_queue_guard on first response
    let mut http_queue_guard = Some(http_queue_guard);
@@ -2055,10 +2079,14 @@ async fn videos(
    let mut response_collector = state.metrics_clone().create_response_collector(&model);

    // issue the generate call on the engine
-    let stream = engine
-        .generate(request)
-        .await
-        .map_err(|e| ErrorMessage::from_anyhow(e, "Failed to generate videos"))?;
+    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::Videos);
+        }
+        ErrorMessage::from_anyhow(e, "Failed to generate videos")
+    })?;

    // Process stream to collect metrics and drop http_queue_guard on first token
    let mut http_queue_guard = Some(http_queue_guard);
@@ -2116,10 +2144,14 @@ async fn video_stream(

    let mut response_collector = state.metrics_clone().create_response_collector(&model);

-    let stream = engine
-        .generate(request)
-        .await
-        .map_err(|e| ErrorMessage::from_anyhow(e, "Failed to start video stream"))?;
+    let stream = engine.generate(request).await.map_err(|e| {
+        if super::metrics::request_was_rejected(e.as_ref()) {
+            state
+                .metrics_clone()
+                .inc_rejection(&model, super::metrics::Endpoint::Videos);
+        }
+        ErrorMessage::from_anyhow(e, "Failed to start video stream")
+    })?;

    // Capture the context to cancel the stream if the client disconnects.
    let ctx = stream.context();
@@ -2435,18 +2467,24 @@ mod tests {
    }

    #[test]
-    fn test_service_overloaded_error_response_from_anyhow() {
+    fn test_resource_exhausted_error_response_from_anyhow() {
+        use dynamo_runtime::error::{DynamoError, ErrorType};
        use dynamo_runtime::pipeline::error::PipelineError;

-        let err: anyhow::Error = PipelineError::ServiceOverloaded(
+        let cause = PipelineError::ServiceOverloaded(
            "All workers are busy, please retry later".to_string(),
-        )
-        .into();
+        );
+        let err: anyhow::Error = DynamoError::builder()
+            .error_type(ErrorType::ResourceExhausted)
+            .message("All workers are busy, please retry later")
+            .cause(cause)
+            .build()
+            .into();
        let response = ErrorMessage::from_anyhow(err, BACKUP_ERROR_MESSAGE);
        assert_eq!(response.0, StatusCode::SERVICE_UNAVAILABLE);
        assert_eq!(
            response.1.message,
-            "Service temporarily unavailable: All workers are busy, please retry later"
+            "ResourceExhausted: All workers are busy, please retry later"
        );
    }


--- a/lib/llm/src/migration.rs
+++ b/lib/llm/src/migration.rs
@@ -27,7 +27,7 @@ fn is_migratable(err: &(dyn StdError + 'static)) -> bool {
        ErrorType::ConnectionTimeout,
        ErrorType::Backend(BackendError::EngineShutdown),
    ];
-    const NON_MIGRATABLE: &[ErrorType] = &[ErrorType::Cancelled];
+    const NON_MIGRATABLE: &[ErrorType] = &[ErrorType::Cancelled, ErrorType::ResourceExhausted];
    error::match_error_chain(err, MIGRATABLE, NON_MIGRATABLE)
 }


--- a/lib/runtime/src/error.rs
+++ b/lib/runtime/src/error.rs
@@ -53,6 +53,8 @@ pub enum ErrorType {
    ConnectionTimeout,
    /// The request was cancelled (e.g., client disconnected).
    Cancelled,
+    /// The system does not have enough resources to handle the request.
+    ResourceExhausted,
    /// Error originating from a backend engine.
    Backend(BackendError),
 }
@@ -66,6 +68,7 @@ impl fmt::Display for ErrorType {
            ErrorType::Disconnected => write!(f, "Disconnected"),
            ErrorType::ConnectionTimeout => write!(f, "ConnectionTimeout"),
            ErrorType::Cancelled => write!(f, "Cancelled"),
+            ErrorType::ResourceExhausted => write!(f, "ResourceExhausted"),
            ErrorType::Backend(sub) => write!(f, "Backend{sub}"),
        }
    }

--- a/lib/runtime/src/metrics/prometheus_names.rs
+++ b/lib/runtime/src/metrics/prometheus_names.rs
@@ -235,6 +235,9 @@ pub mod frontend_service {
    /// Total number of request cancellations
    pub const MODEL_CANCELLATION_TOTAL: &str = "model_cancellation_total";

+    /// Total number of requests rejected due to resource exhaustion
+    pub const MODEL_REJECTION_TOTAL: &str = "model_rejection_total";
+
    /// Active decode blocks (KV cache blocks) per worker
    /// Gauge metric tracking current KV cache block utilization for each worker
    pub const WORKER_ACTIVE_DECODE_BLOCKS: &str = "worker_active_decode_blocks";

--- a/lib/runtime/src/pipeline/network/egress/push_router.rs
+++ b/lib/runtime/src/pipeline/network/egress/push_router.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0

 use super::{AsyncEngineContextProvider, ResponseStream};
-use crate::error::{BackendError, ErrorType, match_error_chain};
+use crate::error::{BackendError, DynamoError, ErrorType, match_error_chain};

 /// Check if an error chain indicates the worker should be reported as down.
 fn is_inhibited(err: &(dyn std::error::Error + 'static)) -> bool {
@@ -518,10 +518,15 @@ where
                        total_workers = all_instances.len(),
                        "Rejecting request: all workers are busy"
                    );
-                    return Err(PipelineError::ServiceOverloaded(
+                    let cause = PipelineError::ServiceOverloaded(
                        "All workers are busy, please retry later".to_string(),
-                    )
-                    .into());
+                    );
+                    return Err(DynamoError::builder()
+                        .error_type(ErrorType::ResourceExhausted)
+                        .message("All workers are busy, please retry later")
+                        .cause(cause)
+                        .build()
+                        .into());
                }
            }
        }

--- a/tests/router/common.py
+++ b/tests/router/common.py
@@ -11,8 +11,10 @@ from typing import TYPE_CHECKING, Any, Optional

 import aiohttp
 import nats
+import requests

 from dynamo.llm import AicPerfConfig, KvRouter, KvRouterConfig
+from dynamo.prometheus_names import frontend_service, name_prefix
 from tests.router.helper import (
    _nats_server,
    assert_event_dumps_equal,
@@ -604,6 +606,66 @@ def _test_router_query_instance_id(
        logger.info(f"Token count: {result['token_count']}")


+def _parse_frontend_rejection_metric(
+    metrics_text: str, model_name: str, endpoint: str
+) -> int:
+    """Parse frontend model_rejection_total from Prometheus metrics text.
+
+    Args:
+        metrics_text: Raw Prometheus metrics text (scanned line by line)
+        model_name: The `model` label value a sample line must contain
+        endpoint: The `endpoint` label value a sample line must contain (e.g. "chat_completions")
+
+    Returns:
+        The first matching sample's value truncated to int, or 0 if none matches or parses
+    """
+    metric_name = f"{name_prefix.FRONTEND}_{frontend_service.MODEL_REJECTION_TOTAL}"
+    for line in metrics_text.splitlines():
+        if not line.startswith(f"{metric_name}{{"):
+            continue
+        if f'model="{model_name}"' in line and f'endpoint="{endpoint}"' in line:
+            parts = line.rsplit(None, 1)
+            if len(parts) == 2:
+                try:
+                    return int(float(parts[1]))
+                except ValueError:
+                    pass
+    return 0
+
+
+def _verify_frontend_rejection_metrics(
+    frontend_port: int,
+    model_name: str,
+    endpoint: str,
+    expected_count: int,
+) -> None:
+    """Scrape the frontend /metrics endpoint and assert the rejection count equals expected_count.
+
+    Args:
+        frontend_port: Port on localhost where the frontend /metrics is served
+        model_name: The model name label value
+        endpoint: The endpoint label value (e.g. "chat_completions")
+        expected_count: Exact count required; AssertionError is raised on mismatch or fetch failure
+    """
+    metrics_url = f"http://localhost:{frontend_port}/metrics"
+    try:
+        metrics_response = requests.get(metrics_url, timeout=5)
+        metrics_response.raise_for_status()
+    except requests.RequestException as e:
+        raise AssertionError(
+            f"Failed to fetch frontend metrics from {metrics_url}: {e}"
+        ) from e
+
+    metric_count = _parse_frontend_rejection_metric(
+        metrics_response.text, model_name, endpoint
+    )
+    logger.info(f"Frontend rejection metric: model_rejection_total={metric_count}")
+    assert metric_count == expected_count, (
+        f"Frontend model_rejection_total ({metric_count}) does not match "
+        f"expected count ({expected_count})"
+    )
+
+
 def _test_router_overload_503(
    engine_workers,
    block_size: int,
@@ -612,11 +674,16 @@ def _test_router_overload_503(
    test_payload: dict,
    blocks_threshold: float = 0.2,
 ):
-    """Test that KV router returns 503 when all workers are busy.
+    """Test that 503 is returned when all workers are busy, and verify rejection metrics.

    Assumes engine_workers are already initialized. This function manages router lifecycle.
    Uses limited resources to intentionally trigger the overload condition.

+    Sends staggered requests (0.1s apart) to exhaust worker resources, then verifies:
+    1. At least one request succeeds (routed before busy state propagates)
+    2. At least one request is rejected with 503 (worker busy)
+    3. The frontend model_rejection_total metric matches the observed 503 count
+
    Args:
        engine_workers: Backend workers (mocker/vllm) already initialized with __enter__()
        block_size: Block size for KV cache (should be small to exhaust quickly, e.g. 4)
@@ -626,9 +693,8 @@ def _test_router_overload_503(
        blocks_threshold: Active decode blocks threshold for the router (default 0.2)

    Raises:
-        AssertionError: If 503 response is not received when expected
+        AssertionError: If success/rejection counts or metrics don't meet expectations
    """
-
    logger.info(
        f"Starting KV router frontend on port {frontend_port} with limited resources"
    )
@@ -662,8 +728,6 @@ def _test_router_overload_503(

        async def exhaust_resources_and_verify_503():
            stop_event = asyncio.Event()
-            overload_response = {}
-            unexpected_statuses = []

            async with aiohttp.ClientSession() as session:
                tasks = []
@@ -681,23 +745,24 @@ def _test_router_overload_503(
                                logger.info(
                                    f"Request {req_id} got expected 503: {body}"
                                )
-                                overload_response["status"] = response.status
-                                overload_response["body"] = body
                                stop_event.set()
+                                error_msg = body.get("message", "")
+                                assert (
+                                    "Service temporarily unavailable" in error_msg
+                                    or "All workers are busy" in error_msg
+                                ), f"Expected service overload error message, got: {body}"
                                return response.status

                            body = await response.text()
                            logger.info(
                                f"Request {req_id} got unexpected status {response.status}: {body}"
                            )
-                            unexpected_statuses.append((response.status, body))
                            return response.status
                    except asyncio.CancelledError:
                        raise
                    except Exception as e:
                        logger.info(f"Request {req_id} failed: {e}")
-                        unexpected_statuses.append(("exception", str(e)))
-                        return None
+                        raise

                try:
                    for i in range(50):
@@ -732,27 +797,42 @@ def _test_router_overload_503(
                    for task in pending:
                        task.cancel()
                    await asyncio.gather(*pending, return_exceptions=True)
-                    for task in done:
-                        task.result()

-                if overload_response.get("status") != 503:
-                    logger.error(
-                        f"Observed statuses before timeout: {unexpected_statuses}"
-                    )
-                    return False
+                return [t.result() for t in done]

-                error_msg = overload_response["body"].get("message", "")
-                assert (
-                    "Service temporarily unavailable" in error_msg
-                    or "All workers are busy" in error_msg
-                ), f"Expected service overload error message, got: {overload_response['body']}"
-                return True
+        results = asyncio.run(exhaust_resources_and_verify_503())
+
+        # Count outcomes
+        num_succeeded = sum(1 for s in results if s == 200)
+        num_rejected = sum(1 for s in results if s == 503)
+        num_other = sum(1 for s in results if s not in (200, 503))

-        # Run the test
-        success = asyncio.run(exhaust_resources_and_verify_503())
-        assert success, "Failed to verify 503 response when resources are exhausted"
+        logger.info(
+            f"Results: {num_succeeded} succeeded, {num_rejected} rejected (503), "
+            f"{num_other} other"
+        )

-        logger.info("Successfully verified 503 response when all workers are busy")
+        # Assert minimum thresholds
+        assert (
+            num_other == 0
+        ), f"Expected only 200 or 503 responses, but got {num_other} other"
+        assert (
+            num_rejected > 0
+        ), f"Expected at least 1 rejection, but got {num_rejected}"
+        assert (
+            num_succeeded > 0
+        ), f"Expected at least 1 success, but got {num_succeeded}"
+
+        # Verify rejection metrics from frontend /metrics endpoint
+        model_name = test_payload.get("model", "")
+        _verify_frontend_rejection_metrics(
+            frontend_port, model_name, "chat_completions", num_rejected
+        )
+
+        logger.info(
+            f"Successfully verified overload 503: {num_rejected} rejected, "
+            f"{num_succeeded} succeeded, metrics match"
+        )


 async def _zmq_replay_cycle(

--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -809,7 +809,7 @@ def test_mocker_two_kv_router(
 @pytest.mark.parametrize(
    "durable_kv_events", [False], ids=["nondurable"], indirect=True
 )  # Use NATS Core (local indexer)
-@pytest.mark.timeout(60)  # ~3x average (~19.86s), rounded up (when enabled)
+@pytest.mark.timeout(45)  # ~3x average (~13.10s), rounded up (when enabled)
 def test_mocker_kv_router_overload_503(
    request, runtime_services_dynamic_ports, predownload_tokenizers, durable_kv_events
 ):