Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
3205e7db
Unverified
Commit
3205e7db
authored
Apr 06, 2026
by
Jacky
Committed by
GitHub
Apr 07, 2026
Browse files
feat: Request Rejection Frontend metrics (#7644)
Signed-off-by:
Jacky
<
18255193+kthui@users.noreply.github.com
>
parent
91700375
Changes
12
Show whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
268 additions
and
75 deletions
+268
-75
lib/bindings/python/src/dynamo/prometheus_names.py
lib/bindings/python/src/dynamo/prometheus_names.py
+20
-10
lib/llm/src/grpc/service/openai.rs
lib/llm/src/grpc/service/openai.rs
+12
-5
lib/llm/src/grpc/service/tensor.rs
lib/llm/src/grpc/service/tensor.rs
+8
-1
lib/llm/src/http/service/anthropic.rs
lib/llm/src/http/service/anthropic.rs
+5
-0
lib/llm/src/http/service/metrics.rs
lib/llm/src/http/service/metrics.rs
+35
-0
lib/llm/src/http/service/openai.rs
lib/llm/src/http/service/openai.rs
+64
-26
lib/llm/src/migration.rs
lib/llm/src/migration.rs
+1
-1
lib/runtime/src/error.rs
lib/runtime/src/error.rs
+3
-0
lib/runtime/src/metrics/prometheus_names.rs
lib/runtime/src/metrics/prometheus_names.rs
+3
-0
lib/runtime/src/pipeline/network/egress/push_router.rs
lib/runtime/src/pipeline/network/egress/push_router.rs
+9
-4
tests/router/common.py
tests/router/common.py
+107
-27
tests/router/test_router_e2e_with_mockers.py
tests/router/test_router_e2e_with_mockers.py
+1
-1
No files found.
lib/bindings/python/src/dynamo/prometheus_names.py
View file @
3205e7db
...
...
@@ -110,6 +110,10 @@ class frontend_service:
MODEL_MIGRATION_LIMIT
=
"model_migration_limit"
# Total number of request migrations due to worker unavailability
MODEL_MIGRATION_TOTAL
=
"model_migration_total"
# Total number of request cancellations
MODEL_CANCELLATION_TOTAL
=
"model_cancellation_total"
# Total number of requests rejected due to resource exhaustion
MODEL_REJECTION_TOTAL
=
"model_rejection_total"
# Active decode blocks (KV cache blocks) per worker
# Gauge metric tracking current KV cache block utilization for each worker
WORKER_ACTIVE_DECODE_BLOCKS
=
"worker_active_decode_blocks"
...
...
@@ -239,25 +243,29 @@ class model_info:
class
name_prefix
:
"""Metric name prefixes used across the metrics system"""
"""Metric name prefixes used across the metrics system
.
"""
# Prefix for
all Prometheus metric names
.
# Prefix for
component-scoped metrics, auto-labeled with namespace/endpoint
.
COMPONENT
=
"dynamo_component"
# Prefix for frontend service metrics
# Prefix for frontend
HTTP
service metrics
(requests, TTFT, ITL, disconnects).
FRONTEND
=
"dynamo_frontend"
# Prefix for KV router
metrics (used with
router_id label)
# Prefix for KV router
instance metrics (carries `
router_id
`
label)
.
ROUTER
=
"dynamo_router"
# Prefix for request-plane (transport-agnostic) metrics at AddressedPushRouter
REQUEST_PLANE
=
"dynamo_request_plane"
# Prefix for tokio runtime metrics
TOKIO
=
"dynamo_tokio"
# Prefix for standalone KV indexer metrics
KVINDEXER
=
"dynamo_kvindexer"
# Prefix for transport-layer metrics (TCP / NATS)
# Prefix for request-plane metrics at AddressedPushRouter.
# Transport-agnostic: measures request lifecycle latency and concurrency
# (queue → send → roundtrip TTFT, inflight gauge).
REQUEST_PLANE
=
"dynamo_request_plane"
# Prefix for transport-layer metrics (TCP / NATS).
# Protocol-specific: measures wire-level health (bytes sent/received, error counts).
TRANSPORT
=
"dynamo_transport"
# Prefix for work-handler transport breakdown metrics (backend side)
WORK_HANDLER
=
"dynamo_work_handler"
# Prefix for routing overhead metrics (raw Prometheus, not component-scoped)
# Prefix for tokio runtime metrics (poll times, queue depths, stalls).
TOKIO
=
"dynamo_tokio"
# Prefix for per-phase routing overhead latency (hashing, scheduling).
# Raw Prometheus, not component-scoped.
ROUTING_OVERHEAD
=
"dynamo_routing_overhead"
...
...
@@ -401,6 +409,8 @@ class work_handler:
REQUEST_DURATION_SECONDS
=
"request_duration_seconds"
# Total number of errors in work handler processing
ERRORS_TOTAL
=
"errors_total"
# Total number of requests cancelled by work handler (client stop/kill or disconnect)
CANCELLATION_TOTAL
=
"cancellation_total"
# Network transit: frontend send to backend receive (wall-clock, cross-process)
NETWORK_TRANSIT_SECONDS
=
"network_transit_seconds"
# Backend processing: handle_payload entry to first response sent
...
...
lib/llm/src/grpc/service/openai.rs
View file @
3205e7db
...
...
@@ -55,8 +55,9 @@ pub async fn completion_response_stream(
// [WIP] from request id.
let
request_id
=
get_or_create_request_id
(
request
.inner.user
.as_deref
());
let
streaming
=
request
.inner.stream
.unwrap_or
(
false
);
let
model_name
=
request
.inner.model
.clone
();
let
cancellation_labels
=
CancellationLabels
{
model
:
request
.inner.
model
.clone
(),
model
:
model
_name
.clone
(),
endpoint
:
"grpc_completions"
.to_string
(),
request_type
:
if
streaming
{
"stream"
}
else
{
"unary"
}
.to_string
(),
};
...
...
@@ -101,10 +102,16 @@ pub async fn completion_response_stream(
let
annotations
=
request
.annotations
();
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
Status
::
internal
(
format!
(
"Failed to generate completions: {}"
,
e
)))
?
;
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
crate
::
http
::
service
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model_name
,
crate
::
http
::
service
::
metrics
::
Endpoint
::
Completions
,
);
return
Status
::
resource_exhausted
(
e
.to_string
());
}
Status
::
internal
(
format!
(
"Failed to generate completions: {}"
,
e
))
})
?
;
// capture the context to cancel the stream if the client disconnects
let
ctx
=
stream
.context
();
...
...
lib/llm/src/grpc/service/tensor.rs
View file @
3205e7db
...
...
@@ -60,8 +60,9 @@ pub async fn tensor_response_stream(
)
->
Result
<
impl
Stream
<
Item
=
Annotated
<
NvCreateTensorResponse
>>
,
Status
>
{
// create the context for the request
let
request_id
=
get_or_create_request_id
(
request
.id
.as_deref
());
let
model_name
=
request
.model
.clone
();
let
cancellation_labels
=
CancellationLabels
{
model
:
request
.
model
.clone
(),
model
:
model
_name
.clone
(),
endpoint
:
Endpoint
::
Tensor
.to_string
(),
request_type
:
if
streaming
{
"stream"
}
else
{
"unary"
}
.to_string
(),
};
...
...
@@ -103,6 +104,12 @@ pub async fn tensor_response_stream(
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
crate
::
http
::
service
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model_name
,
crate
::
http
::
service
::
metrics
::
Endpoint
::
Tensor
);
return
Status
::
resource_exhausted
(
e
.to_string
());
}
Status
::
internal
(
format!
(
"Failed to generate tensor response stream: {}"
,
e
))
})
?
;
...
...
lib/llm/src/http/service/anthropic.rs
View file @
3205e7db
...
...
@@ -284,6 +284,11 @@ async fn anthropic_messages(
tracing
::
trace!
(
"Issuing generate call for Anthropic messages"
);
let
engine_stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
AnthropicMessages
);
}
anthropic_error
(
StatusCode
::
INTERNAL_SERVER_ERROR
,
"api_error"
,
...
...
lib/llm/src/http/service/metrics.rs
View file @
3205e7db
...
...
@@ -27,6 +27,15 @@ use crate::local_model::runtime_config::ModelRuntimeConfig;
use
crate
::
model_card
::
ModelDeploymentCard
;
use
dynamo_runtime
::
metrics
::
prometheus_names
::
clamp_u64_to_i64
;
use
dynamo_runtime
::
error
::
ErrorType
as
DynamoErrorType
;
/// Check whether an error chain indicates the request was rejected.
pub
fn
request_was_rejected
(
err
:
&
(
dyn
std
::
error
::
Error
+
'static
))
->
bool
{
const
REJECTION
:
&
[
DynamoErrorType
]
=
&
[
DynamoErrorType
::
ResourceExhausted
];
const
NON_REJECTION
:
&
[
DynamoErrorType
]
=
&
[];
dynamo_runtime
::
error
::
match_error_chain
(
err
,
REJECTION
,
NON_REJECTION
)
}
pub
use
prometheus
::
Registry
;
use
super
::
RouteDoc
;
...
...
@@ -257,6 +266,7 @@ pub struct Metrics {
model_migration_limit
:
IntGaugeVec
,
model_migration_total
:
IntCounterVec
,
model_cancellation_total
:
IntCounterVec
,
model_rejection_total
:
IntCounterVec
,
}
// Inflight tracks requests from HTTP handler start until complete response is finished.
...
...
@@ -679,6 +689,15 @@ impl Metrics {
)
.unwrap
();
let
model_rejection_total
=
IntCounterVec
::
new
(
Opts
::
new
(
frontend_metric_name
(
frontend_service
::
MODEL_REJECTION_TOTAL
),
"Total number of requests rejected due to resource exhaustion"
,
),
&
[
"model"
,
"endpoint"
],
)
.unwrap
();
Metrics
{
request_counter
,
inflight_gauge
,
...
...
@@ -700,6 +719,7 @@ impl Metrics {
model_migration_limit
,
model_migration_total
,
model_cancellation_total
,
model_rejection_total
,
}
}
...
...
@@ -805,6 +825,7 @@ impl Metrics {
registry
.register
(
Box
::
new
(
self
.model_migration_limit
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.model_migration_total
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.model_cancellation_total
.clone
()))
?
;
registry
.register
(
Box
::
new
(
self
.model_rejection_total
.clone
()))
?
;
Ok
(())
}
...
...
@@ -902,6 +923,20 @@ impl Metrics {
.get
()
}
/// Increment the rejection counter for a request rejected due to resource exhaustion
pub
fn
inc_rejection
(
&
self
,
model
:
&
str
,
endpoint
:
Endpoint
)
{
self
.model_rejection_total
.with_label_values
(
&
[
model
,
&
endpoint
.to_string
()])
.inc
();
}
/// Get the current rejection count for a model and endpoint
pub
fn
get_rejection_count
(
&
self
,
model
:
&
str
,
endpoint
:
Endpoint
)
->
u64
{
self
.model_rejection_total
.with_label_values
(
&
[
model
,
&
endpoint
.to_string
()])
.get
()
}
/// Create a new [`InflightGuard`] for the given model and annotate if its a streaming request,
/// and the kind of endpoint that was hit
///
...
...
lib/llm/src/http/service/openai.rs
View file @
3205e7db
...
...
@@ -194,18 +194,12 @@ impl ErrorMessage {
/// If successful, it will return the [`HttpError`] as an [`ErrorMessage::internal_server_error`]
/// with the details of the error.
pub
fn
from_anyhow
(
err
:
anyhow
::
Error
,
alt_msg
:
&
str
)
->
ErrorResponse
{
// First check for PipelineError::ServiceOverloaded
if
let
Some
(
pipeline_err
)
=
err
.downcast_ref
::
<
dynamo_runtime
::
pipeline
::
error
::
PipelineError
>
()
&&
matches!
(
pipeline_err
,
dynamo_runtime
::
pipeline
::
error
::
PipelineError
::
ServiceOverloaded
(
_
)
)
{
// Check for ResourceExhausted anywhere in the error chain → HTTP 503
if
super
::
metrics
::
request_was_rejected
(
err
.as_ref
())
{
return
(
StatusCode
::
SERVICE_UNAVAILABLE
,
Json
(
ErrorMessage
{
message
:
pipeline_
err
.to_string
(),
message
:
err
.to_string
(),
error_type
:
map_error_code_to_error_type
(
StatusCode
::
SERVICE_UNAVAILABLE
),
code
:
StatusCode
::
SERVICE_UNAVAILABLE
.as_u16
(),
}),
...
...
@@ -470,6 +464,11 @@ async fn completions_single(
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Completions
);
}
let
err_response
=
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate completions"
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
...
...
@@ -621,6 +620,11 @@ async fn completions_batch(
// Generate stream for this prompt
let
stream
=
engine
.generate
(
single_request_context
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Completions
);
}
let
err_response
=
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate completions"
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
...
...
@@ -775,9 +779,15 @@ async fn embeddings(
})
?
;
let
mut
response_collector
=
state
.metrics_clone
()
.create_response_collector
(
model
);
let
model_name
=
model
.to_string
();
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model_name
,
super
::
metrics
::
Endpoint
::
Embeddings
);
}
let
err_response
=
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate embeddings"
);
inflight
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
...
...
@@ -1184,6 +1194,11 @@ async fn chat_completions(
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
ChatCompletions
);
}
let
err_response
=
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate completions"
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
...
...
@@ -1590,6 +1605,11 @@ async fn responses(
// issue the generate call on the engine
let
engine_stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Responses
);
}
let
err_response
=
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate completions"
);
inflight_guard
.mark_error
(
extract_error_type_from_response
(
&
err_response
));
err_response
...
...
@@ -1972,10 +1992,14 @@ async fn images(
// Note: This uses ServerStreamingEngine for internal routing/distribution,
// NOT for client-facing SSE streaming. The stream is immediately folded into
// a single response below.
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate images"
))
?
;
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Images
);
}
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate images"
)
})
?
;
// Process stream to collect metrics and drop http_queue_guard on first response
let
mut
http_queue_guard
=
Some
(
http_queue_guard
);
...
...
@@ -2055,10 +2079,14 @@ async fn videos(
let
mut
response_collector
=
state
.metrics_clone
()
.create_response_collector
(
&
model
);
// issue the generate call on the engine
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate videos"
))
?
;
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Videos
);
}
ErrorMessage
::
from_anyhow
(
e
,
"Failed to generate videos"
)
})
?
;
// Process stream to collect metrics and drop http_queue_guard on first token
let
mut
http_queue_guard
=
Some
(
http_queue_guard
);
...
...
@@ -2116,10 +2144,14 @@ async fn video_stream(
let
mut
response_collector
=
state
.metrics_clone
()
.create_response_collector
(
&
model
);
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
ErrorMessage
::
from_anyhow
(
e
,
"Failed to start video stream"
))
?
;
let
stream
=
engine
.generate
(
request
)
.await
.map_err
(|
e
|
{
if
super
::
metrics
::
request_was_rejected
(
e
.as_ref
())
{
state
.metrics_clone
()
.inc_rejection
(
&
model
,
super
::
metrics
::
Endpoint
::
Videos
);
}
ErrorMessage
::
from_anyhow
(
e
,
"Failed to start video stream"
)
})
?
;
// Capture the context to cancel the stream if the client disconnects.
let
ctx
=
stream
.context
();
...
...
@@ -2435,18 +2467,24 @@ mod tests {
}
#[test]
fn
test_service_overloaded_error_response_from_anyhow
()
{
fn
test_resource_exhausted_error_response_from_anyhow
()
{
use
dynamo_runtime
::
error
::{
DynamoError
,
ErrorType
};
use
dynamo_runtime
::
pipeline
::
error
::
PipelineError
;
let
err
:
anyhow
::
Error
=
PipelineError
::
ServiceOverloaded
(
let
cause
=
PipelineError
::
ServiceOverloaded
(
"All workers are busy, please retry later"
.to_string
(),
)
);
let
err
:
anyhow
::
Error
=
DynamoError
::
builder
()
.error_type
(
ErrorType
::
ResourceExhausted
)
.message
(
"All workers are busy, please retry later"
)
.cause
(
cause
)
.build
()
.into
();
let
response
=
ErrorMessage
::
from_anyhow
(
err
,
BACKUP_ERROR_MESSAGE
);
assert_eq!
(
response
.0
,
StatusCode
::
SERVICE_UNAVAILABLE
);
assert_eq!
(
response
.1
.message
,
"
Service temporarily unavailable
: All workers are busy, please retry later"
"
ResourceExhausted
: All workers are busy, please retry later"
);
}
...
...
lib/llm/src/migration.rs
View file @
3205e7db
...
...
@@ -27,7 +27,7 @@ fn is_migratable(err: &(dyn StdError + 'static)) -> bool {
ErrorType
::
ConnectionTimeout
,
ErrorType
::
Backend
(
BackendError
::
EngineShutdown
),
];
const
NON_MIGRATABLE
:
&
[
ErrorType
]
=
&
[
ErrorType
::
Cancelled
];
const
NON_MIGRATABLE
:
&
[
ErrorType
]
=
&
[
ErrorType
::
Cancelled
,
ErrorType
::
ResourceExhausted
];
error
::
match_error_chain
(
err
,
MIGRATABLE
,
NON_MIGRATABLE
)
}
...
...
lib/runtime/src/error.rs
View file @
3205e7db
...
...
@@ -53,6 +53,8 @@ pub enum ErrorType {
ConnectionTimeout
,
/// The request was cancelled (e.g., client disconnected).
Cancelled
,
/// The system does not have enough resources to handle the request.
ResourceExhausted
,
/// Error originating from a backend engine.
Backend
(
BackendError
),
}
...
...
@@ -66,6 +68,7 @@ impl fmt::Display for ErrorType {
ErrorType
::
Disconnected
=>
write!
(
f
,
"Disconnected"
),
ErrorType
::
ConnectionTimeout
=>
write!
(
f
,
"ConnectionTimeout"
),
ErrorType
::
Cancelled
=>
write!
(
f
,
"Cancelled"
),
ErrorType
::
ResourceExhausted
=>
write!
(
f
,
"ResourceExhausted"
),
ErrorType
::
Backend
(
sub
)
=>
write!
(
f
,
"Backend{sub}"
),
}
}
...
...
lib/runtime/src/metrics/prometheus_names.rs
View file @
3205e7db
...
...
@@ -235,6 +235,9 @@ pub mod frontend_service {
/// Total number of request cancellations
pub
const
MODEL_CANCELLATION_TOTAL
:
&
str
=
"model_cancellation_total"
;
/// Total number of requests rejected due to resource exhaustion
pub
const
MODEL_REJECTION_TOTAL
:
&
str
=
"model_rejection_total"
;
/// Active decode blocks (KV cache blocks) per worker
/// Gauge metric tracking current KV cache block utilization for each worker
pub
const
WORKER_ACTIVE_DECODE_BLOCKS
:
&
str
=
"worker_active_decode_blocks"
;
...
...
lib/runtime/src/pipeline/network/egress/push_router.rs
View file @
3205e7db
...
...
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
use
super
::{
AsyncEngineContextProvider
,
ResponseStream
};
use
crate
::
error
::{
BackendError
,
ErrorType
,
match_error_chain
};
use
crate
::
error
::{
BackendError
,
DynamoError
,
ErrorType
,
match_error_chain
};
/// Check if an error chain indicates the worker should be reported as down.
fn
is_inhibited
(
err
:
&
(
dyn
std
::
error
::
Error
+
'static
))
->
bool
{
...
...
@@ -518,9 +518,14 @@ where
total_workers
=
all_instances
.len
(),
"Rejecting request: all workers are busy"
);
r
et
urn
Err
(
PipelineError
::
ServiceOverloaded
(
l
et
cause
=
PipelineError
::
ServiceOverloaded
(
"All workers are busy, please retry later"
.to_string
(),
)
);
return
Err
(
DynamoError
::
builder
()
.error_type
(
ErrorType
::
ResourceExhausted
)
.message
(
"All workers are busy, please retry later"
)
.cause
(
cause
)
.build
()
.into
());
}
}
...
...
tests/router/common.py
View file @
3205e7db
...
...
@@ -11,8 +11,10 @@ from typing import TYPE_CHECKING, Any, Optional
import
aiohttp
import
nats
import
requests
from
dynamo.llm
import
AicPerfConfig
,
KvRouter
,
KvRouterConfig
from
dynamo.prometheus_names
import
frontend_service
,
name_prefix
from
tests.router.helper
import
(
_nats_server
,
assert_event_dumps_equal
,
...
...
@@ -604,6 +606,66 @@ def _test_router_query_instance_id(
logger
.
info
(
f
"Token count:
{
result
[
'token_count'
]
}
"
)
def
_parse_frontend_rejection_metric
(
metrics_text
:
str
,
model_name
:
str
,
endpoint
:
str
)
->
int
:
"""Parse frontend model_rejection_total from Prometheus metrics text.
Args:
metrics_text: Raw Prometheus metrics text
model_name: The model name label value
endpoint: The endpoint label value (e.g. "chat_completions")
Returns:
The metric count, or 0 if not found
"""
metric_name
=
f
"
{
name_prefix
.
FRONTEND
}
_
{
frontend_service
.
MODEL_REJECTION_TOTAL
}
"
for
line
in
metrics_text
.
splitlines
():
if
not
line
.
startswith
(
f
"
{
metric_name
}
{{"
):
continue
if
f
'model="
{
model_name
}
"'
in
line
and
f
'endpoint="
{
endpoint
}
"'
in
line
:
parts
=
line
.
rsplit
(
None
,
1
)
if
len
(
parts
)
==
2
:
try
:
return
int
(
float
(
parts
[
1
]))
except
ValueError
:
pass
return
0
def
_verify_frontend_rejection_metrics
(
frontend_port
:
int
,
model_name
:
str
,
endpoint
:
str
,
expected_count
:
int
,
)
->
None
:
"""Verify frontend rejection metrics by scraping the /metrics endpoint.
Args:
frontend_port: Port where the frontend /metrics is served
model_name: The model name label value
endpoint: The endpoint label value (e.g. "chat_completions")
expected_count: Expected rejection count to match exactly
"""
metrics_url
=
f
"http://localhost:
{
frontend_port
}
/metrics"
try
:
metrics_response
=
requests
.
get
(
metrics_url
,
timeout
=
5
)
metrics_response
.
raise_for_status
()
except
requests
.
RequestException
as
e
:
raise
AssertionError
(
f
"Failed to fetch frontend metrics from
{
metrics_url
}
:
{
e
}
"
)
from
e
metric_count
=
_parse_frontend_rejection_metric
(
metrics_response
.
text
,
model_name
,
endpoint
)
logger
.
info
(
f
"Frontend rejection metric: model_rejection_total=
{
metric_count
}
"
)
assert
metric_count
==
expected_count
,
(
f
"Frontend model_rejection_total (
{
metric_count
}
) does not match "
f
"expected count (
{
expected_count
}
)"
)
def
_test_router_overload_503
(
engine_workers
,
block_size
:
int
,
...
...
@@ -612,11 +674,16 @@ def _test_router_overload_503(
test_payload
:
dict
,
blocks_threshold
:
float
=
0.2
,
):
"""Test that
KV router
return
s 503
when all workers are busy.
"""Test that
503 is
return
ed
when all workers are busy
, and verify rejection metrics
.
Assumes engine_workers are already initialized. This function manages router lifecycle.
Uses limited resources to intentionally trigger the overload condition.
Sends staggered requests (0.1s apart) to exhaust worker resources, then verifies:
1. At least one request succeeds (routed before busy state propagates)
2. At least one request is rejected with 503 (worker busy)
3. The frontend model_rejection_total metric matches the observed 503 count
Args:
engine_workers: Backend workers (mocker/vllm) already initialized with __enter__()
block_size: Block size for KV cache (should be small to exhaust quickly, e.g. 4)
...
...
@@ -626,9 +693,8 @@ def _test_router_overload_503(
blocks_threshold: Active decode blocks threshold for the router (default 0.2)
Raises:
AssertionError: If
503 response is not received when
expect
ed
AssertionError: If
success/rejection counts or metrics don't meet
expect
ations
"""
logger
.
info
(
f
"Starting KV router frontend on port
{
frontend_port
}
with limited resources"
)
...
...
@@ -662,8 +728,6 @@ def _test_router_overload_503(
async
def
exhaust_resources_and_verify_503
():
stop_event
=
asyncio
.
Event
()
overload_response
=
{}
unexpected_statuses
=
[]
async
with
aiohttp
.
ClientSession
()
as
session
:
tasks
=
[]
...
...
@@ -681,23 +745,24 @@ def _test_router_overload_503(
logger
.
info
(
f
"Request
{
req_id
}
got expected 503:
{
body
}
"
)
overload_response
[
"status"
]
=
response
.
status
overload_response
[
"body"
]
=
body
stop_event
.
set
()
error_msg
=
body
.
get
(
"message"
,
""
)
assert
(
"Service temporarily unavailable"
in
error_msg
or
"All workers are busy"
in
error_msg
),
f
"Expected service overload error message, got:
{
body
}
"
return
response
.
status
body
=
await
response
.
text
()
logger
.
info
(
f
"Request
{
req_id
}
got unexpected status
{
response
.
status
}
:
{
body
}
"
)
unexpected_statuses
.
append
((
response
.
status
,
body
))
return
response
.
status
except
asyncio
.
CancelledError
:
raise
except
Exception
as
e
:
logger
.
info
(
f
"Request
{
req_id
}
failed:
{
e
}
"
)
unexpected_statuses
.
append
((
"exception"
,
str
(
e
)))
return
None
raise
try
:
for
i
in
range
(
50
):
...
...
@@ -732,27 +797,42 @@ def _test_router_overload_503(
for
task
in
pending
:
task
.
cancel
()
await
asyncio
.
gather
(
*
pending
,
return_exceptions
=
True
)
for
task
in
done
:
task
.
result
()
if
overload_response
.
get
(
"status"
)
!=
503
:
logger
.
error
(
f
"Observed statuses before timeout:
{
unexpected_statuses
}
"
return
[
t
.
result
()
for
t
in
done
]
results
=
asyncio
.
run
(
exhaust_resources_and_verify_503
())
# Count outcomes
num_succeeded
=
sum
(
1
for
s
in
results
if
s
==
200
)
num_rejected
=
sum
(
1
for
s
in
results
if
s
==
503
)
num_other
=
sum
(
1
for
s
in
results
if
s
not
in
(
200
,
503
))
logger
.
info
(
f
"Results:
{
num_succeeded
}
succeeded,
{
num_rejected
}
rejected (503), "
f
"
{
num_other
}
other"
)
return
False
error_msg
=
overload_response
[
"body"
].
get
(
"message"
,
""
)
# Assert minimum thresholds
assert
(
"Service temporarily unavailable"
in
error_msg
or
"All workers are busy"
in
error_msg
),
f
"Expected service overload error message, got:
{
overload_response
[
'body'
]
}
"
return
True
num_other
==
0
),
f
"Expected only 200 or 503 responses, but got
{
num_other
}
other"
assert
(
num_rejected
>
0
),
f
"Expected at least 1 rejection, but got
{
num_rejected
}
"
assert
(
num_succeeded
>
0
),
f
"Expected at least 1 success, but got
{
num_succeeded
}
"
# Run the test
success
=
asyncio
.
run
(
exhaust_resources_and_verify_503
())
assert
success
,
"Failed to verify 503 response when resources are exhausted"
# Verify rejection metrics from frontend /metrics endpoint
model_name
=
test_payload
.
get
(
"model"
,
""
)
_verify_frontend_rejection_metrics
(
frontend_port
,
model_name
,
"chat_completions"
,
num_rejected
)
logger
.
info
(
"Successfully verified 503 response when all workers are busy"
)
logger
.
info
(
f
"Successfully verified overload 503:
{
num_rejected
}
rejected, "
f
"
{
num_succeeded
}
succeeded, metrics match"
)
async
def
_zmq_replay_cycle
(
...
...
tests/router/test_router_e2e_with_mockers.py
View file @
3205e7db
...
...
@@ -809,7 +809,7 @@ def test_mocker_two_kv_router(
@
pytest
.
mark
.
parametrize
(
"durable_kv_events"
,
[
False
],
ids
=
[
"nondurable"
],
indirect
=
True
)
# Use NATS Core (local indexer)
@
pytest
.
mark
.
timeout
(
60
)
# ~3x average (~1
9.86
s), rounded up (when enabled)
@
pytest
.
mark
.
timeout
(
45
)
# ~3x average (~1
3.10
s), rounded up (when enabled)
def
test_mocker_kv_router_overload_503
(
request
,
runtime_services_dynamic_ports
,
predownload_tokenizers
,
durable_kv_events
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment