Unverified commit d84790db, authored Nov 04, 2025 by fzyzcjy, committed by GitHub Nov 04, 2025

Support aggregating engine metrics in sgl-router (#11456)
parent 0678beaa
Showing 6 changed files with 456 additions and 1 deletion (+456 −1)
sgl-router/Cargo.toml                        +1 −0
sgl-router/src/core/metrics_aggregator.rs    +91 −0
sgl-router/src/core/mod.rs                   +1 −0
sgl-router/src/core/worker_manager.rs        +87 −1
sgl-router/src/server.rs                     +5 −0
sgl-router/tests/metrics_aggregator_test.rs  +271 −0
sgl-router/Cargo.toml
...
...
@@ -84,6 +84,7 @@ subtle = "2.6"
rustpython-parser = "0.4.0"
num-traits = "0.2"
openai-harmony = { git = "https://github.com/openai/harmony", tag = "v0.0.4" }
+openmetrics-parser = "0.4.4"
# gRPC and Protobuf dependencies
tonic = { version = "0.14.2", features = ["gzip", "transport"] }
...
...
sgl-router/src/core/metrics_aggregator.rs  0 → 100644 (new file)
use anyhow::ensure;
use openmetrics_parser::{MetricFamily, MetricsExposition, PrometheusType, PrometheusValue};
use tracing::warn;

#[derive(Debug)]
pub struct MetricPack {
    pub labels: Vec<(String, String)>,
    pub metrics_text: String,
}

type PrometheusExposition = MetricsExposition<PrometheusType, PrometheusValue>;
type PrometheusFamily = MetricFamily<PrometheusType, PrometheusValue>;

/// Aggregate Prometheus metrics scraped from multiple sources into a unified one
pub fn aggregate_metrics(metric_packs: Vec<MetricPack>) -> anyhow::Result<String> {
    let mut expositions = vec![];
    for metric_pack in metric_packs {
        let metrics_text = &metric_pack.metrics_text;
        // Hacky workaround since the parser does not understand `:`; should improve later
        let metrics_text = metrics_text.replace(":", "_");
        let exposition = match openmetrics_parser::prometheus::parse_prometheus(&metrics_text) {
            Ok(x) => x,
            Err(err) => {
                warn!(
                    "aggregate_metrics error when parsing text: pack={:?} err={:?}",
                    metric_pack, err
                );
                continue;
            }
        };
        let exposition = transform_metrics(exposition, &metric_pack.labels);
        expositions.push(exposition);
    }

    let text = try_reduce(expositions.into_iter(), merge_exposition)?
        .map(|x| format!("{x}"))
        .unwrap_or_default();
    Ok(text)
}

fn transform_metrics(
    mut exposition: PrometheusExposition,
    extra_labels: &[(String, String)],
) -> PrometheusExposition {
    for family in exposition.families.values_mut() {
        *family = family.with_labels(extra_labels.iter().map(|(k, v)| (k.as_str(), v.as_str())));
    }
    exposition
}

fn merge_exposition(
    a: PrometheusExposition,
    b: PrometheusExposition,
) -> anyhow::Result<PrometheusExposition> {
    let mut ans = a;
    for (name, family_b) in b.families.into_iter() {
        let family_merged = if let Some(family_a) = ans.families.remove(&name) {
            merge_family(family_a, family_b)?
        } else {
            family_b
        };
        ans.families.insert(name, family_merged);
    }
    Ok(ans)
}

fn merge_family(a: PrometheusFamily, b: PrometheusFamily) -> anyhow::Result<PrometheusFamily> {
    ensure!(
        a.get_label_names() == b.get_label_names(),
        "Label names should agree a={:?} b={:?}",
        a.get_label_names(),
        b.get_label_names()
    );
    a.with_samples(b.into_iter_samples())
        .map_err(|e| anyhow::anyhow!("failed to merge samples: {e:?}"))
}

pub fn try_reduce<I, T, E, F>(iterable: I, f: F) -> Result<Option<T>, E>
where
    I: IntoIterator<Item = T>,
    F: FnMut(T, T) -> Result<T, E>,
{
    let mut it = iterable.into_iter();
    let first = match it.next() {
        None => return Ok(None),
        Some(x) => x,
    };
    Ok(Some(it.try_fold(first, f)?))
}
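try_reduce is a small generic helper; the sketch below is not part of the commit and only illustrates its contract, assuming the same public path the tests use: an empty iterator yields Ok(None), a non-empty one folds left, and the first Err short-circuits (which is how a failed merge_exposition aborts aggregation above).

fn try_reduce_demo() {
    use sglang_router_rs::core::metrics_aggregator::try_reduce;

    // Empty input: nothing to reduce, so Ok(None).
    let empty: Vec<i32> = vec![];
    assert_eq!(try_reduce(empty, |a, b| Ok::<_, &str>(a + b)), Ok(None));

    // Non-empty input: a plain left fold over the elements.
    assert_eq!(try_reduce(vec![1, 2, 3], |a, b| Ok::<_, &str>(a + b)), Ok(Some(6)));

    // The first Err stops the fold and is returned as-is.
    let failed = try_reduce(vec![1, 0, 3], |a, b| {
        if b == 0 { Err("zero") } else { Ok(a + b) }
    });
    assert_eq!(failed, Err("zero"));
}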
sgl-router/src/core/mod.rs
...
...
@@ -11,6 +11,7 @@
pub mod circuit_breaker;
pub mod error;
pub mod job_queue;
+pub mod metrics_aggregator;
pub mod retry;
pub mod token_bucket;
pub mod worker;
...
...
sgl-router/src/core/worker_manager.rs
...
...
@@ -5,7 +5,9 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use axum::response::{IntoResponse, Response};
use futures::future;
use http::{Method, StatusCode};
use serde_json::Value;
use tokio::{
    sync::{watch, Mutex},
...
...
@@ -14,7 +16,7 @@ use tokio::{
use tracing::{debug, error, info, warn};

use crate::{
-    core::{ConnectionMode, WorkerRegistry, WorkerType},
+    core::{metrics_aggregator::MetricPack, ConnectionMode, WorkerRegistry, WorkerType},
    policies::PolicyRegistry,
    protocols::worker_spec::{FlushCacheResult, WorkerLoadInfo, WorkerLoadsResult},
};
...
...
@@ -234,6 +236,90 @@ impl WorkerManager {
            failed,
        }
    }

    pub async fn get_engine_metrics(
        worker_registry: &WorkerRegistry,
        client: &reqwest::Client,
    ) -> Response {
        let engine_responses =
            match Self::fan_out_simple_request(worker_registry, client, "metrics", Method::GET)
                .await
            {
                Ok(x) => x,
                Err(e) => return e,
            };

        let engine_responses = engine_responses
            .into_iter()
            .map(|(worker_base_url, metrics_text)| MetricPack {
                labels: vec![("worker_addr".into(), worker_base_url)],
                metrics_text,
            })
            .collect();

        let text = match crate::core::metrics_aggregator::aggregate_metrics(engine_responses) {
            Ok(x) => x,
            Err(e) => {
                let error_msg = format!("Failed to aggregate metrics: {}", e);
                return (StatusCode::INTERNAL_SERVER_ERROR, error_msg).into_response();
            }
        };

        (StatusCode::OK, text).into_response()
    }

    async fn fan_out_simple_request(
        worker_registry: &WorkerRegistry,
        client: &reqwest::Client,
        endpoint: &str,
        method: Method,
    ) -> Result<Vec<(String, String)>, Response> {
        let workers = worker_registry.get_all();
        if workers.is_empty() {
            return Err((StatusCode::SERVICE_UNAVAILABLE, "No available workers").into_response());
        }

        let mut responses = vec![];
        // May do parallel requests later
        for worker in workers {
            let worker_url = worker.url().to_string();
            let url = format!("{}/{}", worker_url, endpoint);

            let mut request_builder = match method {
                Method::GET => client.get(url),
                Method::POST => client.post(url),
                _ => {
                    return Err((
                        StatusCode::METHOD_NOT_ALLOWED,
                        "Unsupported method for simple routing",
                    )
                        .into_response())
                }
            };

            if let Some(api_key) = worker.api_key() {
                request_builder =
                    request_builder.header("Authorization", format!("Bearer {}", api_key));
            }

            match request_builder.send().await {
                Ok(res) => {
                    let status = StatusCode::from_u16(res.status().as_u16())
                        .unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
                    match res.text().await {
                        Ok(body_text) => {
                            if status.is_success() {
                                responses.push((worker_url, body_text));
                            }
                        }
                        Err(e) => {
                            warn!("fan_out_simple_request failed when reading text: {}", e)
                        }
                    }
                }
                Err(e) => warn!("fan_out_simple_request failed when sending: {}", e),
            }
        }

        Ok(responses)
    }
}
/// Load monitoring service that periodically fetches worker loads
...
...
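The loop in fan_out_simple_request is sequential for now ("May do parallel requests later"). A minimal sketch of that follow-up, not part of this commit, using futures::future::join_all (the file already imports futures::future) to issue the scrapes concurrently:

use futures::future;

// Hypothetical helper: concurrently GET `{url}/metrics` from every worker and
// keep only the successful bodies, mirroring what the sequential loop does.
async fn fetch_metrics_concurrently(
    client: &reqwest::Client,
    worker_urls: Vec<String>,
) -> Vec<(String, String)> {
    let tasks = worker_urls.into_iter().map(|url| {
        // reqwest::Client is a cheap handle; clone it into each task.
        let client = client.clone();
        async move {
            let body = client
                .get(format!("{url}/metrics"))
                .send()
                .await
                .ok()?
                .error_for_status()
                .ok()?
                .text()
                .await
                .ok()?;
            Some((url, body))
        }
    });
    // Await all scrapes at once and drop the failed ones.
    future::join_all(tasks).await.into_iter().flatten().collect()
}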
sgl-router/src/server.rs
...
...
@@ -115,6 +115,10 @@ async fn health_generate(State(state): State<Arc<AppState>>, req: Request) -> Re
    state.router.health_generate(req).await
}

+async fn engine_metrics(State(state): State<Arc<AppState>>) -> Response {
+    WorkerManager::get_engine_metrics(&state.context.worker_registry, &state.context.client).await
+}
+
async fn get_server_info(State(state): State<Arc<AppState>>, req: Request) -> Response {
    state.router.get_server_info(req).await
}
...
...
@@ -641,6 +645,7 @@ pub fn build_app(
        .route("/readiness", get(readiness))
        .route("/health", get(health))
        .route("/health_generate", get(health_generate))
+        .route("/engine_metrics", get(engine_metrics))
        .route("/v1/models", get(v1_models))
        .route("/get_model_info", get(get_model_info))
        .route("/get_server_info", get(get_server_info));
...
...
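With the route registered, the aggregated metrics can be pulled straight from the router. A quick smoke check (a sketch; the host and port are assumptions, adjust them to your deployment):

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Every sample in the body should carry the injected worker_addr label.
    let body = reqwest::get("http://127.0.0.1:30000/engine_metrics")
        .await?
        .text()
        .await?;
    println!("{body}");
    Ok(())
}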
sgl-router/tests/metrics_aggregator_test.rs  0 → 100644 (new file)
use sglang_router_rs::core::metrics_aggregator::{aggregate_metrics, MetricPack};
#[test]
fn test_aggregate_simple() {
    let pack1 = MetricPack {
        labels: vec![("source".to_string(), "worker1".to_string())],
        metrics_text: r#"
# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{method="post",code="200"} 1027
http_requests_total{method="post",code="400"} 3
"#
.to_string
(),
};
let
pack2
=
MetricPack
{
labels
:
vec!
[(
"source"
.to_string
(),
"worker2"
.to_string
())],
metrics_text
:
r#"
# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{method="post",code="200"} 500
"#
.to_string
(),
};
let
result
=
aggregate_metrics
(
vec!
[
pack1
,
pack2
])
.unwrap
();
let
expected
=
r#"# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{code="200",method="post",source="worker1"} 1027
http_requests_total{code="400",method="post",source="worker1"} 3
http_requests_total{code="200",method="post",source="worker2"} 500
"#
;
assert_eq!
(
result
.trim
(),
expected
.trim
());
}
#[test]
fn test_aggregate_multiple_metrics() {
    let pack1 = MetricPack {
        labels: vec![("source".to_string(), "w1".to_string())],
        metrics_text: r#"
# TYPE metric_a gauge
metric_a{dim="x"} 1.0
# TYPE metric_b_total counter
metric_b_total 10
"#
.to_string
(),
};
let
pack2
=
MetricPack
{
labels
:
vec!
[(
"source"
.to_string
(),
"w2"
.to_string
())],
metrics_text
:
r#"
# TYPE metric_a gauge
metric_a{dim="y"} 2.0
"#
.to_string
(),
};
let
result
=
aggregate_metrics
(
vec!
[
pack1
,
pack2
])
.unwrap
();
let
expected
=
r#"# TYPE metric_a gauge
metric_a{dim="x",source="w1"} 1
metric_a{dim="y",source="w2"} 2
# TYPE metric_b_total counter
metric_b_total{source="w1"} 10
"#
;
assert_eq_sorted
(
&
result
,
expected
);
}
#[test]
fn test_empty_input() {
    let result = aggregate_metrics(vec![]).unwrap();
    assert_eq!(result, "");
}
#[test]
fn test_invalid_metrics_are_skipped() {
    let pack1 = MetricPack {
        labels: vec![("source".to_string(), "worker1".to_string())],
        metrics_text: "invalid metrics text".to_string(),
    };

    let pack2 = MetricPack {
        labels: vec![("source".to_string(), "worker2".to_string())],
        metrics_text: "# TYPE valid_metric gauge\nvalid_metric 123\n".to_string(),
    };

    let result = aggregate_metrics(vec![pack1, pack2]).unwrap();

    let expected = r#"# TYPE valid_metric gauge
valid_metric{source="worker2"} 123
"#
;
assert_eq!
(
result
.trim
(),
expected
.trim
());
}
#[test]
fn test_real() {
    let pack1 = MetricPack {
        labels: vec![("source".to_string(), "worker1".to_string())],
        // https://docs.sglang.ai/references/production_metrics.html
        metrics_text: r###"# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
# HELP sglang:token_usage The token usage
# TYPE sglang:token_usage gauge
sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang:time_to_first_token_seconds histogram
sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang:e2e_request_latency_seconds histogram
sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang:time_per_output_token_seconds histogram
sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
# HELP sglang:func_latency_seconds Function latency in seconds
# TYPE sglang:func_latency_seconds histogram
sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
sglang:func_latency_seconds_count{name="generate_request"} 14007.0
# HELP sglang:num_running_reqs The number of running requests
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
# HELP sglang:num_used_tokens The number of used tokens
# TYPE sglang:num_used_tokens gauge
sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
# HELP sglang:gen_throughput The generate throughput (token/s)
# TYPE sglang:gen_throughput gauge
sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
# HELP sglang:num_queue_reqs The number of requests in the waiting queue
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
"###
.to_string
(),
};
let
pack2
=
MetricPack
{
labels
:
vec!
[(
"source"
.to_string
(),
"worker2"
.to_string
())],
metrics_text
:
pack1
.metrics_text
.clone
(),
};
let
result
=
aggregate_metrics
(
vec!
[
pack1
,
pack2
])
.unwrap
();
let
expected
=
r###"# HELP sglang_token_usage The token usage
# TYPE sglang_token_usage gauge
sglang_token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 0.28
sglang_token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 0.28
# HELP sglang_time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang_time_to_first_token_seconds histogram
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.001"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.005"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.01"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 11008
sglang_time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 2351897.9474117756
sglang_time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 11008
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.001"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.005"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.01"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 11008
sglang_time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 2351897.9474117756
sglang_time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 11008
# HELP sglang_time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang_time_per_output_token_seconds histogram
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.005"} 1
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.01"} 73
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.015"} 382
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 7400757
sglang_time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 866964.5791549598
sglang_time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 7400757
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.005"} 1
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.01"} 73
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.015"} 382
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 7400757
sglang_time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 866964.5791549598
sglang_time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 7400757
# HELP sglang_func_latency_seconds Function latency in seconds
# TYPE sglang_func_latency_seconds histogram
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.05"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.07500000000000001"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.1125"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.16875"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="+Inf"} 14007
sglang_func_latency_seconds_sum{name="generate_request",source="worker1"} 4.514771912145079
sglang_func_latency_seconds_count{name="generate_request",source="worker1"} 14007
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.05"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.07500000000000001"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.1125"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.16875"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="+Inf"} 14007
sglang_func_latency_seconds_sum{name="generate_request",source="worker2"} 4.514771912145079
sglang_func_latency_seconds_count{name="generate_request",source="worker2"} 14007
# HELP sglang_num_used_tokens The number of used tokens
# TYPE sglang_num_used_tokens gauge
sglang_num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 123859
sglang_num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 123859
# HELP sglang_cache_hit_rate The cache hit rate
# TYPE sglang_cache_hit_rate gauge
sglang_cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 0.007507552643049313
sglang_cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 0.007507552643049313
# HELP sglang_num_queue_reqs The number of requests in the waiting queue
# TYPE sglang_num_queue_reqs gauge
sglang_num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 2826
sglang_num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 2826
# HELP sglang_generation_tokens_total Number of generation tokens processed.
# TYPE sglang_generation_tokens_total counter
sglang_generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 7557572
sglang_generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 7557572
# HELP sglang_num_running_reqs The number of running requests
# TYPE sglang_num_running_reqs gauge
sglang_num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 162
sglang_num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 162
# HELP sglang_e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang_e2e_request_latency_seconds histogram
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.3"} 0
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.5"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.8"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 11228
sglang_e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 3116093.850019932
sglang_e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 11228
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.3"} 0
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.5"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.8"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 11228
sglang_e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 3116093.850019932
sglang_e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 11228
# HELP sglang_gen_throughput The generate throughput (token/s)
# TYPE sglang_gen_throughput gauge
sglang_gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 86.50814177726902
sglang_gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 86.50814177726902
# HELP sglang_prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang_prompt_tokens_total counter
sglang_prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 8128902
sglang_prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 8128902"###
;

    println!("result=\n{result}");

    assert_eq_sorted(result.trim(), expected.trim());
}
fn assert_eq_sorted(result: &str, expected: &str) {
    // Split into lines and sort to handle BTreeMap ordering issues between test environments
    let mut result_lines: Vec<_> = result.trim().lines().map(|l| l.trim()).collect();
    let mut expected_lines: Vec<_> = expected.trim().lines().map(|l| l.trim()).collect();
    result_lines.sort();
    expected_lines.sort();
    assert_eq!(result_lines, expected_lines);
}