Unverified Commit d84790db authored by fzyzcjy, committed by GitHub

Support aggregating engine metrics in sgl-router (#11456)
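This adds an `/engine_metrics` endpoint to the router: it scrapes `/metrics` from every registered worker, tags each sample with a `worker_addr` label, merges the expositions, and returns the aggregated Prometheus text, so `GET /engine_metrics` on the router yields the combined output of all workers.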

parent 0678beaa
...@@ -84,6 +84,7 @@ subtle = "2.6"
rustpython-parser = "0.4.0"
num-traits = "0.2"
openai-harmony = { git = "https://github.com/openai/harmony", tag = "v0.0.4" }
openmetrics-parser = "0.4.4"
# gRPC and Protobuf dependencies
tonic = { version = "0.14.2", features = ["gzip", "transport"] }
...
use anyhow::ensure;
use openmetrics_parser::{MetricFamily, MetricsExposition, PrometheusType, PrometheusValue};
use tracing::warn;
#[derive(Debug)]
pub struct MetricPack {
pub labels: Vec<(String, String)>,
pub metrics_text: String,
}
type PrometheusExposition = MetricsExposition<PrometheusType, PrometheusValue>;
type PrometheusFamily = MetricFamily<PrometheusType, PrometheusValue>;
/// Aggregate Prometheus metrics scraped from multiple sources into a unified one
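///
/// A minimal usage sketch (illustrative; `scraped_text` stands for a worker's `/metrics` response body):
///
/// ```ignore
/// let packs = vec![MetricPack {
///     labels: vec![("worker_addr".into(), "http://127.0.0.1:30000".into())],
///     metrics_text: scraped_text,
/// }];
/// let merged = aggregate_metrics(packs)?;
/// ```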
pub fn aggregate_metrics(metric_packs: Vec<MetricPack>) -> anyhow::Result<String> {
let mut expositions = vec![];
for metric_pack in metric_packs {
let metrics_text = &metric_pack.metrics_text;
// Hacky workaround since the parser does not understand `:`; should improve later
let metrics_text = metrics_text.replace(":", "_");
let exposition = match openmetrics_parser::prometheus::parse_prometheus(&metrics_text) {
Ok(x) => x,
Err(err) => {
warn!(
"aggregate_metrics error when parsing text: pack={:?} err={:?}",
metric_pack, err
);
continue;
}
};
let exposition = transform_metrics(exposition, &metric_pack.labels);
expositions.push(exposition);
}
let text = try_reduce(expositions.into_iter(), merge_exposition)?
.map(|x| format!("{x}"))
.unwrap_or_default();
Ok(text)
}
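/// Attach `extra_labels` (e.g. the per-worker labels) to every metric family in the exposition.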
fn transform_metrics(
mut exposition: PrometheusExposition,
extra_labels: &[(String, String)],
) -> PrometheusExposition {
for family in exposition.families.values_mut() {
*family = family.with_labels(extra_labels.iter().map(|(k, v)| (k.as_str(), v.as_str())));
}
exposition
}
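/// Merge the metric families of `b` into `a`; families with the same name are combined via `merge_family`.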
fn merge_exposition(
a: PrometheusExposition,
b: PrometheusExposition,
) -> anyhow::Result<PrometheusExposition> {
let mut ans = a;
for (name, family_b) in b.families.into_iter() {
let family_merged = if let Some(family_a) = ans.families.remove(&name) {
merge_family(family_a, family_b)?
} else {
family_b
};
ans.families.insert(name, family_merged);
}
Ok(ans)
}
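/// Merge two families that must share the same label names by appending `b`'s samples to `a`.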
fn merge_family(a: PrometheusFamily, b: PrometheusFamily) -> anyhow::Result<PrometheusFamily> {
ensure!(
a.get_label_names() == b.get_label_names(),
"Label names should agree a={:?} b={:?}",
a.get_label_names(),
b.get_label_names()
);
a.with_samples(b.into_iter_samples())
.map_err(|e| anyhow::anyhow!("failed to merge samples: {e:?}"))
}
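/// Reduce an iterator with a fallible combining function; returns `Ok(None)` for an empty iterator.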
pub fn try_reduce<I, T, E, F>(iterable: I, f: F) -> Result<Option<T>, E>
where
I: IntoIterator<Item = T>,
F: FnMut(T, T) -> Result<T, E>,
{
let mut it = iterable.into_iter();
let first = match it.next() {
None => return Ok(None),
Some(x) => x,
};
Ok(Some(it.try_fold(first, f)?))
}
...@@ -11,6 +11,7 @@
pub mod circuit_breaker;
pub mod error;
pub mod job_queue;
pub mod metrics_aggregator;
pub mod retry;
pub mod token_bucket;
pub mod worker;
...
...@@ -5,7 +5,9 @@
use std::{collections::HashMap, sync::Arc, time::Duration};
use axum::response::{IntoResponse, Response};
use futures::future;
use http::{Method, StatusCode};
use serde_json::Value;
use tokio::{
sync::{watch, Mutex},
...@@ -14,7 +16,7 @@ use tokio::{
use tracing::{debug, error, info, warn};
use crate::{
core::{metrics_aggregator::MetricPack, ConnectionMode, WorkerRegistry, WorkerType},
policies::PolicyRegistry,
protocols::worker_spec::{FlushCacheResult, WorkerLoadInfo, WorkerLoadsResult},
};
...@@ -234,6 +236,90 @@ impl WorkerManager {
failed,
}
}
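/// Scrape `/metrics` from every registered worker, label each sample with `worker_addr`, and
/// return the aggregated Prometheus text; returns an error response if no workers are available
/// or aggregation fails.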
pub async fn get_engine_metrics(
worker_registry: &WorkerRegistry,
client: &reqwest::Client,
) -> Response {
let engine_responses =
match Self::fan_out_simple_request(worker_registry, client, "metrics", Method::GET)
.await
{
Ok(x) => x,
Err(e) => return e,
};
let engine_responses = engine_responses
.into_iter()
.map(|(worker_base_url, metrics_text)| MetricPack {
labels: vec![("worker_addr".into(), worker_base_url)],
metrics_text,
})
.collect();
let text = match crate::core::metrics_aggregator::aggregate_metrics(engine_responses) {
Ok(x) => x,
Err(e) => {
let error_msg = format!("Failed to aggregate metrics: {}", e);
return (StatusCode::INTERNAL_SERVER_ERROR, error_msg).into_response();
}
};
(StatusCode::OK, text).into_response()
}
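/// Send `method` requests to `{worker_url}/{endpoint}` on every registered worker and collect
/// `(worker_url, body)` pairs for successful responses; request failures are logged and non-2xx
/// bodies are skipped. Returns an error response if there are no workers or the method is unsupported.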
async fn fan_out_simple_request(
worker_registry: &WorkerRegistry,
client: &reqwest::Client,
endpoint: &str,
method: Method,
) -> Result<Vec<(String, String)>, Response> {
let workers = worker_registry.get_all();
if workers.is_empty() {
return Err((StatusCode::SERVICE_UNAVAILABLE, "No available workers").into_response());
}
let mut responses = vec![];
// Requests are sent sequentially for now; this could be parallelized later
for worker in workers {
let worker_url = worker.url().to_string();
let url = format!("{}/{}", worker_url, endpoint);
let mut request_builder = match method {
Method::GET => client.get(url),
Method::POST => client.post(url),
_ => {
return Err((
StatusCode::METHOD_NOT_ALLOWED,
"Unsupported method for simple routing",
)
.into_response())
}
};
if let Some(api_key) = worker.api_key() {
request_builder =
request_builder.header("Authorization", format!("Bearer {}", api_key));
}
match request_builder.send().await {
Ok(res) => {
let status = StatusCode::from_u16(res.status().as_u16())
.unwrap_or(StatusCode::INTERNAL_SERVER_ERROR);
match res.text().await {
Ok(body_text) => {
if status.is_success() {
responses.push((worker_url, body_text));
}
}
Err(e) => {
warn!("fan_out_simple_request failed when reading text: {}", e)
}
}
}
Err(e) => warn!("fan_out_simple_request failed when sending: {}", e),
}
}
Ok(responses)
}
}
/// Load monitoring service that periodically fetches worker loads
...
...@@ -115,6 +115,10 @@ async fn health_generate(State(state): State<Arc<AppState>>, req: Request) -> Re
state.router.health_generate(req).await
}
async fn engine_metrics(State(state): State<Arc<AppState>>) -> Response {
WorkerManager::get_engine_metrics(&state.context.worker_registry, &state.context.client).await
}
async fn get_server_info(State(state): State<Arc<AppState>>, req: Request) -> Response {
state.router.get_server_info(req).await
}
...@@ -641,6 +645,7 @@ pub fn build_app(
.route("/readiness", get(readiness))
.route("/health", get(health))
.route("/health_generate", get(health_generate))
.route("/engine_metrics", get(engine_metrics))
.route("/v1/models", get(v1_models))
.route("/get_model_info", get(get_model_info))
.route("/get_server_info", get(get_server_info));
...
use sglang_router_rs::core::metrics_aggregator::{aggregate_metrics, MetricPack};
#[test]
fn test_aggregate_simple() {
let pack1 = MetricPack {
labels: vec![("source".to_string(), "worker1".to_string())],
metrics_text: r#"
# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{method="post",code="200"} 1027
http_requests_total{method="post",code="400"} 3
"#
.to_string(),
};
let pack2 = MetricPack {
labels: vec![("source".to_string(), "worker2".to_string())],
metrics_text: r#"
# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{method="post",code="200"} 500
"#
.to_string(),
};
let result = aggregate_metrics(vec![pack1, pack2]).unwrap();
let expected = r#"# HELP http_requests_total The total number of HTTP requests.
# TYPE http_requests_total counter
http_requests_total{code="200",method="post",source="worker1"} 1027
http_requests_total{code="400",method="post",source="worker1"} 3
http_requests_total{code="200",method="post",source="worker2"} 500
"#;
assert_eq!(result.trim(), expected.trim());
}
#[test]
fn test_aggregate_multiple_metrics() {
let pack1 = MetricPack {
labels: vec![("source".to_string(), "w1".to_string())],
metrics_text: r#"
# TYPE metric_a gauge
metric_a{dim="x"} 1.0
# TYPE metric_b_total counter
metric_b_total 10
"#
.to_string(),
};
let pack2 = MetricPack {
labels: vec![("source".to_string(), "w2".to_string())],
metrics_text: r#"
# TYPE metric_a gauge
metric_a{dim="y"} 2.0
"#
.to_string(),
};
let result = aggregate_metrics(vec![pack1, pack2]).unwrap();
let expected = r#"# TYPE metric_a gauge
metric_a{dim="x",source="w1"} 1
metric_a{dim="y",source="w2"} 2
# TYPE metric_b_total counter
metric_b_total{source="w1"} 10
"#;
assert_eq_sorted(&result, expected);
}
#[test]
fn test_empty_input() {
let result = aggregate_metrics(vec![]).unwrap();
assert_eq!(result, "");
}
#[test]
fn test_invalid_metrics_are_skipped() {
let pack1 = MetricPack {
labels: vec![("source".to_string(), "worker1".to_string())],
metrics_text: "invalid metrics text".to_string(),
};
let pack2 = MetricPack {
labels: vec![("source".to_string(), "worker2".to_string())],
metrics_text: "# TYPE valid_metric gauge\nvalid_metric 123\n".to_string(),
};
let result = aggregate_metrics(vec![pack1, pack2]).unwrap();
let expected = r#"# TYPE valid_metric gauge
valid_metric{source="worker2"} 123
"#;
assert_eq!(result.trim(), expected.trim());
}
#[test]
fn test_real() {
let pack1 = MetricPack {
labels: vec![("source".to_string(), "worker1".to_string())],
// https://docs.sglang.ai/references/production_metrics.html
metrics_text: r###"# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
# HELP sglang:token_usage The token usage
# TYPE sglang:token_usage gauge
sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang:time_to_first_token_seconds histogram
sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang:e2e_request_latency_seconds histogram
sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang:time_per_output_token_seconds histogram
sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
# HELP sglang:func_latency_seconds Function latency in seconds
# TYPE sglang:func_latency_seconds histogram
sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
sglang:func_latency_seconds_count{name="generate_request"} 14007.0
# HELP sglang:num_running_reqs The number of running requests
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
# HELP sglang:num_used_tokens The number of used tokens
# TYPE sglang:num_used_tokens gauge
sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
# HELP sglang:gen_throughput The generate throughput (token/s)
# TYPE sglang:gen_throughput gauge
sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
# HELP sglang:num_queue_reqs The number of requests in the waiting queue
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
"###.to_string(),
};
let pack2 = MetricPack {
labels: vec![("source".to_string(), "worker2".to_string())],
metrics_text: pack1.metrics_text.clone(),
};
let result = aggregate_metrics(vec![pack1, pack2]).unwrap();
let expected = r###"# HELP sglang_token_usage The token usage
# TYPE sglang_token_usage gauge
sglang_token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 0.28
sglang_token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 0.28
# HELP sglang_time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang_time_to_first_token_seconds histogram
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.001"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.005"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.01"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 11008
sglang_time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 2351897.9474117756
sglang_time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 11008
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.001"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.005"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.01"} 0
sglang_time_to_first_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 11008
sglang_time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 2351897.9474117756
sglang_time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 11008
# HELP sglang_time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang_time_per_output_token_seconds histogram
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.005"} 1
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.01"} 73
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.015"} 382
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 7400757
sglang_time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 866964.5791549598
sglang_time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 7400757
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.005"} 1
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.01"} 73
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.015"} 382
sglang_time_per_output_token_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 7400757
sglang_time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 866964.5791549598
sglang_time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 7400757
# HELP sglang_func_latency_seconds Function latency in seconds
# TYPE sglang_func_latency_seconds histogram
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.05"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.07500000000000001"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.1125"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="0.16875"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker1",le="+Inf"} 14007
sglang_func_latency_seconds_sum{name="generate_request",source="worker1"} 4.514771912145079
sglang_func_latency_seconds_count{name="generate_request",source="worker1"} 14007
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.05"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.07500000000000001"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.1125"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="0.16875"} 14006
sglang_func_latency_seconds_bucket{name="generate_request",source="worker2",le="+Inf"} 14007
sglang_func_latency_seconds_sum{name="generate_request",source="worker2"} 4.514771912145079
sglang_func_latency_seconds_count{name="generate_request",source="worker2"} 14007
# HELP sglang_num_used_tokens The number of used tokens
# TYPE sglang_num_used_tokens gauge
sglang_num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 123859
sglang_num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 123859
# HELP sglang_cache_hit_rate The cache hit rate
# TYPE sglang_cache_hit_rate gauge
sglang_cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 0.007507552643049313
sglang_cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 0.007507552643049313
# HELP sglang_num_queue_reqs The number of requests in the waiting queue
# TYPE sglang_num_queue_reqs gauge
sglang_num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 2826
sglang_num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 2826
# HELP sglang_generation_tokens_total Number of generation tokens processed.
# TYPE sglang_generation_tokens_total counter
sglang_generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 7557572
sglang_generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 7557572
# HELP sglang_num_running_reqs The number of running requests
# TYPE sglang_num_running_reqs gauge
sglang_num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 162
sglang_num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 162
# HELP sglang_e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang_e2e_request_latency_seconds histogram
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.3"} 0
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.5"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="0.8"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1",le="+Inf"} 11228
sglang_e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 3116093.850019932
sglang_e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 11228
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.3"} 0
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.5"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="0.8"} 6
sglang_e2e_request_latency_seconds_bucket{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2",le="+Inf"} 11228
sglang_e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 3116093.850019932
sglang_e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 11228
# HELP sglang_gen_throughput The generate throughput (token/s)
# TYPE sglang_gen_throughput gauge
sglang_gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 86.50814177726902
sglang_gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 86.50814177726902
# HELP sglang_prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang_prompt_tokens_total counter
sglang_prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker1"} 8128902
sglang_prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct",source="worker2"} 8128902"###;
println!("result=\n{result}");
assert_eq_sorted(result.trim(), expected.trim());
}
fn assert_eq_sorted(result: &str, expected: &str) {
// Split into lines and sort to handle BTreeMap ordering issues between test environments
let mut result_lines: Vec<_> = result.trim().lines().map(|l| l.trim()).collect();
let mut expected_lines: Vec<_> = expected.trim().lines().map(|l| l.trim()).collect();
result_lines.sort();
expected_lines.sort();
assert_eq!(result_lines, expected_lines);
}