Unverified Commit 60acaa7b authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

fix(metrics): skip output token histogram for zero-output requests (#8085)


Signed-off-by: default avatartmontfort <tmontfort@nvidia.com>
Signed-off-by: default avatarThomas Montfort <tmontfort@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 44cfbb68
...@@ -1451,11 +1451,17 @@ impl Drop for ResponseMetricCollector { ...@@ -1451,11 +1451,17 @@ impl Drop for ResponseMetricCollector {
.observe(avg_detokenize_latency_ms); .observe(avg_detokenize_latency_ms);
} }
// Publish final OSL when the collector is dropped // Publish final OSL when the collector is dropped, but only for
// requests that actually produced output tokens. Recording zero for
// failed/cancelled requests would corrupt histogram averages (sum/count)
// and percentiles. Failures are already tracked by requests_total with
// status and error_type labels.
if self.osl > 0 {
self.metrics self.metrics
.output_sequence_length .output_sequence_length
.with_label_values(&[&self.model]) .with_label_values(&[&self.model])
.observe(self.osl as f64); .observe(self.osl as f64);
}
// Record request summary on the enclosing span. // Record request summary on the enclosing span.
// InflightGuard::Drop and on_response logs will inherit these. // InflightGuard::Drop and on_response logs will inherit these.
......
...@@ -237,9 +237,15 @@ impl RequestGuard { ...@@ -237,9 +237,15 @@ impl RequestGuard {
.observe(latency); .observe(latency);
} }
} }
// Only record output sequence length for requests that actually
// produced output tokens. Recording zero for failed/cancelled requests
// would corrupt histogram averages (sum/count) and percentiles.
// Failures are already tracked by requests_total.
if self.cumulative_osl > 0 {
self.request_metrics self.request_metrics
.output_sequence_tokens .output_sequence_tokens
.observe(self.cumulative_osl as f64); .observe(self.cumulative_osl as f64);
}
self.request_metrics.requests_total.inc(); self.request_metrics.requests_total.inc();
} }
} }
......
...@@ -6,6 +6,7 @@ use async_stream::stream; ...@@ -6,6 +6,7 @@ use async_stream::stream;
use dynamo_llm::{ use dynamo_llm::{
http::service::{metrics::Endpoint, service_v2::HttpService}, http::service::{metrics::Endpoint, service_v2::HttpService},
model_card::ModelDeploymentCard, model_card::ModelDeploymentCard,
preprocessor::LLMMetricAnnotation,
protocols::{ protocols::{
Annotated, Annotated,
openai::chat_completions::{ openai::chat_completions::{
...@@ -50,10 +51,32 @@ impl ...@@ -50,10 +51,32 @@ impl
// Simulate some processing time // Simulate some processing time
tokio::time::sleep(std::time::Duration::from_millis(10)).await; tokio::time::sleep(std::time::Duration::from_millis(10)).await;
// Generate 5 response chunks // Generate 5 response chunks with LLMMetricAnnotation so that
// output_sequence_tokens is properly recorded (the histogram only
// records when osl > 0, which requires the annotation to be present).
for i in 0..5 { for i in 0..5 {
let output = generator.create_choice(i, Some(format!("Mock response {i}")), None, None, None); let output = generator.create_choice(i, Some(format!("Mock response {i}")), None, None, None);
yield Annotated::from_data(output); let mut annotated = Annotated::from_data(output);
let metrics = LLMMetricAnnotation {
input_tokens: 5,
output_tokens: (i + 1) as usize,
chunk_tokens: 1,
cached_tokens: None,
prefill_worker_id: None,
prefill_dp_rank: None,
prefill_worker_type: None,
decode_worker_id: None,
decode_dp_rank: None,
decode_worker_type: None,
tokenize_latency: None,
detokenize_total_latency: None,
detokenize_count: None,
};
if let Ok(ann) = metrics.to_annotation::<NvCreateChatCompletionStreamResponse>() {
annotated.event = ann.event;
annotated.comment = ann.comment;
}
yield annotated;
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment