Unverified Commit 60acaa7b authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

fix(metrics): skip output token histogram for zero-output requests (#8085)


Signed-off-by: default avatartmontfort <tmontfort@nvidia.com>
Signed-off-by: default avatarThomas Montfort <tmontfort@nvidia.com>
Co-authored-by: default avatarClaude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 44cfbb68
......@@ -1451,11 +1451,17 @@ impl Drop for ResponseMetricCollector {
.observe(avg_detokenize_latency_ms);
}
// Publish final OSL when the collector is dropped
self.metrics
.output_sequence_length
.with_label_values(&[&self.model])
.observe(self.osl as f64);
// Publish final OSL when the collector is dropped, but only for
// requests that actually produced output tokens. Recording zero for
// failed/cancelled requests would corrupt histogram averages (sum/count)
// and percentiles. Failures are already tracked by requests_total with
// status and error_type labels.
if self.osl > 0 {
self.metrics
.output_sequence_length
.with_label_values(&[&self.model])
.observe(self.osl as f64);
}
// Record request summary on the enclosing span.
// InflightGuard::Drop and on_response logs will inherit these.
......
......@@ -237,9 +237,15 @@ impl RequestGuard {
.observe(latency);
}
}
self.request_metrics
.output_sequence_tokens
.observe(self.cumulative_osl as f64);
// Only record output sequence length for requests that actually
// produced output tokens. Recording zero for failed/cancelled requests
// would corrupt histogram averages (sum/count) and percentiles.
// Failures are already tracked by requests_total.
if self.cumulative_osl > 0 {
self.request_metrics
.output_sequence_tokens
.observe(self.cumulative_osl as f64);
}
self.request_metrics.requests_total.inc();
}
}
......
......@@ -6,6 +6,7 @@ use async_stream::stream;
use dynamo_llm::{
http::service::{metrics::Endpoint, service_v2::HttpService},
model_card::ModelDeploymentCard,
preprocessor::LLMMetricAnnotation,
protocols::{
Annotated,
openai::chat_completions::{
......@@ -50,10 +51,32 @@ impl
// Simulate some processing time
tokio::time::sleep(std::time::Duration::from_millis(10)).await;
// Generate 5 response chunks
// Generate 5 response chunks with LLMMetricAnnotation so that
// output_sequence_tokens is properly recorded (the histogram only
// records when osl > 0, which requires the annotation to be present).
for i in 0..5 {
let output = generator.create_choice(i, Some(format!("Mock response {i}")), None, None, None);
yield Annotated::from_data(output);
let mut annotated = Annotated::from_data(output);
let metrics = LLMMetricAnnotation {
input_tokens: 5,
output_tokens: (i + 1) as usize,
chunk_tokens: 1,
cached_tokens: None,
prefill_worker_id: None,
prefill_dp_rank: None,
prefill_worker_type: None,
decode_worker_id: None,
decode_dp_rank: None,
decode_worker_type: None,
tokenize_latency: None,
detokenize_total_latency: None,
detokenize_count: None,
};
if let Ok(ann) = metrics.to_annotation::<NvCreateChatCompletionStreamResponse>() {
annotated.event = ann.event;
annotated.comment = ann.comment;
}
yield annotated;
}
};
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment