fix(metrics): skip output token histogram for zero-output requests (#8085)

Signed-off-by: tmontfort <tmontfort@nvidia.com>
Signed-off-by: Thomas Montfort <tmontfort@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix(metrics): skip output token histogram for zero-output requests (#8085)
Signed-off-by: tmontfort <tmontfort@nvidia.com>
Signed-off-by: Thomas Montfort <tmontfort@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
60acaa7b · Thomas Montfort · GitHub · 44cfbb68 · 60acaa7b · 60acaa7b
Unverified Commit 60acaa7b authored Apr 15, 2026 by Thomas Montfort Committed by GitHub Apr 15, 2026
3 changed files
--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -1451,11 +1451,17 @@ impl Drop for ResponseMetricCollector {
                .observe(avg_detokenize_latency_ms);
        }

-        // Publish final OSL when the collector is dropped
-        self.metrics
-            .output_sequence_length
-            .with_label_values(&[&self.model])
-            .observe(self.osl as f64);
+        // Publish final OSL when the collector is dropped, but only for
+        // requests that actually produced output tokens. Recording zero for
+        // failed/cancelled requests would corrupt histogram averages (sum/count)
+        // and percentiles. Failures are already tracked by requests_total with
+        // status and error_type labels.
+        if self.osl > 0 {
+            self.metrics
+                .output_sequence_length
+                .with_label_values(&[&self.model])
+                .observe(self.osl as f64);
+        }

        // Record request summary on the enclosing span.
        // InflightGuard::Drop and on_response logs will inherit these.

--- a/lib/llm/src/kv_router/push_router.rs
+++ b/lib/llm/src/kv_router/push_router.rs
@@ -237,9 +237,15 @@ impl RequestGuard {
                    .observe(latency);
            }
        }
-        self.request_metrics
-            .output_sequence_tokens
-            .observe(self.cumulative_osl as f64);
+        // Only record output sequence length for requests that actually
+        // produced output tokens. Recording zero for failed/cancelled requests
+        // would corrupt histogram averages (sum/count) and percentiles.
+        // Failures are already tracked by requests_total.
+        if self.cumulative_osl > 0 {
+            self.request_metrics
+                .output_sequence_tokens
+                .observe(self.cumulative_osl as f64);
+        }
        self.request_metrics.requests_total.inc();
    }
 }

--- a/lib/llm/tests/http_metrics.rs
+++ b/lib/llm/tests/http_metrics.rs
@@ -6,6 +6,7 @@ use async_stream::stream;
 use dynamo_llm::{
    http::service::{metrics::Endpoint, service_v2::HttpService},
    model_card::ModelDeploymentCard,
+    preprocessor::LLMMetricAnnotation,
    protocols::{
        Annotated,
        openai::chat_completions::{
@@ -50,10 +51,32 @@ impl
            // Simulate some processing time
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;

-            // Generate 5 response chunks
+            // Generate 5 response chunks with LLMMetricAnnotation so that
+            // output_sequence_tokens is properly recorded (the histogram only
+            // records when osl > 0, which requires the annotation to be present).
            for i in 0..5 {
                let output = generator.create_choice(i, Some(format!("Mock response {i}")), None, None, None);
-                yield Annotated::from_data(output);
+                let mut annotated = Annotated::from_data(output);
+                let metrics = LLMMetricAnnotation {
+                    input_tokens: 5,
+                    output_tokens: (i + 1) as usize,
+                    chunk_tokens: 1,
+                    cached_tokens: None,
+                    prefill_worker_id: None,
+                    prefill_dp_rank: None,
+                    prefill_worker_type: None,
+                    decode_worker_id: None,
+                    decode_dp_rank: None,
+                    decode_worker_type: None,
+                    tokenize_latency: None,
+                    detokenize_total_latency: None,
+                    detokenize_count: None,
+                };
+                if let Ok(ann) = metrics.to_annotation::<NvCreateChatCompletionStreamResponse>() {
+                    annotated.event = ann.event;
+                    annotated.comment = ann.comment;
+                }
+                yield annotated;
            }
        };