test: add backend-specific metrics validation for E2E tests (part 1/3, vLLM only) (#3804)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

test: add backend-specific metrics validation for E2E tests (part 1/3, vLLM only) (#3804)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
bdef0ec5 · Keiven C · GitHub · f93b619a · bdef0ec5 · bdef0ec5
Unverified commit bdef0ec5, authored Oct 22, 2025 by Keiven C; committed by GitHub on Oct 22, 2025.
5 changed files
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -42,7 +42,11 @@ sglang_configs = {
        model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
        env={},
        models_port=8000,
-        request_payloads=[chat_payload_default(), completion_payload_default()],
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            # TODO: Add metric_payload_default(min_num_requests=N, backend="sglang")
+        ],
    ),
    "disaggregated": SGLangConfig(
        name="disaggregated",

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -35,6 +35,7 @@ trtllm_configs = {
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
+            # TODO: Add metric_payload_default(min_num_requests=N, backend="trtllm")
        ],
    ),
    "disaggregated": TRTLLMConfig(

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -39,7 +39,7 @@ vllm_configs = {
        request_payloads=[
            chat_payload_default(),
            completion_payload_default(),
-            metric_payload_default(min_num_requests=6),
+            metric_payload_default(min_num_requests=6, backend="vllm"),
        ],
    ),
    "agg-router": VLLMConfig(

--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -66,6 +66,7 @@ def metric_payload_default(
    min_num_requests: int,
    repeat_count: int = 1,
    expected_log: Optional[List[str]] = None,
+    backend: Optional[str] = None,
 ) -> MetricsPayload:
    return MetricsPayload(
        body={},
@@ -73,6 +74,7 @@ def metric_payload_default(
        expected_log=expected_log or [],
        expected_response=[],
        min_num_requests=min_num_requests,
+        backend=backend,
    )

--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -18,7 +18,7 @@ import re
 import time
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import Any, Dict, List
+from typing import Any, Callable, Dict, List, Optional
 from dynamo import prometheus_names
@@ -190,12 +190,27 @@ class EmbeddingPayload(BasePayload):
        return EmbeddingPayload.extract_embeddings(response)
+@dataclass
+class MetricCheck:
+    """Definition of a metric validation check"""
+    name: str
+    pattern: Callable[[str], str]
+    validator: Callable[[Any], bool]
+    error_msg: Callable[[str, Any], str]
+    success_msg: Callable[[str, Any], str]
+    multiline: bool = False
 @dataclass
 class MetricsPayload(BasePayload):
    endpoint: str = "/metrics"
    method: str = "GET"
    port: int = 8081
    min_num_requests: int = 1
+    backend: Optional[
+        str
+    ] = None  # Backend identifier for metrics validation (e.g., 'vllm', 'sglang', 'trtllm')
    def with_model(self, model):
        # Metrics does not use model in request body
@@ -206,26 +221,121 @@ class MetricsPayload(BasePayload):
        return response.text
    def validate(self, response: Any, content: str) -> None:
-        requests_total_name = prometheus_names.work_handler.REQUESTS_TOTAL
+        # Use backend from payload configuration
-        pattern = (
+        backend = self.backend
-            rf'{re.escape(requests_total_name)}\{{[^}}]*model="[^"]*"[^}}]*\}}\s+(\d+)'
+        # Filter out _bucket metrics from content (histogram buckets inflate counts)
+        content_lines = content.split("\n")
+        filtered_lines = [line for line in content_lines if "_bucket{" not in line]
+        content = "\n".join(filtered_lines)
+        # Build full metric names with prefix
+        prefix = prometheus_names.name_prefix.COMPONENT
+        # Define metrics to check
+        # Pattern matches: metric_name{labels} value OR metric_name value (labels optional)
+        # Examples:
+        #   - dynamo_component_requests_total{model="Qwen/Qwen3-0.6B"} 6
+        #   - dynamo_component_uptime_seconds 150.390999059
+        def metric_pattern(name):
+            return rf"{name}(?:\{{[^}}]*\}})?\s+([\d.]+)"
+        metrics_to_check = [
+            MetricCheck(
+                # Check: Minimum count of unique dynamo_component_* metrics
+                name=f"{prefix}_*",
+                pattern=lambda name: rf"^{prefix}_\w+",
+                validator=lambda value: len(set(value))
+                >= 23,  # 80% of typical ~29 metrics (excluding _bucket) as of 2025-10-22 (but will grow)
+                error_msg=lambda name, value: f"Expected at least 23 unique {prefix}_* metrics, but found only {len(set(value))}",
+                success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique {prefix}_* metrics (minimum required: 23)",
+                multiline=True,
+            ),
+            MetricCheck(
+                name=f"{prefix}_{prometheus_names.work_handler.REQUESTS_TOTAL}",
+                pattern=metric_pattern,
+                validator=lambda value: int(float(value)) >= self.min_num_requests,
+                error_msg=lambda name, value: f"{name} has count {value} which is less than required {self.min_num_requests}",
+                success_msg=lambda name, value: f"SUCCESS: Found {name} with count: {value}",
+            ),
+            MetricCheck(
+                name=f"{prefix}_{prometheus_names.distributed_runtime.UPTIME_SECONDS}",
+                pattern=metric_pattern,
+                validator=lambda value: float(value) > 0,
+                error_msg=lambda name, value: f"{name} should be > 0, but got {value}",
+                success_msg=lambda name, value: f"SUCCESS: Found {name} = {value}s",
+            ),
+            MetricCheck(
+                name=f"{prefix}_{prometheus_names.kvstats.TOTAL_BLOCKS}",
+                pattern=metric_pattern,
+                validator=lambda value: int(float(value)) > 0,
+                error_msg=lambda name, value: f"{name} should be > 0, but got {value}",
+                success_msg=lambda name, value: f"SUCCESS: Found {name} = {value}",
+            ),
+        ]
+        # Add backend-specific metric checks
+        if backend == "vllm":
+            metrics_to_check.append(
+                MetricCheck(
+                    # Check: Minimum count of unique vllm:* metrics
+                    name="vllm:*",
+                    pattern=lambda name: r"^vllm:\w+",
+                    validator=lambda value: len(set(value))
+                    >= 52,  # 80% of typical ~65 vllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
+                    error_msg=lambda name, value: f"Expected at least 52 unique vllm:* metrics, but found only {len(set(value))}",
+                    success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique vllm:* metrics (minimum required: 52)",
+                    multiline=True,
+                )
+            )
+        # TODO: Add sglang:* and trtllm:* metrics checks (similar to vllm above)
+        # Check all metrics
+        for metric in metrics_to_check:
+            # Special handling for multiline patterns (like counting unique metrics)
+            if metric.multiline:
+                pattern = metric.pattern(metric.name)
+                matches = re.findall(pattern, content, re.MULTILINE)
+                if not matches:
+                    raise AssertionError(
+                        f"Could not find any matches for pattern '{metric.name}'"
+                    )
+                # For multiline, pass the entire list to validator
+                if metric.validator(matches):
+                    logger.info(metric.success_msg(metric.name, matches))
+                else:
+                    raise AssertionError(metric.error_msg(metric.name, matches))
+            else:
+                # Standard single-value metric check
+                if metric.name not in content:
+                    raise AssertionError(
+                        f"Metric '{metric.name}' not found in metrics output"
                    )
+                pattern = metric.pattern(metric.name)
                matches = re.findall(pattern, content)
                if not matches:
                    raise AssertionError(
-                f"Metric '{requests_total_name}' with model label not found in metrics output"
+                        f"Could not parse value for metric '{metric.name}'"
                    )
+                # For metrics with multiple values (like requests_total with different labels),
+                # check if any match passes validation
+                validation_passed = False
+                last_value = None
                for match in matches:
-            request_count = int(match)
+                    last_value = match
-            if request_count >= self.min_num_requests:
+                    if metric.validator(match):
-                logger.info(
+                        logger.info(metric.success_msg(metric.name, match))
-                    f"SUCCESS: Found {requests_total_name} with count: {request_count}"
+                        validation_passed = True
-                )
+                        break
-                return
+                if not validation_passed:
                    raise AssertionError(
-            f"{requests_total_name} exists but has count {request_count} which is less than required {self.min_num_requests}"
+                        metric.error_msg(
+                            metric.name, last_value if last_value else "N/A"
+                        )
                    )