refactor: change TensorRT-LLM metrics prefix from trtllm: to trtllm_ (#4269)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

refactor: change TensorRT-LLM metrics prefix from trtllm: to trtllm_ (#4269)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
381c428c · Keiven C · GitHub · 4adab52d · 381c428c · 381c428c
Unverified Commit 381c428c authored Nov 13, 2025 by Keiven C Committed by GitHub Nov 13, 2025
8 changed files
--- a/components/src/dynamo/common/utils/prometheus.py
+++ b/components/src/dynamo/common/utils/prometheus.py
@@ -45,7 +45,7 @@ def register_engine_metrics_callback(
        registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
        metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
-        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
+        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")

    Example:
        from prometheus_client import REGISTRY
@@ -57,7 +57,7 @@ def register_engine_metrics_callback(
        register_engine_metrics_callback(
            generate_endpoint, REGISTRY,
            exclude_prefixes=["python_", "process_"],
-            add_prefix="trtllm:"
+            add_prefix="trtllm_"
        )
    """

@@ -116,14 +116,14 @@ def get_prometheus_expfmt(
        metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
                             If None, returns all metrics. (default: None)
        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
-        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
+        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")

    Returns:
        Formatted metrics text in Prometheus exposition format. Returns empty string on error.

    Example:
-        # Filter out python_/process_ metrics and add trtllm: prefix
-        get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm:")
+        # Filter out python_/process_ metrics and add trtllm_ prefix
+        get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
    """
    try:
        # Generate metrics in Prometheus text format
@@ -165,20 +165,39 @@ def get_prometheus_expfmt(
                        if match:
                            comment_type, metric_name, rest = match.groups()
                            # Remove existing prefix if present
-                            if metric_prefix_filter and metric_name.startswith(
+                            if metric_prefix_filter:
+                                metric_name = metric_name.removeprefix(
                                    metric_prefix_filter
-                            ):
-                                metric_name = metric_name[len(metric_prefix_filter) :]
-                            new_metric_name = add_prefix + metric_name
-                            line = f"# {comment_type} {new_metric_name}{rest}"
+                                )
+                            # Only add prefix if it doesn't already exist
+                            if not metric_name.startswith(add_prefix):
+                                metric_name = add_prefix + metric_name
+                            line = f"# {comment_type} {metric_name}{rest}"
                    # Handle metric lines
                    elif line and not line.startswith("#"):
+                        # Extract metric name (first token)
+                        parts = line.split(None, 1)
+                        if parts:
+                            metric_name_part = parts[0]
+                            rest_of_line = parts[1] if len(parts) > 1 else ""
+
                            # Remove existing prefix if present
-                        if metric_prefix_filter and line.startswith(
+                            if metric_prefix_filter:
+                                metric_name_part = metric_name_part.removeprefix(
                                    metric_prefix_filter
-                        ):
-                            line = line[len(metric_prefix_filter) :]
-                        line = add_prefix + line
+                                )
+
+                            # Only add prefix if it doesn't already exist
+                            if not metric_name_part.startswith(add_prefix):
+                                metric_name_part = add_prefix + metric_name_part
+
+                            # Reconstruct line
+                            line = metric_name_part + (
+                                " " + rest_of_line if rest_of_line else ""
+                            )
+                        else:
+                            # Empty line or just whitespace, skip prefix addition
+                            pass

                lines.append(line)


--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -332,12 +332,12 @@ async def init(runtime: DistributedRuntime, config: Config):
                logging.info("TensorRT-LLM MetricsCollector initialized")

                # Register callback to expose TRT-LLM metrics via Dynamo endpoint
-                # Filter out python_/process_ metrics and add trtllm: prefix to remaining metrics
+                # Filter out python_/process_ metrics and add trtllm_ prefix to remaining metrics
                register_engine_metrics_callback(
                    endpoint=endpoint,
                    registry=REGISTRY,
                    exclude_prefixes=["python_", "process_"],
-                    add_prefix="trtllm:",
+                    add_prefix="trtllm_",
                )
                logging.info("TensorRT-LLM Prometheus metrics registered")
            except Exception as e:

--- a/components/src/dynamo/trtllm/tests/test_trtllm_main_init.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_main_init.py
@@ -67,7 +67,7 @@ def test_prometheus_metrics_integration():
            endpoint=mock_endpoint,
            registry=REGISTRY,
            exclude_prefixes=["python_", "process_"],
-            add_prefix="trtllm:",
+            add_prefix="trtllm_",
        )

        print("✅ Prometheus metrics integration test passed")

--- a/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
@@ -53,29 +53,29 @@ tokens_per_second 245.7
        dynamo.common.utils.prometheus.generate_latest = original_generate_latest

    def test_trtllm_use_case(self, trtllm_registry):
-        """Test TensorRT-LLM use case: exclude python_/process_ and add trtllm: prefix."""
+        """Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
        result = get_prometheus_expfmt(
            trtllm_registry,
            exclude_prefixes=["python_", "process_"],
-            add_prefix="trtllm:",
+            add_prefix="trtllm_",
        )

        # Should not contain excluded metrics
        assert "python_gc_objects_collected_total" not in result
        assert "process_cpu_seconds_total" not in result

-        # All remaining metrics should have trtllm: prefix
-        assert "trtllm:request_latency_seconds" in result
-        assert "trtllm:num_requests_running" in result
-        assert "trtllm:tokens_per_second" in result
+        # All remaining metrics should have trtllm_ prefix
+        assert "trtllm_request_latency_seconds" in result
+        assert "trtllm_num_requests_running" in result
+        assert "trtllm_tokens_per_second" in result

        # HELP/TYPE comments should have prefix
-        assert "# HELP trtllm:request_latency_seconds" in result
-        assert "# TYPE trtllm:num_requests_running" in result
+        assert "# HELP trtllm_request_latency_seconds" in result
+        assert "# TYPE trtllm_num_requests_running" in result

        # Check specific content and structure preservation
-        assert 'trtllm:request_latency_seconds_bucket{le="0.1"} 10.0' in result
-        assert "trtllm:tokens_per_second 245.7" in result
+        assert 'trtllm_request_latency_seconds_bucket{le="0.1"} 10.0' in result
+        assert "trtllm_tokens_per_second 245.7" in result
        assert result.endswith("\n")

    def test_no_filtering_all_frameworks(self, trtllm_registry):
@@ -99,6 +99,42 @@ tokens_per_second 245.7
        # Should return either a lone newline or an empty string
        assert result == "\n" or result == ""

+    def test_prefix_already_exists(self):
+        """Test that prefix is not added if it already exists."""
+        registry = Mock()
+
+        # Metrics that already have trtllm_ prefix
+        sample_metrics = """# HELP trtllm_request_success_total Count of successfully processed requests
+# TYPE trtllm_request_success_total counter
+trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
+# HELP trtllm_time_to_first_token_seconds Time to first token
+# TYPE trtllm_time_to_first_token_seconds histogram
+trtllm_time_to_first_token_seconds_count 5.0
+"""
+
+        def mock_generate_latest(reg):
+            return sample_metrics.encode("utf-8")
+
+        import dynamo.common.utils.prometheus
+
+        original_generate_latest = dynamo.common.utils.prometheus.generate_latest
+        dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
+
+        try:
+            result = get_prometheus_expfmt(
+                registry,
+                exclude_prefixes=["python_", "process_"],
+                add_prefix="trtllm_",
+            )
+
+            # Should not double-add prefix
+            assert "trtllm_trtllm_request_success_total" not in result
+            assert "trtllm_request_success_total" in result
+            assert "trtllm_time_to_first_token_seconds" in result
+            assert result.endswith("\n")
+        finally:
+            dynamo.common.utils.prometheus.generate_latest = original_generate_latest
+
    def test_error_handling(self):
        """Test error handling when registry fails."""
        # Create a registry that raises an exception

--- a/docs/backends/trtllm/prometheus.md
+++ b/docs/backends/trtllm/prometheus.md
@@ -7,12 +7,12 @@ SPDX-License-Identifier: Apache-2.0

 ## Overview

-When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
-
-As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm:` prefix is added by Dynamo.
+When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm_`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.

 Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below).

+As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm_` prefix is added by Dynamo.
+
 **For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).

 **For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
@@ -58,42 +58,54 @@ curl -H 'Content-Type: application/json' \
 http://localhost:8000/v1/chat/completions

 # Check metrics from the worker
-curl -s localhost:8081/metrics | grep "^trtllm:"
+curl -s localhost:8081/metrics | grep "^trtllm_"
 ```

 ## Exposed Metrics

-TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm:` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.
+TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm_` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.

 **Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention.

 **Example Prometheus Exposition Format text:**

 ```
-# HELP trtllm:request_success_total Count of successfully processed requests.
-# TYPE trtllm:request_success_total counter
-trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
-trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
-
-# HELP trtllm:time_to_first_token_seconds Histogram of time to first token in seconds.
-# TYPE trtllm:time_to_first_token_seconds histogram
-trtllm:time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
-trtllm:time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
-trtllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
-trtllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
-
-# HELP trtllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
-# TYPE trtllm:e2e_request_latency_seconds histogram
-trtllm:e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
-trtllm:e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
-trtllm:e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
+# HELP trtllm_request_success_total Count of successfully processed requests.
+# TYPE trtllm_request_success_total counter
+trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
+trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
+
+# HELP trtllm_time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE trtllm_time_to_first_token_seconds histogram
+trtllm_time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
+trtllm_time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
+trtllm_time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
+
+# HELP trtllm_e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE trtllm_e2e_request_latency_seconds histogram
+trtllm_e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
+trtllm_e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
+
+# HELP trtllm_time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE trtllm_time_per_output_token_seconds histogram
+trtllm_time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
+trtllm_time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
+
+# HELP trtllm_request_queue_time_seconds Histogram of time spent in WAITING phase for request.
+# TYPE trtllm_request_queue_time_seconds histogram
+trtllm_request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
+trtllm_request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm_request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
 ```

 **Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.

 ### Metric Categories

-TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm:`):
+TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm_`):

 - **Request metrics** - Request success tracking and latency measurements
 - **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time
@@ -102,17 +114,17 @@ TensorRT-LLM provides metrics in the following categories (all prefixed with `tr

 ## Available Metrics

-The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm:` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:
+The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm_` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:

- `trtllm:request_success_total` (Counter) — Count of successfully processed requests by finish reason
+- `trtllm_request_success_total` (Counter) — Count of successfully processed requests by finish reason
  - Labels: `model_name`, `engine_type`, `finished_reason`
- `trtllm:e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
+- `trtllm_e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
  - Labels: `model_name`, `engine_type`
- `trtllm:time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
+- `trtllm_time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
  - Labels: `model_name`, `engine_type`
- `trtllm:time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
+- `trtllm_time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
  - Labels: `model_name`, `engine_type`
- `trtllm:request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
+- `trtllm_request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
  - Labels: `model_name`, `engine_type`

 These metric names and availability are subject to change with TensorRT-LLM version updates.
@@ -161,7 +173,7 @@ TensorRT-LLM provides extensive performance data beyond the basic Prometheus met
 ## Implementation Details

 - **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm:"`
+- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm_"`
 - **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled
 - **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
 - **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)

--- a/docs/observability/metrics.md
+++ b/docs/observability/metrics.md
@@ -84,7 +84,7 @@ Dynamo exposes several categories of metrics:
 - **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements
 - **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime
 - **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics
- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`)
+- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm_*`)

 ## Runtime Hierarchy


--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -12,15 +12,15 @@ from tests.utils.payloads import (
 )

 # Common default text prompt used across tests
-TEXT_PROMPT = "Tell me a short joke about AI."
+TEXT_PROMPT = "Tell me a knock knock joke about AI."


 def chat_payload_default(
    repeat_count: int = 3,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
-    max_tokens: int = 150,
-    temperature: float = 0.1,
+    max_tokens: int = 1000,
+    temperature: float = 0.0,
    stream: bool = False,
 ) -> ChatPayload:
    return ChatPayload(
@@ -37,7 +37,9 @@ def chat_payload_default(
        },
        repeat_count=repeat_count,
        expected_log=expected_log or [],
-        expected_response=expected_response or ["AI"],
+        # Accept any of these keywords in the response (case-insensitive)
+        expected_response=expected_response
+        or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
    )


@@ -45,8 +47,8 @@ def completion_payload_default(
    repeat_count: int = 3,
    expected_response: Optional[List[str]] = None,
    expected_log: Optional[List[str]] = None,
-    max_tokens: int = 150,
-    temperature: float = 0.1,
+    max_tokens: int = 1000,
+    temperature: float = 0.0,
    stream: bool = False,
 ) -> CompletionPayload:
    return CompletionPayload(
@@ -58,7 +60,9 @@ def completion_payload_default(
        },
        repeat_count=repeat_count,
        expected_log=expected_log or [],
-        expected_response=expected_response or ["AI"],
+        # Accept any of these keywords in the response (case-insensitive)
+        expected_response=expected_response
+        or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
    )



--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -30,7 +30,7 @@ class BasePayload:
    """Generic payload body plus expectations and repeat count."""

    body: Dict[str, Any]
-    expected_response: List[str]
+    expected_response: List[Any]  # Can be List[str] or List[List[str]] for alternatives
    expected_log: List[str]
    repeat_count: int = 1
    timeout: int = 60
@@ -56,17 +56,40 @@ class BasePayload:
        raise NotImplementedError("Subclasses must implement response_handler()")

    def validate(self, response: Any, content: str) -> None:
-        """Default validation: ensure expected substrings appear in content."""
+        """Default validation: ensure expected substrings appear in content.
+
+        If expected_response is a list of strings, ANY one of them matching is sufficient (OR logic).
+        This allows flexible validation where responses may vary but should contain at least one keyword.
+        """
        if self.expected_response:
-            missing_expected = []
+            # Check if content is empty
+            if not content:
+                logger.error("VALIDATION FAILED - Response content is empty")
+                raise AssertionError(
+                    f"Expected content not found in response. Expected any of: {self.expected_response}. Actual content is empty."
+                )
+
+            # Check if ANY of the expected strings are found (OR logic) and count matches
+            found_keywords = []
            for expected in self.expected_response:
-                if not content or expected not in content:
-                    missing_expected.append(expected)
-            if missing_expected:
+                if isinstance(expected, str) and expected.lower() in content.lower():
+                    found_keywords.append(expected)
+
+            if not found_keywords:
+                logger.error(
+                    f"VALIDATION FAILED - Actual content returned: {repr(content)}"
+                )
+                logger.error(
+                    f"Expected to find at least one of: {self.expected_response}"
+                )
+                logger.error(f"Matches found: 0/{len(self.expected_response)}")
                raise AssertionError(
-                    f"Expected content not found in response. Missing: {missing_expected}"
+                    f"Expected content not found in response. Expected at least one of: {self.expected_response}. Actual content: {repr(content)}"
+                )
+
+            logger.info(
+                f"SUCCESS: Found {len(found_keywords)}/{len(self.expected_response)} expected keywords: {found_keywords}"
            )
-        logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.")

    def process_response(self, response: Any) -> str:
        """Convenience: run response_handler then validate; return content."""
@@ -88,9 +111,14 @@ class ChatPayload(BasePayload):
        """
        response.raise_for_status()
        result = response.json()
-        assert "choices" in result, "Missing 'choices' in response"
+
+        assert (
+            "choices" in result
+        ), f"Missing 'choices' in response. Response keys: {list(result.keys())}"
        assert len(result["choices"]) > 0, "Empty choices in response"
-        assert "message" in result["choices"][0], "Missing 'message' in first choice"
+        assert (
+            "message" in result["choices"][0]
+        ), f"Missing 'message' in first choice. Choice keys: {list(result['choices'][0].keys())}"

        # Check for content in all possible fields where parsers might put output:
        # 1. content - standard message content
@@ -305,13 +333,13 @@ class MetricsPayload(BasePayload):
        elif backend == "trtllm":
            metrics_to_check.append(
                MetricCheck(
-                    # Check: Minimum count of unique trtllm:* metrics
-                    name="trtllm:*",
-                    pattern=lambda name: r"^trtllm:\w+",
+                    # Check: Minimum count of unique trtllm_* metrics
+                    name="trtllm_*",
+                    pattern=lambda name: r"^trtllm_\w+",
                    validator=lambda value: len(set(value))
                    >= 4,  # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
-                    error_msg=lambda name, value: f"Expected at least 4 unique trtllm:* metrics, but found only {len(set(value))}",
-                    success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm:* metrics (minimum required: 4)",
+                    error_msg=lambda name, value: f"Expected at least 4 unique trtllm_* metrics, but found only {len(set(value))}",
+                    success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm_* metrics (minimum required: 4)",
                    multiline=True,
                )
            )