Unverified Commit 381c428c authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

refactor: change TensorRT-LLM metrics prefix from trtllm: to trtllm_ (#4269)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 4adab52d
......@@ -45,7 +45,7 @@ def register_engine_metrics_callback(
registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
Example:
from prometheus_client import REGISTRY
......@@ -57,7 +57,7 @@ def register_engine_metrics_callback(
register_engine_metrics_callback(
generate_endpoint, REGISTRY,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:"
add_prefix="trtllm_"
)
"""
......@@ -116,14 +116,14 @@ def get_prometheus_expfmt(
metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
If None, returns all metrics. (default: None)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
Returns:
Formatted metrics text in Prometheus exposition format. Returns empty string on error.
Example:
# Filter out python_/process_ metrics and add trtllm: prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm:")
# Filter out python_/process_ metrics and add trtllm_ prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
"""
try:
# Generate metrics in Prometheus text format
......@@ -165,20 +165,39 @@ def get_prometheus_expfmt(
if match:
comment_type, metric_name, rest = match.groups()
# Remove existing prefix if present
if metric_prefix_filter and metric_name.startswith(
if metric_prefix_filter:
metric_name = metric_name.removeprefix(
metric_prefix_filter
):
metric_name = metric_name[len(metric_prefix_filter) :]
new_metric_name = add_prefix + metric_name
line = f"# {comment_type} {new_metric_name}{rest}"
)
# Only add prefix if it doesn't already exist
if not metric_name.startswith(add_prefix):
metric_name = add_prefix + metric_name
line = f"# {comment_type} {metric_name}{rest}"
# Handle metric lines
elif line and not line.startswith("#"):
# Extract metric name (first token)
parts = line.split(None, 1)
if parts:
metric_name_part = parts[0]
rest_of_line = parts[1] if len(parts) > 1 else ""
# Remove existing prefix if present
if metric_prefix_filter and line.startswith(
if metric_prefix_filter:
metric_name_part = metric_name_part.removeprefix(
metric_prefix_filter
):
line = line[len(metric_prefix_filter) :]
line = add_prefix + line
)
# Only add prefix if it doesn't already exist
if not metric_name_part.startswith(add_prefix):
metric_name_part = add_prefix + metric_name_part
# Reconstruct line
line = metric_name_part + (
" " + rest_of_line if rest_of_line else ""
)
else:
# Empty line or just whitespace, skip prefix addition
pass
lines.append(line)
......
......@@ -332,12 +332,12 @@ async def init(runtime: DistributedRuntime, config: Config):
logging.info("TensorRT-LLM MetricsCollector initialized")
# Register callback to expose TRT-LLM metrics via Dynamo endpoint
# Filter out python_/process_ metrics and add trtllm: prefix to remaining metrics
# Filter out python_/process_ metrics and add trtllm_ prefix to remaining metrics
register_engine_metrics_callback(
endpoint=endpoint,
registry=REGISTRY,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:",
add_prefix="trtllm_",
)
logging.info("TensorRT-LLM Prometheus metrics registered")
except Exception as e:
......
......@@ -67,7 +67,7 @@ def test_prometheus_metrics_integration():
endpoint=mock_endpoint,
registry=REGISTRY,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:",
add_prefix="trtllm_",
)
print("✅ Prometheus metrics integration test passed")
......
......@@ -53,29 +53,29 @@ tokens_per_second 245.7
dynamo.common.utils.prometheus.generate_latest = original_generate_latest
def test_trtllm_use_case(self, trtllm_registry):
"""Test TensorRT-LLM use case: exclude python_/process_ and add trtllm: prefix."""
"""Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
result = get_prometheus_expfmt(
trtllm_registry,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:",
add_prefix="trtllm_",
)
# Should not contain excluded metrics
assert "python_gc_objects_collected_total" not in result
assert "process_cpu_seconds_total" not in result
# All remaining metrics should have trtllm: prefix
assert "trtllm:request_latency_seconds" in result
assert "trtllm:num_requests_running" in result
assert "trtllm:tokens_per_second" in result
# All remaining metrics should have trtllm_ prefix
assert "trtllm_request_latency_seconds" in result
assert "trtllm_num_requests_running" in result
assert "trtllm_tokens_per_second" in result
# HELP/TYPE comments should have prefix
assert "# HELP trtllm:request_latency_seconds" in result
assert "# TYPE trtllm:num_requests_running" in result
assert "# HELP trtllm_request_latency_seconds" in result
assert "# TYPE trtllm_num_requests_running" in result
# Check specific content and structure preservation
assert 'trtllm:request_latency_seconds_bucket{le="0.1"} 10.0' in result
assert "trtllm:tokens_per_second 245.7" in result
assert 'trtllm_request_latency_seconds_bucket{le="0.1"} 10.0' in result
assert "trtllm_tokens_per_second 245.7" in result
assert result.endswith("\n")
def test_no_filtering_all_frameworks(self, trtllm_registry):
......@@ -99,6 +99,42 @@ tokens_per_second 245.7
# Should return empty string with newline or just newline
assert result == "\n" or result == ""
def test_prefix_already_exists(self):
"""Test that prefix is not added if it already exists."""
registry = Mock()
# Metrics that already have trtllm_ prefix
sample_metrics = """# HELP trtllm_request_success_total Count of successfully processed requests
# TYPE trtllm_request_success_total counter
trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
# HELP trtllm_time_to_first_token_seconds Time to first token
# TYPE trtllm_time_to_first_token_seconds histogram
trtllm_time_to_first_token_seconds_count 5.0
"""
def mock_generate_latest(reg):
return sample_metrics.encode("utf-8")
import dynamo.common.utils.prometheus
original_generate_latest = dynamo.common.utils.prometheus.generate_latest
dynamo.common.utils.prometheus.generate_latest = mock_generate_latest
try:
result = get_prometheus_expfmt(
registry,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm_",
)
# Should not double-add prefix
assert "trtllm_trtllm_request_success_total" not in result
assert "trtllm_request_success_total" in result
assert "trtllm_time_to_first_token_seconds" in result
assert result.endswith("\n")
finally:
dynamo.common.utils.prometheus.generate_latest = original_generate_latest
def test_error_handling(self):
"""Test error handling when registry fails."""
# Create a registry that raises an exception
......
......@@ -7,12 +7,12 @@ SPDX-License-Identifier: Apache-2.0
## Overview
When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm:` prefix is added by Dynamo.
When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm_`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below).
As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm_` prefix is added by Dynamo.
**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
......@@ -58,42 +58,54 @@ curl -H 'Content-Type: application/json' \
http://localhost:8000/v1/chat/completions
# Check metrics from the worker
curl -s localhost:8081/metrics | grep "^trtllm:"
curl -s localhost:8081/metrics | grep "^trtllm_"
```
## Exposed Metrics
TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm:` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.
TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm_` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.
**Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention.
**Example Prometheus Exposition Format text:**
```
# HELP trtllm:request_success_total Count of successfully processed requests.
# TYPE trtllm:request_success_total counter
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
# HELP trtllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE trtllm:time_to_first_token_seconds histogram
trtllm:time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
trtllm:time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
trtllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
# HELP trtllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE trtllm:e2e_request_latency_seconds histogram
trtllm:e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
trtllm:e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
# HELP trtllm_request_success_total Count of successfully processed requests.
# TYPE trtllm_request_success_total counter
trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
# HELP trtllm_time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE trtllm_time_to_first_token_seconds histogram
trtllm_time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
trtllm_time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
trtllm_time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
# HELP trtllm_e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE trtllm_e2e_request_latency_seconds histogram
trtllm_e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
trtllm_e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
# HELP trtllm_time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE trtllm_time_per_output_token_seconds histogram
trtllm_time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
trtllm_time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
# HELP trtllm_request_queue_time_seconds Histogram of time spent in WAITING phase for request.
# TYPE trtllm_request_queue_time_seconds histogram
trtllm_request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
trtllm_request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
```
**Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.
### Metric Categories
TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm:`):
TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm_`):
- **Request metrics** - Request success tracking and latency measurements
- **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time
......@@ -102,17 +114,17 @@ TensorRT-LLM provides metrics in the following categories (all prefixed with `tr
## Available Metrics
The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm:` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:
The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm_` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:
- `trtllm:request_success_total` (Counter) — Count of successfully processed requests by finish reason
- `trtllm_request_success_total` (Counter) — Count of successfully processed requests by finish reason
- Labels: `model_name`, `engine_type`, `finished_reason`
- `trtllm:e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
- `trtllm_e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
- `trtllm_time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
- `trtllm_time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
- `trtllm_request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
- Labels: `model_name`, `engine_type`
These metric names and availability are subject to change with TensorRT-LLM version updates.
......@@ -161,7 +173,7 @@ TensorRT-LLM provides extensive performance data beyond the basic Prometheus met
## Implementation Details
- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm:"`
- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm_"`
- **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled
- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)
......
......@@ -84,7 +84,7 @@ Dynamo exposes several categories of metrics:
- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements
- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime
- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics
- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`)
- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm_*`)
## Runtime Hierarchy
......
......@@ -12,15 +12,15 @@ from tests.utils.payloads import (
)
# Common default text prompt used across tests
TEXT_PROMPT = "Tell me a short joke about AI."
TEXT_PROMPT = "Tell me a knock knock joke about AI."
def chat_payload_default(
repeat_count: int = 3,
expected_response: Optional[List[str]] = None,
expected_log: Optional[List[str]] = None,
max_tokens: int = 150,
temperature: float = 0.1,
max_tokens: int = 1000,
temperature: float = 0.0,
stream: bool = False,
) -> ChatPayload:
return ChatPayload(
......@@ -37,7 +37,9 @@ def chat_payload_default(
},
repeat_count=repeat_count,
expected_log=expected_log or [],
expected_response=expected_response or ["AI"],
# Accept any of these keywords in the response (case-insensitive)
expected_response=expected_response
or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
)
......@@ -45,8 +47,8 @@ def completion_payload_default(
repeat_count: int = 3,
expected_response: Optional[List[str]] = None,
expected_log: Optional[List[str]] = None,
max_tokens: int = 150,
temperature: float = 0.1,
max_tokens: int = 1000,
temperature: float = 0.0,
stream: bool = False,
) -> CompletionPayload:
return CompletionPayload(
......@@ -58,7 +60,9 @@ def completion_payload_default(
},
repeat_count=repeat_count,
expected_log=expected_log or [],
expected_response=expected_response or ["AI"],
# Accept any of these keywords in the response (case-insensitive)
expected_response=expected_response
or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
)
......
......@@ -30,7 +30,7 @@ class BasePayload:
"""Generic payload body plus expectations and repeat count."""
body: Dict[str, Any]
expected_response: List[str]
expected_response: List[Any] # Can be List[str] or List[List[str]] for alternatives
expected_log: List[str]
repeat_count: int = 1
timeout: int = 60
......@@ -56,17 +56,40 @@ class BasePayload:
raise NotImplementedError("Subclasses must implement response_handler()")
def validate(self, response: Any, content: str) -> None:
"""Default validation: ensure expected substrings appear in content."""
"""Default validation: ensure expected substrings appear in content.
If expected_response is a list of strings, ANY one of them matching is sufficient (OR logic).
This allows flexible validation where responses may vary but should contain at least one keyword.
"""
if self.expected_response:
missing_expected = []
# Check if content is empty
if not content:
logger.error("VALIDATION FAILED - Response content is empty")
raise AssertionError(
f"Expected content not found in response. Expected any of: {self.expected_response}. Actual content is empty."
)
# Check if ANY of the expected strings are found (OR logic) and count matches
found_keywords = []
for expected in self.expected_response:
if not content or expected not in content:
missing_expected.append(expected)
if missing_expected:
if isinstance(expected, str) and expected.lower() in content.lower():
found_keywords.append(expected)
if not found_keywords:
logger.error(
f"VALIDATION FAILED - Actual content returned: {repr(content)}"
)
logger.error(
f"Expected to find at least one of: {self.expected_response}"
)
logger.error(f"Matches found: 0/{len(self.expected_response)}")
raise AssertionError(
f"Expected content not found in response. Missing: {missing_expected}"
f"Expected content not found in response. Expected at least one of: {self.expected_response}. Actual content: {repr(content)}"
)
logger.info(
f"SUCCESS: Found {len(found_keywords)}/{len(self.expected_response)} expected keywords: {found_keywords}"
)
logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.")
def process_response(self, response: Any) -> str:
"""Convenience: run response_handler then validate; return content."""
......@@ -88,9 +111,14 @@ class ChatPayload(BasePayload):
"""
response.raise_for_status()
result = response.json()
assert "choices" in result, "Missing 'choices' in response"
assert (
"choices" in result
), f"Missing 'choices' in response. Response keys: {list(result.keys())}"
assert len(result["choices"]) > 0, "Empty choices in response"
assert "message" in result["choices"][0], "Missing 'message' in first choice"
assert (
"message" in result["choices"][0]
), f"Missing 'message' in first choice. Choice keys: {list(result['choices'][0].keys())}"
# Check for content in all possible fields where parsers might put output:
# 1. content - standard message content
......@@ -305,13 +333,13 @@ class MetricsPayload(BasePayload):
elif backend == "trtllm":
metrics_to_check.append(
MetricCheck(
# Check: Minimum count of unique trtllm:* metrics
name="trtllm:*",
pattern=lambda name: r"^trtllm:\w+",
# Check: Minimum count of unique trtllm_* metrics
name="trtllm_*",
pattern=lambda name: r"^trtllm_\w+",
validator=lambda value: len(set(value))
>= 4, # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 4 unique trtllm:* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm:* metrics (minimum required: 4)",
error_msg=lambda name, value: f"Expected at least 4 unique trtllm_* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm_* metrics (minimum required: 4)",
multiline=True,
)
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment