Unverified commit 381c428c, authored by Keiven C, committed by GitHub
Browse files

refactor: change TensorRT-LLM metrics prefix from trtllm: to trtllm_ (#4269)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 4adab52d
...@@ -45,7 +45,7 @@ def register_engine_metrics_callback( ...@@ -45,7 +45,7 @@ def register_engine_metrics_callback(
registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry) registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering) metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"]) exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:") add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
Example: Example:
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
...@@ -57,7 +57,7 @@ def register_engine_metrics_callback( ...@@ -57,7 +57,7 @@ def register_engine_metrics_callback(
register_engine_metrics_callback( register_engine_metrics_callback(
generate_endpoint, REGISTRY, generate_endpoint, REGISTRY,
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:" add_prefix="trtllm_"
) )
""" """
...@@ -116,14 +116,14 @@ def get_prometheus_expfmt( ...@@ -116,14 +116,14 @@ def get_prometheus_expfmt(
metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:"). metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
If None, returns all metrics. (default: None) If None, returns all metrics. (default: None)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"]) exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:") add_prefix: Prefix to add to remaining metrics (e.g., "trtllm_")
Returns: Returns:
Formatted metrics text in Prometheus exposition format. Returns empty string on error. Formatted metrics text in Prometheus exposition format. Returns empty string on error.
Example: Example:
# Filter out python_/process_ metrics and add trtllm: prefix # Filter out python_/process_ metrics and add trtllm_ prefix
get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm:") get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm_")
""" """
try: try:
# Generate metrics in Prometheus text format # Generate metrics in Prometheus text format
...@@ -165,20 +165,39 @@ def get_prometheus_expfmt( ...@@ -165,20 +165,39 @@ def get_prometheus_expfmt(
if match: if match:
comment_type, metric_name, rest = match.groups() comment_type, metric_name, rest = match.groups()
# Remove existing prefix if present # Remove existing prefix if present
if metric_prefix_filter and metric_name.startswith( if metric_prefix_filter:
metric_name = metric_name.removeprefix(
metric_prefix_filter metric_prefix_filter
): )
metric_name = metric_name[len(metric_prefix_filter) :] # Only add prefix if it doesn't already exist
new_metric_name = add_prefix + metric_name if not metric_name.startswith(add_prefix):
line = f"# {comment_type} {new_metric_name}{rest}" metric_name = add_prefix + metric_name
line = f"# {comment_type} {metric_name}{rest}"
# Handle metric lines # Handle metric lines
elif line and not line.startswith("#"): elif line and not line.startswith("#"):
# Extract metric name (first token)
parts = line.split(None, 1)
if parts:
metric_name_part = parts[0]
rest_of_line = parts[1] if len(parts) > 1 else ""
# Remove existing prefix if present # Remove existing prefix if present
if metric_prefix_filter and line.startswith( if metric_prefix_filter:
metric_name_part = metric_name_part.removeprefix(
metric_prefix_filter metric_prefix_filter
): )
line = line[len(metric_prefix_filter) :]
line = add_prefix + line # Only add prefix if it doesn't already exist
if not metric_name_part.startswith(add_prefix):
metric_name_part = add_prefix + metric_name_part
# Reconstruct line
line = metric_name_part + (
" " + rest_of_line if rest_of_line else ""
)
else:
# Empty line or just whitespace, skip prefix addition
pass
lines.append(line) lines.append(line)
......
...@@ -332,12 +332,12 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -332,12 +332,12 @@ async def init(runtime: DistributedRuntime, config: Config):
logging.info("TensorRT-LLM MetricsCollector initialized") logging.info("TensorRT-LLM MetricsCollector initialized")
# Register callback to expose TRT-LLM metrics via Dynamo endpoint # Register callback to expose TRT-LLM metrics via Dynamo endpoint
# Filter out python_/process_ metrics and add trtllm: prefix to remaining metrics # Filter out python_/process_ metrics and add trtllm_ prefix to remaining metrics
register_engine_metrics_callback( register_engine_metrics_callback(
endpoint=endpoint, endpoint=endpoint,
registry=REGISTRY, registry=REGISTRY,
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:", add_prefix="trtllm_",
) )
logging.info("TensorRT-LLM Prometheus metrics registered") logging.info("TensorRT-LLM Prometheus metrics registered")
except Exception as e: except Exception as e:
......
...@@ -67,7 +67,7 @@ def test_prometheus_metrics_integration(): ...@@ -67,7 +67,7 @@ def test_prometheus_metrics_integration():
endpoint=mock_endpoint, endpoint=mock_endpoint,
registry=REGISTRY, registry=REGISTRY,
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:", add_prefix="trtllm_",
) )
print("✅ Prometheus metrics integration test passed") print("✅ Prometheus metrics integration test passed")
......
...@@ -53,29 +53,29 @@ tokens_per_second 245.7 ...@@ -53,29 +53,29 @@ tokens_per_second 245.7
dynamo.common.utils.prometheus.generate_latest = original_generate_latest dynamo.common.utils.prometheus.generate_latest = original_generate_latest
def test_trtllm_use_case(self, trtllm_registry): def test_trtllm_use_case(self, trtllm_registry):
"""Test TensorRT-LLM use case: exclude python_/process_ and add trtllm: prefix.""" """Test TensorRT-LLM use case: exclude python_/process_ and add trtllm_ prefix."""
result = get_prometheus_expfmt( result = get_prometheus_expfmt(
trtllm_registry, trtllm_registry,
exclude_prefixes=["python_", "process_"], exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:", add_prefix="trtllm_",
) )
# Should not contain excluded metrics # Should not contain excluded metrics
assert "python_gc_objects_collected_total" not in result assert "python_gc_objects_collected_total" not in result
assert "process_cpu_seconds_total" not in result assert "process_cpu_seconds_total" not in result
# All remaining metrics should have trtllm: prefix # All remaining metrics should have trtllm_ prefix
assert "trtllm:request_latency_seconds" in result assert "trtllm_request_latency_seconds" in result
assert "trtllm:num_requests_running" in result assert "trtllm_num_requests_running" in result
assert "trtllm:tokens_per_second" in result assert "trtllm_tokens_per_second" in result
# HELP/TYPE comments should have prefix # HELP/TYPE comments should have prefix
assert "# HELP trtllm:request_latency_seconds" in result assert "# HELP trtllm_request_latency_seconds" in result
assert "# TYPE trtllm:num_requests_running" in result assert "# TYPE trtllm_num_requests_running" in result
# Check specific content and structure preservation # Check specific content and structure preservation
assert 'trtllm:request_latency_seconds_bucket{le="0.1"} 10.0' in result assert 'trtllm_request_latency_seconds_bucket{le="0.1"} 10.0' in result
assert "trtllm:tokens_per_second 245.7" in result assert "trtllm_tokens_per_second 245.7" in result
assert result.endswith("\n") assert result.endswith("\n")
def test_no_filtering_all_frameworks(self, trtllm_registry): def test_no_filtering_all_frameworks(self, trtllm_registry):
...@@ -99,6 +99,42 @@ tokens_per_second 245.7 ...@@ -99,6 +99,42 @@ tokens_per_second 245.7
# Should return empty string with newline or just newline # Should return empty string with newline or just newline
assert result == "\n" or result == "" assert result == "\n" or result == ""
def test_prefix_already_exists(self):
    """Verify that add_prefix is not applied to already-prefixed metric names.

    ``generate_latest`` is temporarily replaced with a stub returning
    exposition text in which every metric name already starts with
    ``trtllm_``. Calling ``get_prometheus_expfmt`` with
    ``add_prefix="trtllm_"`` must leave those names unchanged, both on the
    ``# HELP`` / ``# TYPE`` comment lines and on the sample lines.
    """
    registry = Mock()

    # Metrics that already have trtllm_ prefix
    sample_metrics = """# HELP trtllm_request_success_total Count of successfully processed requests
# TYPE trtllm_request_success_total counter
trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0
# HELP trtllm_time_to_first_token_seconds Time to first token
# TYPE trtllm_time_to_first_token_seconds histogram
trtllm_time_to_first_token_seconds_count 5.0
"""

    def mock_generate_latest(reg):
        return sample_metrics.encode("utf-8")

    import dynamo.common.utils.prometheus

    # Swap in the stub by hand and restore it in ``finally`` so a failing
    # assertion cannot leak the stub into other tests.
    original_generate_latest = dynamo.common.utils.prometheus.generate_latest
    dynamo.common.utils.prometheus.generate_latest = mock_generate_latest

    try:
        result = get_prometheus_expfmt(
            registry,
            exclude_prefixes=["python_", "process_"],
            add_prefix="trtllm_",
        )

        # Should not double-add prefix on ANY metric. A plain substring check
        # for the single-prefixed name would also match a double-prefixed
        # name, so the doubled form is rejected globally first.
        assert "trtllm_trtllm_" not in result
        assert "trtllm_trtllm_request_success_total" not in result
        assert "trtllm_request_success_total" in result
        assert "trtllm_time_to_first_token_seconds" in result

        # Sample lines (labels and values) must come through unchanged.
        assert (
            'trtllm_request_success_total{model_name="test",finished_reason="stop"} 10.0'
            in result
        )
        assert "trtllm_time_to_first_token_seconds_count 5.0" in result
        assert result.endswith("\n")
    finally:
        dynamo.common.utils.prometheus.generate_latest = original_generate_latest
def test_error_handling(self): def test_error_handling(self):
"""Test error handling when registry fails.""" """Test error handling when registry fails."""
# Create a registry that raises an exception # Create a registry that raises an exception
......
...@@ -7,12 +7,12 @@ SPDX-License-Identifier: Apache-2.0 ...@@ -7,12 +7,12 @@ SPDX-License-Identifier: Apache-2.0
## Overview ## Overview
When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint. When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm_`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm:` prefix is added by Dynamo.
Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below). Additional performance metrics are available via non-Prometheus APIs (see [Non-Prometheus Performance Metrics](#non-prometheus-performance-metrics) below).
As of the date of this documentation, the included TensorRT-LLM version 1.1.0rc5 exposes **5 basic Prometheus metrics**. Note that the `trtllm_` prefix is added by Dynamo.
**For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md). **For Dynamo runtime metrics**, see the [Dynamo Metrics Guide](../../observability/metrics.md).
**For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md). **For visualization setup instructions**, see the [Prometheus and Grafana Setup Guide](../../observability/prometheus-grafana.md).
...@@ -58,42 +58,54 @@ curl -H 'Content-Type: application/json' \ ...@@ -58,42 +58,54 @@ curl -H 'Content-Type: application/json' \
http://localhost:8000/v1/chat/completions http://localhost:8000/v1/chat/completions
# Check metrics from the worker # Check metrics from the worker
curl -s localhost:8081/metrics | grep "^trtllm:" curl -s localhost:8081/metrics | grep "^trtllm_"
``` ```
## Exposed Metrics ## Exposed Metrics
TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm:` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source. TensorRT-LLM exposes metrics in Prometheus Exposition Format text at the `/metrics` HTTP endpoint. All TensorRT-LLM engine metrics use the `trtllm_` prefix and include labels (e.g., `model_name`, `engine_type`, `finished_reason`) to identify the source.
**Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention. **Note:** TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention.
**Example Prometheus Exposition Format text:** **Example Prometheus Exposition Format text:**
``` ```
# HELP trtllm:request_success_total Count of successfully processed requests. # HELP trtllm_request_success_total Count of successfully processed requests.
# TYPE trtllm:request_success_total counter # TYPE trtllm_request_success_total counter
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0 trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0 trtllm_request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
# HELP trtllm:time_to_first_token_seconds Histogram of time to first token in seconds. # HELP trtllm_time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE trtllm:time_to_first_token_seconds histogram # TYPE trtllm_time_to_first_token_seconds histogram
trtllm:time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0 trtllm_time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
trtllm:time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0 trtllm_time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
trtllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 trtllm_time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75 trtllm_time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
# HELP trtllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds. # HELP trtllm_e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE trtllm:e2e_request_latency_seconds histogram # TYPE trtllm_e2e_request_latency_seconds histogram
trtllm:e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0 trtllm_e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
trtllm:e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0 trtllm_e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2 trtllm_e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
# HELP trtllm_time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE trtllm_time_per_output_token_seconds histogram
trtllm_time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
trtllm_time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
# HELP trtllm_request_queue_time_seconds Histogram of time spent in WAITING phase for request.
# TYPE trtllm_request_queue_time_seconds histogram
trtllm_request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
trtllm_request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm_request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
``` ```
**Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list. **Note:** The specific metrics shown above are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.
### Metric Categories ### Metric Categories
TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm:`): TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm_`):
- **Request metrics** - Request success tracking and latency measurements - **Request metrics** - Request success tracking and latency measurements
- **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time - **Performance metrics** - Time to first token (TTFT), time per output token (TPOT), and queue time
...@@ -102,17 +114,17 @@ TensorRT-LLM provides metrics in the following categories (all prefixed with `tr ...@@ -102,17 +114,17 @@ TensorRT-LLM provides metrics in the following categories (all prefixed with `tr
## Available Metrics ## Available Metrics
The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm:` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5: The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm_` prefix added by Dynamo) for TensorRT-LLM version 1.1.0rc5:
- `trtllm:request_success_total` (Counter) — Count of successfully processed requests by finish reason - `trtllm_request_success_total` (Counter) — Count of successfully processed requests by finish reason
- Labels: `model_name`, `engine_type`, `finished_reason` - Labels: `model_name`, `engine_type`, `finished_reason`
- `trtllm:e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds) - `trtllm_e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
- Labels: `model_name`, `engine_type` - Labels: `model_name`, `engine_type`
- `trtllm:time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds) - `trtllm_time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
- Labels: `model_name`, `engine_type` - Labels: `model_name`, `engine_type`
- `trtllm:time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds) - `trtllm_time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
- Labels: `model_name`, `engine_type` - Labels: `model_name`, `engine_type`
- `trtllm:request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds) - `trtllm_request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
- Labels: `model_name`, `engine_type` - Labels: `model_name`, `engine_type`
These metric names and availability are subject to change with TensorRT-LLM version updates. These metric names and availability are subject to change with TensorRT-LLM version updates.
...@@ -161,7 +173,7 @@ TensorRT-LLM provides extensive performance data beyond the basic Prometheus met ...@@ -161,7 +173,7 @@ TensorRT-LLM provides extensive performance data beyond the basic Prometheus met
## Implementation Details ## Implementation Details
- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)) - **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm:"` - **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm_"`
- **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled - **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled
- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes - **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type) - **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)
......
...@@ -84,7 +84,7 @@ Dynamo exposes several categories of metrics: ...@@ -84,7 +84,7 @@ Dynamo exposes several categories of metrics:
- **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements - **Frontend Metrics** (`dynamo_frontend_*`) - Request handling, token processing, and latency measurements
- **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime - **Component Metrics** (`dynamo_component_*`) - Request counts, processing times, byte transfers, and system uptime
- **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics - **Specialized Component Metrics** (e.g., `dynamo_preprocessor_*`) - Component-specific metrics
- **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm:*`) - **Engine Metrics** (Pass-through) - Backend engines expose their own metrics: [vLLM](../backends/vllm/prometheus.md) (`vllm:*`), [SGLang](../backends/sglang/prometheus.md) (`sglang:*`), [TensorRT-LLM](../backends/trtllm/prometheus.md) (`trtllm_*`)
## Runtime Hierarchy ## Runtime Hierarchy
......
...@@ -12,15 +12,15 @@ from tests.utils.payloads import ( ...@@ -12,15 +12,15 @@ from tests.utils.payloads import (
) )
# Common default text prompt used across tests # Common default text prompt used across tests
TEXT_PROMPT = "Tell me a short joke about AI." TEXT_PROMPT = "Tell me a knock knock joke about AI."
def chat_payload_default( def chat_payload_default(
repeat_count: int = 3, repeat_count: int = 3,
expected_response: Optional[List[str]] = None, expected_response: Optional[List[str]] = None,
expected_log: Optional[List[str]] = None, expected_log: Optional[List[str]] = None,
max_tokens: int = 150, max_tokens: int = 1000,
temperature: float = 0.1, temperature: float = 0.0,
stream: bool = False, stream: bool = False,
) -> ChatPayload: ) -> ChatPayload:
return ChatPayload( return ChatPayload(
...@@ -37,7 +37,9 @@ def chat_payload_default( ...@@ -37,7 +37,9 @@ def chat_payload_default(
}, },
repeat_count=repeat_count, repeat_count=repeat_count,
expected_log=expected_log or [], expected_log=expected_log or [],
expected_response=expected_response or ["AI"], # Accept any of these keywords in the response (case-insensitive)
expected_response=expected_response
or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
) )
...@@ -45,8 +47,8 @@ def completion_payload_default( ...@@ -45,8 +47,8 @@ def completion_payload_default(
repeat_count: int = 3, repeat_count: int = 3,
expected_response: Optional[List[str]] = None, expected_response: Optional[List[str]] = None,
expected_log: Optional[List[str]] = None, expected_log: Optional[List[str]] = None,
max_tokens: int = 150, max_tokens: int = 1000,
temperature: float = 0.1, temperature: float = 0.0,
stream: bool = False, stream: bool = False,
) -> CompletionPayload: ) -> CompletionPayload:
return CompletionPayload( return CompletionPayload(
...@@ -58,7 +60,9 @@ def completion_payload_default( ...@@ -58,7 +60,9 @@ def completion_payload_default(
}, },
repeat_count=repeat_count, repeat_count=repeat_count,
expected_log=expected_log or [], expected_log=expected_log or [],
expected_response=expected_response or ["AI"], # Accept any of these keywords in the response (case-insensitive)
expected_response=expected_response
or ["AI", "knock", "joke", "think", "artificial", "intelligence"],
) )
......
...@@ -30,7 +30,7 @@ class BasePayload: ...@@ -30,7 +30,7 @@ class BasePayload:
"""Generic payload body plus expectations and repeat count.""" """Generic payload body plus expectations and repeat count."""
body: Dict[str, Any] body: Dict[str, Any]
expected_response: List[str] expected_response: List[Any] # Can be List[str] or List[List[str]] for alternatives
expected_log: List[str] expected_log: List[str]
repeat_count: int = 1 repeat_count: int = 1
timeout: int = 60 timeout: int = 60
...@@ -56,17 +56,40 @@ class BasePayload: ...@@ -56,17 +56,40 @@ class BasePayload:
raise NotImplementedError("Subclasses must implement response_handler()") raise NotImplementedError("Subclasses must implement response_handler()")
def validate(self, response: Any, content: str) -> None: def validate(self, response: Any, content: str) -> None:
"""Default validation: ensure expected substrings appear in content.""" """Default validation: ensure expected substrings appear in content.
If expected_response is a list of strings, ANY one of them matching is sufficient (OR logic).
This allows flexible validation where responses may vary but should contain at least one keyword.
"""
if self.expected_response: if self.expected_response:
missing_expected = [] # Check if content is empty
if not content:
logger.error("VALIDATION FAILED - Response content is empty")
raise AssertionError(
f"Expected content not found in response. Expected any of: {self.expected_response}. Actual content is empty."
)
# Check if ANY of the expected strings are found (OR logic) and count matches
found_keywords = []
for expected in self.expected_response: for expected in self.expected_response:
if not content or expected not in content: if isinstance(expected, str) and expected.lower() in content.lower():
missing_expected.append(expected) found_keywords.append(expected)
if missing_expected:
if not found_keywords:
logger.error(
f"VALIDATION FAILED - Actual content returned: {repr(content)}"
)
logger.error(
f"Expected to find at least one of: {self.expected_response}"
)
logger.error(f"Matches found: 0/{len(self.expected_response)}")
raise AssertionError( raise AssertionError(
f"Expected content not found in response. Missing: {missing_expected}" f"Expected content not found in response. Expected at least one of: {self.expected_response}. Actual content: {repr(content)}"
)
logger.info(
f"SUCCESS: Found {len(found_keywords)}/{len(self.expected_response)} expected keywords: {found_keywords}"
) )
logger.info(f"SUCCESS: All expected_responses: {self.expected_response} found.")
def process_response(self, response: Any) -> str: def process_response(self, response: Any) -> str:
"""Convenience: run response_handler then validate; return content.""" """Convenience: run response_handler then validate; return content."""
...@@ -88,9 +111,14 @@ class ChatPayload(BasePayload): ...@@ -88,9 +111,14 @@ class ChatPayload(BasePayload):
""" """
response.raise_for_status() response.raise_for_status()
result = response.json() result = response.json()
assert "choices" in result, "Missing 'choices' in response"
assert (
"choices" in result
), f"Missing 'choices' in response. Response keys: {list(result.keys())}"
assert len(result["choices"]) > 0, "Empty choices in response" assert len(result["choices"]) > 0, "Empty choices in response"
assert "message" in result["choices"][0], "Missing 'message' in first choice" assert (
"message" in result["choices"][0]
), f"Missing 'message' in first choice. Choice keys: {list(result['choices'][0].keys())}"
# Check for content in all possible fields where parsers might put output: # Check for content in all possible fields where parsers might put output:
# 1. content - standard message content # 1. content - standard message content
...@@ -305,13 +333,13 @@ class MetricsPayload(BasePayload): ...@@ -305,13 +333,13 @@ class MetricsPayload(BasePayload):
elif backend == "trtllm": elif backend == "trtllm":
metrics_to_check.append( metrics_to_check.append(
MetricCheck( MetricCheck(
# Check: Minimum count of unique trtllm:* metrics # Check: Minimum count of unique trtllm_* metrics
name="trtllm:*", name="trtllm_*",
pattern=lambda name: r"^trtllm:\w+", pattern=lambda name: r"^trtllm_\w+",
validator=lambda value: len(set(value)) validator=lambda value: len(set(value))
>= 4, # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow) >= 4, # 80% of typical ~5 trtllm metrics (excluding _bucket) as of 2025-10-22 (but will grow)
error_msg=lambda name, value: f"Expected at least 4 unique trtllm:* metrics, but found only {len(set(value))}", error_msg=lambda name, value: f"Expected at least 4 unique trtllm_* metrics, but found only {len(set(value))}",
success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm:* metrics (minimum required: 4)", success_msg=lambda name, value: f"SUCCESS: Found {len(set(value))} unique trtllm_* metrics (minimum required: 4)",
multiline=True, multiline=True,
) )
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment