Unverified Commit f93b619a authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

feat: add TensorRT-LLM Prometheus metrics support with prefixing + filtering (#3676)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 79ad7f36
...@@ -29,8 +29,9 @@ if TYPE_CHECKING: ...@@ -29,8 +29,9 @@ if TYPE_CHECKING:
def register_engine_metrics_callback( def register_engine_metrics_callback(
endpoint: Endpoint, endpoint: Endpoint,
registry: "CollectorRegistry", registry: "CollectorRegistry",
metric_prefix: str, metric_prefix_filter: Optional[str] = None,
engine_name: str, exclude_prefixes: Optional[list[str]] = None,
add_prefix: Optional[str] = None,
) -> None: ) -> None:
""" """
Register a callback to expose engine Prometheus metrics via Dynamo's metrics endpoint. Register a callback to expose engine Prometheus metrics via Dynamo's metrics endpoint.
...@@ -41,19 +42,32 @@ def register_engine_metrics_callback( ...@@ -41,19 +42,32 @@ def register_engine_metrics_callback(
Args: Args:
endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback() endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback()
registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry) registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
metric_prefix: Prefix to filter metrics (e.g., "vllm:" or "sglang:") metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
engine_name: Name of the engine for logging (e.g., "vLLM" or "SGLang") exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
Example: Example:
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
register_engine_metrics_callback( register_engine_metrics_callback(
generate_endpoint, REGISTRY, "vllm:", "vLLM" generate_endpoint, REGISTRY, metric_prefix_filter="vllm:"
)
# With filtering and prefixing for TensorRT-LLM
register_engine_metrics_callback(
generate_endpoint, REGISTRY,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:"
) )
""" """
def get_expfmt() -> str: def get_expfmt() -> str:
"""Callback to return engine Prometheus metrics in exposition format""" """Callback to return engine Prometheus metrics in exposition format"""
return get_prometheus_expfmt(registry, metric_prefix_filter=metric_prefix) return get_prometheus_expfmt(
registry,
metric_prefix_filter=metric_prefix_filter,
exclude_prefixes=exclude_prefixes,
add_prefix=add_prefix,
)
endpoint.metrics.register_prometheus_expfmt_callback(get_expfmt) endpoint.metrics.register_prometheus_expfmt_callback(get_expfmt)
...@@ -61,27 +75,14 @@ def register_engine_metrics_callback( ...@@ -61,27 +75,14 @@ def register_engine_metrics_callback(
def get_prometheus_expfmt( def get_prometheus_expfmt(
registry, registry,
metric_prefix_filter: Optional[str] = None, metric_prefix_filter: Optional[str] = None,
exclude_prefixes: Optional[list[str]] = None,
add_prefix: Optional[str] = None,
) -> str: ) -> str:
""" """
Get Prometheus metrics from a registry formatted as text using the standard text encoder. Get Prometheus metrics from a registry formatted as text using the standard text encoder.
Collects all metrics from the registry and returns them in Prometheus text exposition format. Collects all metrics from the registry and returns them in Prometheus text exposition format.
Optionally filters metrics by prefix. Optionally filters metrics by prefix, excludes certain prefixes, and adds a prefix.
Prometheus exposition format consists of:
- Comment lines starting with # (HELP and TYPE declarations)
- Metric lines with format: metric_name{label="value"} metric_value timestamp
Example output format:
# HELP vllm:request_success_total Number of successful requests
# TYPE vllm:request_success_total counter
vllm:request_success_total{model="llama2",endpoint="generate"} 150.0
# HELP vllm:time_to_first_token_seconds Time to first token
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.01"} 10.0
vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.1"} 45.0
vllm:time_to_first_token_seconds_count{model="llama2"} 50.0
vllm:time_to_first_token_seconds_sum{model="llama2"} 2.5
Args: Args:
registry: Prometheus registry to collect from. registry: Prometheus registry to collect from.
...@@ -89,34 +90,79 @@ def get_prometheus_expfmt( ...@@ -89,34 +90,79 @@ def get_prometheus_expfmt(
Pass REGISTRY for vLLM single-process mode. Pass REGISTRY for vLLM single-process mode.
metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:"). metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
If None, returns all metrics. (default: None) If None, returns all metrics. (default: None)
exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
Returns: Returns:
Formatted metrics text in Prometheus exposition format. Returns empty string on error. Formatted metrics text in Prometheus exposition format. Returns empty string on error.
Example: Example:
from prometheus_client import REGISTRY # Filter out python_/process_ metrics and add trtllm: prefix
metrics_text = get_prometheus_expfmt(REGISTRY) get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm:")
print(metrics_text)
# With filter
vllm_metrics = get_prometheus_expfmt(REGISTRY, metric_prefix_filter="vllm:")
""" """
try: try:
# Generate metrics in Prometheus text format # Generate metrics in Prometheus text format
metrics_text = generate_latest(registry).decode("utf-8") metrics_text = generate_latest(registry).decode("utf-8")
if metric_prefix_filter: if metric_prefix_filter or exclude_prefixes or add_prefix:
# Filter lines: keep metric lines starting with prefix and their HELP/TYPE comments lines = []
escaped_prefix = re.escape(metric_prefix_filter)
pattern = rf"^(?:{escaped_prefix}|# (?:HELP|TYPE) {escaped_prefix})" # Build exclude pattern for lines to skip entirely
filtered_lines = [ exclude_line_pattern = None
line for line in metrics_text.split("\n") if re.match(pattern, line) if exclude_prefixes:
] escaped_prefixes = [re.escape(prefix) for prefix in exclude_prefixes]
result = "\n".join(filtered_lines) prefixes_regex = "|".join(escaped_prefixes)
if result: # Match lines starting with: HELP/TYPE comments OR metric lines with excluded prefixes
# Ensure result ends with newline exclude_line_pattern = re.compile(
if result and not result.endswith("\n"): rf"^(# (HELP|TYPE) )?({prefixes_regex})"
result += "\n" )
# Build include pattern if needed
include_pattern = None
if metric_prefix_filter:
escaped_prefix = re.escape(metric_prefix_filter)
include_pattern = re.compile(rf"^(# (HELP|TYPE) )?{escaped_prefix}")
for line in metrics_text.split("\n"):
if not line.strip():
continue
# Skip excluded lines entirely
if exclude_line_pattern and exclude_line_pattern.match(line):
continue
# Apply include filter if specified
if include_pattern and not include_pattern.match(line):
continue
# Apply prefix transformation if needed
if add_prefix:
# Handle HELP/TYPE comments
if line.startswith("# HELP ") or line.startswith("# TYPE "):
match = re.match(r"^# (HELP|TYPE) (\S+)(.*)$", line)
if match:
comment_type, metric_name, rest = match.groups()
# Remove existing prefix if present
if metric_prefix_filter and metric_name.startswith(
metric_prefix_filter
):
metric_name = metric_name[len(metric_prefix_filter) :]
new_metric_name = add_prefix + metric_name
line = f"# {comment_type} {new_metric_name}{rest}"
# Handle metric lines
elif line and not line.startswith("#"):
# Remove existing prefix if present
if metric_prefix_filter and line.startswith(
metric_prefix_filter
):
line = line[len(metric_prefix_filter) :]
line = add_prefix + line
lines.append(line)
result = "\n".join(lines)
if result and not result.endswith("\n"):
result += "\n"
return result return result
else: else:
# Ensure metrics_text ends with newline # Ensure metrics_text ends with newline
......
...@@ -94,6 +94,8 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -94,6 +94,8 @@ async def init(runtime: DistributedRuntime, config: Config):
) )
# publisher instantiates the metrics and kv event publishers # publisher instantiates the metrics and kv event publishers
# Note that when engine.server_args.enable_metrics is True, it'll also
# gather internal SGLang Prometheus metrics from all worker processes.
publisher, metrics_task, metrics_labels = await setup_sgl_metrics( publisher, metrics_task, metrics_labels = await setup_sgl_metrics(
engine, config, component, generate_endpoint engine, config, component, generate_endpoint
) )
......
...@@ -226,7 +226,9 @@ async def setup_sgl_metrics( ...@@ -226,7 +226,9 @@ async def setup_sgl_metrics(
registry = CollectorRegistry() registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry) multiprocess.MultiProcessCollector(registry)
register_engine_metrics_callback( register_engine_metrics_callback(
generate_endpoint, registry, "sglang:", "SGLang" endpoint=generate_endpoint,
registry=registry,
metric_prefix_filter="sglang:",
) )
task = asyncio.create_task(publisher.run()) task = asyncio.create_task(publisher.run())
......
...@@ -20,6 +20,7 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv( ...@@ -20,6 +20,7 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv(
tllm_level = map_dyn_log_to_tllm_level(dyn_log) tllm_level = map_dyn_log_to_tllm_level(dyn_log)
os.environ["TLLM_LOG_LEVEL"] = tllm_level os.environ["TLLM_LOG_LEVEL"] = tllm_level
import uvloop import uvloop
from prometheus_client import REGISTRY
from tensorrt_llm.llmapi import ( from tensorrt_llm.llmapi import (
BuildConfig, BuildConfig,
CapacitySchedulerPolicy, CapacitySchedulerPolicy,
...@@ -30,11 +31,13 @@ from tensorrt_llm.llmapi import ( ...@@ -30,11 +31,13 @@ from tensorrt_llm.llmapi import (
from tensorrt_llm.llmapi.llm import SamplingParams from tensorrt_llm.llmapi.llm import SamplingParams
from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options
from tensorrt_llm.llmapi.tokenizer import tokenizer_factory from tensorrt_llm.llmapi.tokenizer import tokenizer_factory
from tensorrt_llm.metrics import MetricsCollector
from torch.cuda import device_count from torch.cuda import device_count
from transformers import AutoConfig from transformers import AutoConfig
import dynamo.nixl_connect as nixl_connect import dynamo.nixl_connect as nixl_connect
from dynamo.common.config_dump import dump_config from dynamo.common.config_dump import dump_config
from dynamo.common.utils.prometheus import register_engine_metrics_callback
from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
...@@ -218,6 +221,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -218,6 +221,7 @@ async def init(runtime: DistributedRuntime, config: Config):
"max_seq_len": config.max_seq_len, "max_seq_len": config.max_seq_len,
"max_beam_width": config.max_beam_width, "max_beam_width": config.max_beam_width,
"max_batch_size": config.max_batch_size, "max_batch_size": config.max_batch_size,
"return_perf_metrics": config.publish_events_and_metrics,
} }
if config.extra_engine_args != "": if config.extra_engine_args != "":
...@@ -234,19 +238,21 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -234,19 +238,21 @@ async def init(runtime: DistributedRuntime, config: Config):
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
logging.error(f"Failed to parse override_engine_args as JSON: {e}") logging.error(f"Failed to parse override_engine_args as JSON: {e}")
sys.exit(1) sys.exit(1)
if config.publish_events_and_metrics: if config.publish_events_and_metrics:
# 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events. # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
kv_cache_config = None # Convert KvCacheConfig object to dict and add the parameter
if "kv_cache_config" not in arg_map: current_kv_config = arg_map["kv_cache_config"]
kv_cache_config = {} if isinstance(current_kv_config, KvCacheConfig):
kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE arg_map["kv_cache_config"] = {
else: "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
kv_cache_config = arg_map["kv_cache_config"] "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
if "event_buffer_max_size" not in kv_cache_config: }
kv_cache_config[ elif isinstance(current_kv_config, dict):
if "event_buffer_max_size" not in current_kv_config:
current_kv_config[
"event_buffer_max_size" "event_buffer_max_size"
] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
arg_map["kv_cache_config"] = kv_cache_config
# Only pytorch backend is supported for now to publish events and metrics. # Only pytorch backend is supported for now to publish events and metrics.
if "backend" not in arg_map: if "backend" not in arg_map:
...@@ -273,6 +279,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -273,6 +279,7 @@ async def init(runtime: DistributedRuntime, config: Config):
# We need to initialize the tokenizer for the test logits processor # We need to initialize the tokenizer for the test logits processor
# But detokenizing still happens in the rust engine, so we do _not_ want # But detokenizing still happens in the rust engine, so we do _not_ want
# to set default_sampling_params.detokenize to True. # to set default_sampling_params.detokenize to True.
# This overrides the skip_tokenizer_init=True set earlier
engine_args["skip_tokenizer_init"] = False engine_args["skip_tokenizer_init"] = False
if modality == "multimodal": if modality == "multimodal":
...@@ -336,6 +343,31 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -336,6 +343,31 @@ async def init(runtime: DistributedRuntime, config: Config):
# 2. We need runtime config during registration, before any requests are made # 2. We need runtime config during registration, before any requests are made
# 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation # 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation
# Initialize TensorRT-LLM MetricsCollector and register with global REGISTRY
# This enables exposing TRT-LLM's native Prometheus metrics (request latency, TTFT, TPOT, etc.)
metrics_collector = None
if config.publish_events_and_metrics:
try:
model_name_for_metrics = config.served_model_name or config.model_path
metrics_collector = MetricsCollector(
{"model_name": model_name_for_metrics, "engine_type": "trtllm"}
)
logging.info("TensorRT-LLM MetricsCollector initialized")
# Register callback to expose TRT-LLM metrics via Dynamo endpoint
# Filter out python_/process_ metrics and add trtllm: prefix to remaining metrics
register_engine_metrics_callback(
endpoint=endpoint,
registry=REGISTRY,
exclude_prefixes=["python_", "process_"],
add_prefix="trtllm:",
)
logging.info("TensorRT-LLM Prometheus metrics registered")
except Exception as e:
logging.warning(
f"Failed to initialize TensorRT-LLM Prometheus metrics: {e}"
)
# publisher will be set later if publishing is enabled. # publisher will be set later if publishing is enabled.
handler_config = RequestHandlerConfig( handler_config = RequestHandlerConfig(
component=component, component=component,
...@@ -350,6 +382,7 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -350,6 +382,7 @@ async def init(runtime: DistributedRuntime, config: Config):
multimodal_processor=multimodal_processor, multimodal_processor=multimodal_processor,
connector=connector, connector=connector,
runtime=runtime, # Pass runtime for graceful shutdown runtime=runtime, # Pass runtime for graceful shutdown
metrics_collector=metrics_collector,
) )
if next_client: if next_client:
......
...@@ -20,7 +20,7 @@ import os ...@@ -20,7 +20,7 @@ import os
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from enum import Enum from enum import Enum
from typing import AsyncGenerator, Optional, Union from typing import Any, AsyncGenerator, Optional, Union
import torch import torch
from tensorrt_llm.executor.result import GenerationResult from tensorrt_llm.executor.result import GenerationResult
...@@ -79,6 +79,7 @@ class RequestHandlerConfig: ...@@ -79,6 +79,7 @@ class RequestHandlerConfig:
runtime: Optional[ runtime: Optional[
DistributedRuntime DistributedRuntime
] = None # DistributedRuntime reference for graceful shutdown ] = None # DistributedRuntime reference for graceful shutdown
metrics_collector: Optional[Any] = None # TensorRT-LLM MetricsCollector
class HandlerBase: class HandlerBase:
...@@ -91,6 +92,7 @@ class HandlerBase: ...@@ -91,6 +92,7 @@ class HandlerBase:
self.component = config.component self.component = config.component
self.default_sampling_params = config.default_sampling_params self.default_sampling_params = config.default_sampling_params
self.publisher = config.publisher self.publisher = config.publisher
self.metrics_collector = config.metrics_collector
self.disaggregation_mode = config.disaggregation_mode self.disaggregation_mode = config.disaggregation_mode
self.disaggregation_strategy = config.disaggregation_strategy self.disaggregation_strategy = config.disaggregation_strategy
self.next_client = config.next_client self.next_client = config.next_client
...@@ -329,6 +331,17 @@ class HandlerBase: ...@@ -329,6 +331,17 @@ class HandlerBase:
"Request finished with no finish reason set - this indicates a possible bug" "Request finished with no finish reason set - this indicates a possible bug"
) )
# Log metrics to TensorRT-LLM MetricsCollector when request finishes
if (
res.finished
and self.metrics_collector
and hasattr(res, "metrics_dict")
):
try:
self.metrics_collector.log_metrics_dict(res.metrics_dict)
except Exception as e:
logging.warning(f"Failed to log TensorRT-LLM metrics: {e}")
# Yield the chunk to the client and update the token count for the next iteration. # Yield the chunk to the client and update the token count for the next iteration.
yield out yield out
num_output_tokens_so_far = next_total_toks num_output_tokens_so_far = next_total_toks
......
...@@ -7,6 +7,7 @@ import os ...@@ -7,6 +7,7 @@ import os
import signal import signal
import uvloop import uvloop
from prometheus_client import REGISTRY
from vllm.distributed.kv_events import ZmqEventPublisher from vllm.distributed.kv_events import ZmqEventPublisher
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.async_llm import AsyncLLM
...@@ -350,9 +351,9 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -350,9 +351,9 @@ async def init(runtime: DistributedRuntime, config: Config):
handler.kv_publishers = kv_publishers handler.kv_publishers = kv_publishers
if config.engine_args.disable_log_stats is False: if config.engine_args.disable_log_stats is False:
from prometheus_client import REGISTRY register_engine_metrics_callback(
endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
register_engine_metrics_callback(generate_endpoint, REGISTRY, "vllm:", "vLLM") )
if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
await register_vllm_model( await register_vllm_model(
......
# TensorRT-LLM Prometheus Metrics
This document describes how TensorRT-LLM Prometheus metrics are exposed in Dynamo, as well as where to find non-Prometheus metrics.
## Overview
When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
Additional performance metrics are available only through non-Prometheus APIs; see the "TensorRT-LLM Specific: Non-Prometheus Performance Metrics" section below.
At the time of writing, the bundled TensorRT-LLM version (1.1.0rc5) exposes **5 basic Prometheus metrics**. Note that the `trtllm:` prefix is not part of TensorRT-LLM's own metric names; it is added by Dynamo.
Dynamo runtime metrics are documented in [docs/guides/metrics.md](../../guides/metrics.md).
## Metric Reference
TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)), which includes:
- Counter and Histogram metrics
- Metric labels (e.g., `model_name`, `engine_type`, `finished_reason`) - note that TensorRT-LLM uses `model_name` instead of Dynamo's standard `model` label convention
### Current Prometheus Metrics (TensorRT-LLM 1.1.0rc5)
The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm:` prefix added by Dynamo):
- `trtllm:request_success_total` (Counter) — Count of successfully processed requests by finish reason
- Labels: `model_name`, `engine_type`, `finished_reason`
- `trtllm:e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
- Labels: `model_name`, `engine_type`
- `trtllm:request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
- Labels: `model_name`, `engine_type`
These metric names and availability are subject to change with TensorRT-LLM version updates.
## Metric Categories
TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm:`):
- Request metrics (success counts, end-to-end latency)
- Performance metrics (TTFT, TPOT, queue time)
**Note:** Metrics may change between TensorRT-LLM versions. Always inspect the `/metrics` endpoint for your version.
## Enabling Metrics in Dynamo
TensorRT-LLM Prometheus metrics are automatically exposed when running TensorRT-LLM through Dynamo with the `--publish-events-and-metrics` flag.
### Required Configuration
```bash
python -m dynamo.trtllm --model <model_name> --publish-events-and-metrics
```
### Backend Requirement
- `backend`: Must be set to `"pytorch"` for metrics collection (enforced in `components/src/dynamo/trtllm/main.py`)
- TensorRT-LLM's `MetricsCollector` integration has only been tested/validated with the PyTorch backend
## Inspecting Metrics
To see the actual metrics available in your TensorRT-LLM version:
### 1. Launch TensorRT-LLM with Metrics Enabled
```bash
# Set environment variables
export DYN_SYSTEM_ENABLED=true
export DYN_SYSTEM_PORT=8081
# Start TensorRT-LLM worker with metrics enabled
python -m dynamo.trtllm --model <model_name> --publish-events-and-metrics
# Wait for engine to initialize
```
Metrics will be available at: `http://localhost:8081/metrics`
### 2. Fetch Metrics via curl
```bash
curl http://localhost:8081/metrics | grep "^trtllm:"
```
### 3. Example Output
**Note:** The specific metrics shown below are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.
```
# HELP trtllm:request_success_total Count of successfully processed requests.
# TYPE trtllm:request_success_total counter
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
# HELP trtllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE trtllm:time_to_first_token_seconds histogram
trtllm:time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
trtllm:time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
trtllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
# HELP trtllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE trtllm:e2e_request_latency_seconds histogram
trtllm:e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
trtllm:e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
# HELP trtllm:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE trtllm:time_per_output_token_seconds histogram
trtllm:time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
trtllm:time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
# HELP trtllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request.
# TYPE trtllm:request_queue_time_seconds histogram
trtllm:request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
trtllm:request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
trtllm:request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
```
## Implementation Details
- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm:"`
- **Engine Configuration**: `return_perf_metrics` is set to `True` when `--publish-events-and-metrics` is enabled
- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)
## TensorRT-LLM Specific: Non-Prometheus Performance Metrics
TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are **not exposed to Prometheus**.
### Available via Code References:
- **RequestPerfMetrics Structure**: [tensorrt_llm/executor/result.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/executor/result.py) - KV cache, timing, speculative decoding metrics
- **Engine Statistics**: `engine.llm.get_stats_async()` - System-wide aggregate statistics
- **KV Cache Events**: `engine.llm.get_kv_cache_events_async()` - Real-time cache operations
### Example RequestPerfMetrics JSON Structure:
```json
{
"timing_metrics": {
"arrival_time": 1234567890.123,
"first_scheduled_time": 1234567890.135,
"first_token_time": 1234567890.150,
"last_token_time": 1234567890.300,
"kv_cache_size": 2048576,
"kv_cache_transfer_start": 1234567890.140,
"kv_cache_transfer_end": 1234567890.145
},
"kv_cache_metrics": {
"num_total_allocated_blocks": 100,
"num_new_allocated_blocks": 10,
"num_reused_blocks": 90,
"num_missed_blocks": 5
},
"speculative_decoding": {
"acceptance_rate": 0.85,
"total_accepted_draft_tokens": 42,
"total_draft_tokens": 50
}
}
```
**Note**: These structures are valid as of the date of this documentation but are subject to change with TensorRT-LLM version updates.
## See Also
### TensorRT-LLM Metrics
- See the "TensorRT-LLM Specific: Non-Prometheus Performance Metrics" section above for detailed performance data and source code references
### Dynamo Metrics
- **Dynamo Metrics Guide**: See [docs/guides/metrics.md](../../guides/metrics.md) for complete documentation on Dynamo runtime metrics
- **Dynamo Runtime Metrics**: Metrics prefixed with `dynamo_*` for runtime, components, endpoints, and namespaces
- Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
- Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
- Available at the same `/metrics` endpoint alongside TensorRT-LLM metrics
- **Integration Code**: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
...@@ -51,6 +51,7 @@ ...@@ -51,6 +51,7 @@
backends/trtllm/multimodal_epd.md backends/trtllm/multimodal_epd.md
backends/trtllm/gemma3_sliding_window_attention.md backends/trtllm/gemma3_sliding_window_attention.md
backends/trtllm/gpt-oss.md backends/trtllm/gpt-oss.md
backends/trtllm/prometheus.md
backends/sglang/multinode-examples.md backends/sglang/multinode-examples.md
backends/sglang/dsr1-wideep-gb200.md backends/sglang/dsr1-wideep-gb200.md
......
...@@ -493,7 +493,24 @@ class ModelRuntimeConfig: ...@@ -493,7 +493,24 @@ class ModelRuntimeConfig:
""" """
A model runtime configuration is a collection of runtime information A model runtime configuration is a collection of runtime information
""" """
...
total_kv_blocks: int | None
max_num_seqs: int | None
max_num_batched_tokens: int | None
tool_call_parser: str | None
reasoning_parser: str | None
runtime_data: dict[str, Any]
tensor_model_config: Any | None
def __init__(self) -> None: ...
def set_engine_specific(self, key: str, value: Any) -> None:
"""Set an engine-specific runtime configuration value"""
...
def get_engine_specific(self, key: str) -> Any | None:
"""Get an engine-specific runtime configuration value"""
...
class OAIChatPreprocessor: class OAIChatPreprocessor:
""" """
......
...@@ -16,6 +16,7 @@ from dynamo._core import Context as Context ...@@ -16,6 +16,7 @@ from dynamo._core import Context as Context
from dynamo._core import DistributedRuntime as DistributedRuntime from dynamo._core import DistributedRuntime as DistributedRuntime
from dynamo._core import Endpoint as Endpoint from dynamo._core import Endpoint as Endpoint
from dynamo._core import ModelDeploymentCard as ModelDeploymentCard from dynamo._core import ModelDeploymentCard as ModelDeploymentCard
from dynamo._core import Namespace as Namespace
from dynamo._core import OAIChatPreprocessor as OAIChatPreprocessor from dynamo._core import OAIChatPreprocessor as OAIChatPreprocessor
......
...@@ -297,9 +297,9 @@ impl Runtime { ...@@ -297,9 +297,9 @@ impl Runtime {
tracker.wait_for_completion().await; tracker.wait_for_completion().await;
} }
// Phase 3: Now shutdown NATS/ETCD by cancelling the main token // Phase 3: Now connections will be disconnected to NATS/ETCD by cancelling the main token
tracing::info!( tracing::info!(
"Phase 3: All graceful endpoints completed, shutting down NATS/ETCD connections" "Phase 3: All endpoints ended gracefully. Connections to NATS/ETCD will now be disconnected"
); );
main_token.cancel(); main_token.cancel();
}); });
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for Prometheus utilities."""
from unittest.mock import Mock
import pytest
from dynamo.common.utils.prometheus import get_prometheus_expfmt
pytestmark = [
pytest.mark.unit,
]
class TestGetPrometheusExpfmt:
    """Tests for ``get_prometheus_expfmt``: prefix filtering, exclusion and prefixing."""

    @staticmethod
    def _registry_with_fake_output(monkeypatch, exposition_text):
        """Return a mock registry whose exposition output is ``exposition_text``.

        ``generate_latest`` is replaced on the module under test through
        ``monkeypatch``, so pytest restores the original after each test
        without any hand-written save/restore code in the fixtures.

        Args:
            monkeypatch: pytest's built-in ``monkeypatch`` fixture.
            exposition_text: Prometheus text-format payload the fake
                ``generate_latest`` should return (encoded as UTF-8).

        Returns:
            A ``Mock`` standing in for the Prometheus registry.
        """
        import dynamo.common.utils.prometheus as prometheus_utils

        payload = exposition_text.encode("utf-8")
        monkeypatch.setattr(prometheus_utils, "generate_latest", lambda reg: payload)
        return Mock()

    @pytest.fixture
    def vllm_registry(self, monkeypatch):
        """Create a mock registry with vLLM-style metrics."""
        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 45.6
# HELP vllm:request_success_total Number of successfully finished requests
# TYPE vllm:request_success_total counter
vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B"} 5.0
vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
"""
        return self._registry_with_fake_output(monkeypatch, sample_metrics)

    @pytest.fixture
    def sglang_registry(self, monkeypatch):
        """Create a mock registry with SGLang-style metrics."""
        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 45.6
# HELP sglang:prompt_tokens_total Number of prefill tokens processed
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0
# HELP sglang:generation_tokens_total Number of generation tokens processed
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
"""
        return self._registry_with_fake_output(monkeypatch, sample_metrics)

    @pytest.fixture
    def trtllm_registry(self, monkeypatch):
        """Create a mock registry with TensorRT-LLM-style metrics (no existing prefixes)."""
        sample_metrics = """# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 123.0
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 45.6
# HELP request_latency_seconds Request latency in seconds
# TYPE request_latency_seconds histogram
request_latency_seconds_bucket{le="0.1"} 10.0
request_latency_seconds_count 25.0
# HELP num_requests_running Number of requests currently running
# TYPE num_requests_running gauge
num_requests_running 3.0
# HELP tokens_per_second Tokens generated per second
# TYPE tokens_per_second gauge
tokens_per_second 245.7
"""
        return self._registry_with_fake_output(monkeypatch, sample_metrics)

    def test_vllm_use_case(self, vllm_registry):
        """Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
        result = get_prometheus_expfmt(
            vllm_registry,
            metric_prefix_filter="vllm:",
            exclude_prefixes=["python_", "process_"],
        )

        # Should only contain vllm: metrics
        assert "vllm:request_success_total" in result
        assert "vllm:time_to_first_token_seconds" in result
        assert "# HELP vllm:request_success_total" in result

        # Should not contain excluded metrics
        assert "python_gc_objects_collected_total" not in result
        assert "process_cpu_seconds_total" not in result

        # Check specific content
        assert 'finished_reason="stop"' in result
        assert 'model_name="meta-llama/Llama-3.1-8B"' in result
        assert result.endswith("\n")

    def test_sglang_use_case(self, sglang_registry):
        """Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
        result = get_prometheus_expfmt(
            sglang_registry,
            metric_prefix_filter="sglang:",
            exclude_prefixes=["python_", "process_"],
        )

        # Should only contain sglang: metrics
        assert "sglang:prompt_tokens_total" in result
        assert "sglang:generation_tokens_total" in result
        assert "sglang:cache_hit_rate" in result
        assert "# HELP sglang:prompt_tokens_total" in result

        # Should not contain excluded metrics
        assert "python_gc_objects_collected_total" not in result
        assert "process_cpu_seconds_total" not in result

        # Check specific content
        assert 'model_name="meta-llama/Llama-3.1-8B-Instruct"' in result
        assert "8128902.0" in result  # prompt tokens value
        assert result.endswith("\n")

    def test_trtllm_use_case(self, trtllm_registry):
        """Test TensorRT-LLM use case: exclude python_/process_ and add trtllm: prefix."""
        result = get_prometheus_expfmt(
            trtllm_registry,
            exclude_prefixes=["python_", "process_"],
            add_prefix="trtllm:",
        )

        # Should not contain excluded metrics
        assert "python_gc_objects_collected_total" not in result
        assert "process_cpu_seconds_total" not in result

        # All remaining metrics should have trtllm: prefix
        assert "trtllm:request_latency_seconds" in result
        assert "trtllm:num_requests_running" in result
        assert "trtllm:tokens_per_second" in result

        # HELP/TYPE comments should have prefix
        assert "# HELP trtllm:request_latency_seconds" in result
        assert "# TYPE trtllm:num_requests_running" in result

        # Check specific content and structure preservation
        assert 'trtllm:request_latency_seconds_bucket{le="0.1"} 10.0' in result
        assert "trtllm:tokens_per_second 245.7" in result
        assert result.endswith("\n")

    def test_no_filtering_all_frameworks(self, trtllm_registry):
        """Test that without any filters, all metrics are returned."""
        result = get_prometheus_expfmt(trtllm_registry)

        # Should contain all metrics including the ones other tests exclude
        assert "python_gc_objects_collected_total" in result
        assert "process_cpu_seconds_total" in result
        assert "request_latency_seconds" in result
        assert "num_requests_running" in result
        assert result.endswith("\n")

    def test_empty_result_handling(self, trtllm_registry):
        """Test handling when all metrics are filtered out."""
        result = get_prometheus_expfmt(
            trtllm_registry,
            exclude_prefixes=["python_", "process_", "request_", "num_", "tokens_"],
        )

        # Should return empty string with newline or just newline
        assert result in ("\n", "")

    def test_error_handling(self):
        """Test error handling when registry fails."""
        # generate_latest() pulls samples through registry.collect(), so the
        # failure must be attached to that method. A side_effect set on the
        # mock itself only fires when the mock object is called directly,
        # which never happens here.
        bad_registry = Mock()
        bad_registry.collect.side_effect = Exception("Registry error")

        result = get_prometheus_expfmt(bad_registry)

        # Should return empty string on error
        assert result == ""
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Simple test for TensorRT-LLM MetricsCollector import and basic functionality.
"""
from unittest.mock import Mock
import pytest
# Module-level pytest marker: tags every test in this file with `trtllm`,
# so that they run only in the TensorRT-LLM container.
pytestmark = pytest.mark.trtllm
def test_tensorrt_llm_metrics_collector_import():
    """Test that MetricsCollector can be imported from TensorRT-LLM and constructed.

    The test is skipped when TensorRT-LLM cannot be imported. Any error raised
    while constructing the collector is a genuine failure and propagates with
    its full traceback.
    """
    import warnings

    # Guard only the import: an ImportError here means TensorRT-LLM is absent.
    # An ImportError raised later (e.g. inside the constructor) must fail the
    # test rather than be reported as a skip.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Ignore warnings during import
            from tensorrt_llm.metrics.collector import MetricsCollector
    except ImportError as e:
        pytest.skip(f"TensorRT-LLM not available: {e}")

    # Test basic initialization (only once to avoid registry conflicts)
    metrics_collector = MetricsCollector(
        {"model_name": "test-model-unique", "engine_type": "trtllm"}
    )
    assert metrics_collector is not None
    print("✅ MetricsCollector imported and initialized successfully")
def test_prometheus_registry_import():
    """Check that the default Prometheus registry is importable and not None.

    Skips when ``prometheus_client`` is not installed.
    """
    try:
        from prometheus_client import REGISTRY
    except ImportError as e:
        pytest.skip(f"Prometheus client not available: {e}")
    else:
        assert REGISTRY is not None
        print("✅ Prometheus REGISTRY imported successfully")
def test_prometheus_metrics_integration():
    """Test Prometheus metrics integration as used in main.py init() function.

    Skips when ``prometheus_client`` or the Dynamo Prometheus utilities cannot
    be imported. Registration errors propagate so pytest shows the full
    traceback instead of a flattened failure message.
    """
    import warnings

    # Guard only the imports; an ImportError raised during registration itself
    # would be a real failure and must not turn into a skip.
    try:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")  # Ignore warnings during import
            from prometheus_client import REGISTRY

            from dynamo.common.utils.prometheus import register_engine_metrics_callback
    except ImportError as e:
        pytest.skip(f"Required modules not available: {e}")

    # Mock endpoint for registration (simulating what init() does)
    mock_endpoint = Mock()

    # Test the exact call that main.py init() makes
    register_engine_metrics_callback(
        endpoint=mock_endpoint,
        registry=REGISTRY,
        exclude_prefixes=["python_", "process_"],
        add_prefix="trtllm:",
    )

    # The helper is documented to hook into
    # endpoint.metrics.register_prometheus_expfmt_callback(); verify that it
    # did so instead of only checking that nothing raised.
    mock_endpoint.metrics.register_prometheus_expfmt_callback.assert_called_once()
    print("✅ Prometheus metrics integration test passed")
if __name__ == "__main__":
    # Allow quick verification by running this file directly, without pytest's
    # runner: invoke each check in order, then report success.
    for check in (
        test_tensorrt_llm_metrics_collector_import,
        test_prometheus_registry_import,
        test_prometheus_metrics_integration,
    ):
        check()
    print("🎉 All tests passed!")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment