feat: add TensorRT-LLM Prometheus metrics support with prefixing + filtering (#3676)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: add TensorRT-LLM Prometheus metrics support with prefixing + filtering (#3676)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
f93b619a · Keiven C · GitHub · 79ad7f36 · f93b619a · f93b619a
Unverified Commit f93b619a authored Oct 22, 2025 by Keiven C Committed by GitHub Oct 22, 2025
13 changed files
--- a/components/src/dynamo/common/utils/prometheus.py
+++ b/components/src/dynamo/common/utils/prometheus.py
@@ -29,8 +29,9 @@ if TYPE_CHECKING:
 def register_engine_metrics_callback(
    endpoint: Endpoint,
    registry: "CollectorRegistry",
-    metric_prefix: str,
+    metric_prefix_filter: Optional[str] = None,
-    engine_name: str,
+    exclude_prefixes: Optional[list[str]] = None,
+    add_prefix: Optional[str] = None,
 ) -> None:
    """
    Register a callback to expose engine Prometheus metrics via Dynamo's metrics endpoint.
@@ -41,19 +42,32 @@ def register_engine_metrics_callback(
    Args:
        endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback()
        registry: Prometheus registry to collect from (e.g., REGISTRY or CollectorRegistry)
-        metric_prefix: Prefix to filter metrics (e.g., "vllm:" or "sglang:")
+        metric_prefix_filter: Prefix to filter metrics (e.g., "vllm:" or "sglang:", None for no filtering)
-        engine_name: Name of the engine for logging (e.g., "vLLM" or "SGLang")
+        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
+        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
    Example:
        from prometheus_client import REGISTRY
        register_engine_metrics_callback(
-            generate_endpoint, REGISTRY, "vllm:", "vLLM"
+            generate_endpoint, REGISTRY, metric_prefix_filter="vllm:"
+        )
+        # With filtering and prefixing for TensorRT-LLM
+        register_engine_metrics_callback(
+            generate_endpoint, REGISTRY,
+            exclude_prefixes=["python_", "process_"],
+            add_prefix="trtllm:"
        )
    """
    def get_expfmt() -> str:
        """Callback to return engine Prometheus metrics in exposition format"""
-        return get_prometheus_expfmt(registry, metric_prefix_filter=metric_prefix)
+        return get_prometheus_expfmt(
+            registry,
+            metric_prefix_filter=metric_prefix_filter,
+            exclude_prefixes=exclude_prefixes,
+            add_prefix=add_prefix,
+        )
    endpoint.metrics.register_prometheus_expfmt_callback(get_expfmt)
@@ -61,27 +75,14 @@ def register_engine_metrics_callback(
 def get_prometheus_expfmt(
    registry,
    metric_prefix_filter: Optional[str] = None,
+    exclude_prefixes: Optional[list[str]] = None,
+    add_prefix: Optional[str] = None,
 ) -> str:
    """
    Get Prometheus metrics from a registry formatted as text using the standard text encoder.
    Collects all metrics from the registry and returns them in Prometheus text exposition format.
-    Optionally filters metrics by prefix.
+    Optionally filters metrics by prefix, excludes certain prefixes, and adds a prefix.
-    Prometheus exposition format consists of:
-    - Comment lines starting with # (HELP and TYPE declarations)
-    - Metric lines with format: metric_name{label="value"} metric_value timestamp
-    Example output format:
-        # HELP vllm:request_success_total Number of successful requests
-        # TYPE vllm:request_success_total counter
-        vllm:request_success_total{model="llama2",endpoint="generate"} 150.0
-        # HELP vllm:time_to_first_token_seconds Time to first token
-        # TYPE vllm:time_to_first_token_seconds histogram
-        vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.01"} 10.0
-        vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.1"} 45.0
-        vllm:time_to_first_token_seconds_count{model="llama2"} 50.0
-        vllm:time_to_first_token_seconds_sum{model="llama2"} 2.5
    Args:
        registry: Prometheus registry to collect from.
@@ -89,34 +90,79 @@ def get_prometheus_expfmt(
                 Pass REGISTRY for vLLM single-process mode.
        metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
                             If None, returns all metrics. (default: None)
+        exclude_prefixes: List of metric name prefixes to exclude (e.g., ["python_", "process_"])
+        add_prefix: Prefix to add to remaining metrics (e.g., "trtllm:")
    Returns:
        Formatted metrics text in Prometheus exposition format. Returns empty string on error.
    Example:
-        from prometheus_client import REGISTRY
+        # Filter out python_/process_ metrics and add trtllm: prefix
-        metrics_text = get_prometheus_expfmt(REGISTRY)
+        get_prometheus_expfmt(registry, exclude_prefixes=["python_", "process_"], add_prefix="trtllm:")
-        print(metrics_text)
-        # With filter
-        vllm_metrics = get_prometheus_expfmt(REGISTRY, metric_prefix_filter="vllm:")
    """
    try:
        # Generate metrics in Prometheus text format
        metrics_text = generate_latest(registry).decode("utf-8")
-        if metric_prefix_filter:
+        if metric_prefix_filter or exclude_prefixes or add_prefix:
-            # Filter lines: keep metric lines starting with prefix and their HELP/TYPE comments
+            lines = []
-            escaped_prefix = re.escape(metric_prefix_filter)
-            pattern = rf"^(?:{escaped_prefix}|# (?:HELP|TYPE) {escaped_prefix})"
+            # Build exclude pattern for lines to skip entirely
-            filtered_lines = [
+            exclude_line_pattern = None
-                line for line in metrics_text.split("\n") if re.match(pattern, line)
+            if exclude_prefixes:
-            ]
+                escaped_prefixes = [re.escape(prefix) for prefix in exclude_prefixes]
-            result = "\n".join(filtered_lines)
+                prefixes_regex = "|".join(escaped_prefixes)
-            if result:
+                # Match lines starting with: HELP/TYPE comments OR metric lines with excluded prefixes
-                # Ensure result ends with newline
+                exclude_line_pattern = re.compile(
-                if result and not result.endswith("\n"):
+                    rf"^(# (HELP|TYPE) )?({prefixes_regex})"
-                    result += "\n"
+                )
+            # Build include pattern if needed
+            include_pattern = None
+            if metric_prefix_filter:
+                escaped_prefix = re.escape(metric_prefix_filter)
+                include_pattern = re.compile(rf"^(# (HELP|TYPE) )?{escaped_prefix}")
+            for line in metrics_text.split("\n"):
+                if not line.strip():
+                    continue
+                # Skip excluded lines entirely
+                if exclude_line_pattern and exclude_line_pattern.match(line):
+                    continue
+                # Apply include filter if specified
+                if include_pattern and not include_pattern.match(line):
+                    continue
+                # Apply prefix transformation if needed
+                if add_prefix:
+                    # Handle HELP/TYPE comments
+                    if line.startswith("# HELP ") or line.startswith("# TYPE "):
+                        match = re.match(r"^# (HELP|TYPE) (\S+)(.*)$", line)
+                        if match:
+                            comment_type, metric_name, rest = match.groups()
+                            # Remove existing prefix if present
+                            if metric_prefix_filter and metric_name.startswith(
+                                metric_prefix_filter
+                            ):
+                                metric_name = metric_name[len(metric_prefix_filter) :]
+                            new_metric_name = add_prefix + metric_name
+                            line = f"# {comment_type} {new_metric_name}{rest}"
+                    # Handle metric lines
+                    elif line and not line.startswith("#"):
+                        # Remove existing prefix if present
+                        if metric_prefix_filter and line.startswith(
+                            metric_prefix_filter
+                        ):
+                            line = line[len(metric_prefix_filter) :]
+                        line = add_prefix + line
+                lines.append(line)
+            result = "\n".join(lines)
+            if result and not result.endswith("\n"):
+                result += "\n"
            return result
        else:
            # Ensure metrics_text ends with newline

--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -94,6 +94,8 @@ async def init(runtime: DistributedRuntime, config: Config):
        )
    # publisher instantiates the metrics and kv event publishers
+    # Note that when engine.server_args.enable_metrics is True, it'll also
+    # gather internal SGLang Prometheus metrics from all worker processes.
    publisher, metrics_task, metrics_labels = await setup_sgl_metrics(
        engine, config, component, generate_endpoint
    )

--- a/components/src/dynamo/sglang/publisher.py
+++ b/components/src/dynamo/sglang/publisher.py
@@ -226,7 +226,9 @@ async def setup_sgl_metrics(
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        register_engine_metrics_callback(
-            generate_endpoint, registry, "sglang:", "SGLang"
+            endpoint=generate_endpoint,
+            registry=registry,
+            metric_prefix_filter="sglang:",
        )
    task = asyncio.create_task(publisher.run())

--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -20,6 +20,7 @@ if "TLLM_LOG_LEVEL" not in os.environ and os.getenv(
    tllm_level = map_dyn_log_to_tllm_level(dyn_log)
    os.environ["TLLM_LOG_LEVEL"] = tllm_level
 import uvloop
+from prometheus_client import REGISTRY
 from tensorrt_llm.llmapi import (
    BuildConfig,
    CapacitySchedulerPolicy,
@@ -30,11 +31,13 @@ from tensorrt_llm.llmapi import (
 from tensorrt_llm.llmapi.llm import SamplingParams
 from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_options
 from tensorrt_llm.llmapi.tokenizer import tokenizer_factory
+from tensorrt_llm.metrics import MetricsCollector
 from torch.cuda import device_count
 from transformers import AutoConfig
 import dynamo.nixl_connect as nixl_connect
 from dynamo.common.config_dump import dump_config
+from dynamo.common.utils.prometheus import register_engine_metrics_callback
 from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -218,6 +221,7 @@ async def init(runtime: DistributedRuntime, config: Config):
        "max_seq_len": config.max_seq_len,
        "max_beam_width": config.max_beam_width,
        "max_batch_size": config.max_batch_size,
+        "return_perf_metrics": config.publish_events_and_metrics,
    }
    if config.extra_engine_args != "":
@@ -234,19 +238,21 @@ async def init(runtime: DistributedRuntime, config: Config):
        except json.JSONDecodeError as e:
            logging.error(f"Failed to parse override_engine_args as JSON: {e}")
            sys.exit(1)
    if config.publish_events_and_metrics:
        # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
-        kv_cache_config = None
+        # Convert KvCacheConfig object to dict and add the parameter
-        if "kv_cache_config" not in arg_map:
+        current_kv_config = arg_map["kv_cache_config"]
-            kv_cache_config = {}
+        if isinstance(current_kv_config, KvCacheConfig):
-            kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
+            arg_map["kv_cache_config"] = {
-        else:
+                "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
-            kv_cache_config = arg_map["kv_cache_config"]
+                "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
-            if "event_buffer_max_size" not in kv_cache_config:
+            }
-                kv_cache_config[
+        elif isinstance(current_kv_config, dict):
+            if "event_buffer_max_size" not in current_kv_config:
+                current_kv_config[
                    "event_buffer_max_size"
                ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
-        arg_map["kv_cache_config"] = kv_cache_config
        # Only pytorch backend is supported for now to publish events and metrics.
        if "backend" not in arg_map:
@@ -273,6 +279,7 @@ async def init(runtime: DistributedRuntime, config: Config):
        # We need to initialize the tokenizer for the test logits processor
        # But detokenizing still happens in the rust engine, so we do _not_ want
        # to set default_sampling_params.detokenize to True.
+        # This overrides the skip_tokenizer_init=True set earlier
        engine_args["skip_tokenizer_init"] = False
    if modality == "multimodal":
@@ -336,6 +343,31 @@ async def init(runtime: DistributedRuntime, config: Config):
        # 2. We need runtime config during registration, before any requests are made
        # 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation
+        # Initialize TensorRT-LLM MetricsCollector and register with global REGISTRY
+        # This enables exposing TRT-LLM's native Prometheus metrics (request latency, TTFT, TPOT, etc.)
+        metrics_collector = None
+        if config.publish_events_and_metrics:
+            try:
+                model_name_for_metrics = config.served_model_name or config.model_path
+                metrics_collector = MetricsCollector(
+                    {"model_name": model_name_for_metrics, "engine_type": "trtllm"}
+                )
+                logging.info("TensorRT-LLM MetricsCollector initialized")
+                # Register callback to expose TRT-LLM metrics via Dynamo endpoint
+                # Filter out python_/process_ metrics and add trtllm: prefix to remaining metrics
+                register_engine_metrics_callback(
+                    endpoint=endpoint,
+                    registry=REGISTRY,
+                    exclude_prefixes=["python_", "process_"],
+                    add_prefix="trtllm:",
+                )
+                logging.info("TensorRT-LLM Prometheus metrics registered")
+            except Exception as e:
+                logging.warning(
+                    f"Failed to initialize TensorRT-LLM Prometheus metrics: {e}"
+                )
        # publisher will be set later if publishing is enabled.
        handler_config = RequestHandlerConfig(
            component=component,
@@ -350,6 +382,7 @@ async def init(runtime: DistributedRuntime, config: Config):
            multimodal_processor=multimodal_processor,
            connector=connector,
            runtime=runtime,  # Pass runtime for graceful shutdown
+            metrics_collector=metrics_collector,
        )
        if next_client:

--- a/components/src/dynamo/trtllm/request_handlers/handler_base.py
+++ b/components/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -20,7 +20,7 @@ import os
 from contextlib import asynccontextmanager
 from dataclasses import asdict, dataclass
 from enum import Enum
-from typing import AsyncGenerator, Optional, Union
+from typing import Any, AsyncGenerator, Optional, Union
 import torch
 from tensorrt_llm.executor.result import GenerationResult
@@ -79,6 +79,7 @@ class RequestHandlerConfig:
    runtime: Optional[
        DistributedRuntime
    ] = None  # DistributedRuntime reference for graceful shutdown
+    metrics_collector: Optional[Any] = None  # TensorRT-LLM MetricsCollector
 class HandlerBase:
@@ -91,6 +92,7 @@ class HandlerBase:
        self.component = config.component
        self.default_sampling_params = config.default_sampling_params
        self.publisher = config.publisher
+        self.metrics_collector = config.metrics_collector
        self.disaggregation_mode = config.disaggregation_mode
        self.disaggregation_strategy = config.disaggregation_strategy
        self.next_client = config.next_client
@@ -329,6 +331,17 @@ class HandlerBase:
                            "Request finished with no finish reason set - this indicates a possible bug"
                        )
+                    # Log metrics to TensorRT-LLM MetricsCollector when request finishes
+                    if (
+                        res.finished
+                        and self.metrics_collector
+                        and hasattr(res, "metrics_dict")
+                    ):
+                        try:
+                            self.metrics_collector.log_metrics_dict(res.metrics_dict)
+                        except Exception as e:
+                            logging.warning(f"Failed to log TensorRT-LLM metrics: {e}")
                    # Yield the chunk to the client and update the token count for the next iteration.
                    yield out
                    num_output_tokens_so_far = next_total_toks

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -7,6 +7,7 @@ import os
 import signal
 import uvloop
+from prometheus_client import REGISTRY
 from vllm.distributed.kv_events import ZmqEventPublisher
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine.async_llm import AsyncLLM
@@ -350,9 +351,9 @@ async def init(runtime: DistributedRuntime, config: Config):
        handler.kv_publishers = kv_publishers
    if config.engine_args.disable_log_stats is False:
-        from prometheus_client import REGISTRY
+        register_engine_metrics_callback(
+            endpoint=generate_endpoint, registry=REGISTRY, metric_prefix_filter="vllm:"
-        register_engine_metrics_callback(generate_endpoint, REGISTRY, "vllm:", "vLLM")
+        )
    if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register
        await register_vllm_model(

--- a/docs/backends/trtllm/prometheus.md
+++ b/docs/backends/trtllm/prometheus.md
+# TensorRT-LLM Prometheus Metrics
+This document describes how TensorRT-LLM Prometheus metrics are exposed in Dynamo, as well as where to find non-Prometheus metrics.
+## Overview
+When running TensorRT-LLM through Dynamo, TensorRT-LLM's Prometheus metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both TensorRT-LLM engine metrics (prefixed with `trtllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.
+Additional performance metrics are available via non-Prometheus APIs; see the "TensorRT-LLM Specific: Non-Prometheus Performance Metrics" section below.
+At the time of writing, the included TensorRT-LLM version (1.1.0rc5) exposes **5 basic Prometheus metrics**. The `trtllm:` prefix is added by Dynamo, not by TensorRT-LLM.
+Dynamo runtime metrics are documented in [docs/guides/metrics.md](../../guides/metrics.md).
+## Metric Reference
+TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)), which includes:
+- Counter and Histogram metrics
+- Metric labels (e.g., `model_name`, `engine_type`, `finished_reason`). Note that TensorRT-LLM uses the `model_name` label rather than Dynamo's standard `model` label.
+### Current Prometheus Metrics (TensorRT-LLM 1.1.0rc5)
+The following metrics are exposed via Dynamo's `/metrics` endpoint (with the `trtllm:` prefix added by Dynamo):
+- `trtllm:request_success_total` (Counter) — Count of successfully processed requests by finish reason
+  - Labels: `model_name`, `engine_type`, `finished_reason`
+- `trtllm:e2e_request_latency_seconds` (Histogram) — End-to-end request latency (seconds)
+  - Labels: `model_name`, `engine_type`
+- `trtllm:time_to_first_token_seconds` (Histogram) — Time to first token, TTFT (seconds)
+  - Labels: `model_name`, `engine_type`
+- `trtllm:time_per_output_token_seconds` (Histogram) — Time per output token, TPOT (seconds)
+  - Labels: `model_name`, `engine_type`
+- `trtllm:request_queue_time_seconds` (Histogram) — Time a request spends waiting in the queue (seconds)
+  - Labels: `model_name`, `engine_type`
+These metric names and availability are subject to change with TensorRT-LLM version updates.
+## Metric Categories
+TensorRT-LLM provides metrics in the following categories (all prefixed with `trtllm:`):
+- Request metrics (latency, throughput)
+- Performance metrics (TTFT, TPOT, queue time)
+**Note:** Metrics may change between TensorRT-LLM versions. Always inspect the `/metrics` endpoint for your version.
+## Enabling Metrics in Dynamo
+TensorRT-LLM Prometheus metrics are automatically exposed when running TensorRT-LLM through Dynamo with the `--publish-events-and-metrics` flag.
+### Required Configuration
+```bash
+python -m dynamo.trtllm --model <model_name> --publish-events-and-metrics
+```
+### Backend Requirement
+- `backend`: Must be set to `"pytorch"` for metrics collection (enforced in `components/src/dynamo/trtllm/main.py`)
+- TensorRT-LLM's `MetricsCollector` integration has only been tested/validated with the PyTorch backend
+## Inspecting Metrics
+To see the actual metrics available in your TensorRT-LLM version:
+### 1. Launch TensorRT-LLM with Metrics Enabled
+```bash
+# Set environment variables
+export DYN_SYSTEM_ENABLED=true
+export DYN_SYSTEM_PORT=8081
+# Start TensorRT-LLM worker with metrics enabled
+python -m dynamo.trtllm --model <model_name> --publish-events-and-metrics
+# Wait for engine to initialize
+```
+Metrics will be available at: `http://localhost:8081/metrics`
+### 2. Fetch Metrics via curl
+```bash
+curl http://localhost:8081/metrics | grep "^trtllm:"
+```
+### 3. Example Output
+**Note:** The specific metrics shown below are examples and may vary depending on your TensorRT-LLM version. Always inspect your actual `/metrics` endpoint for the current list.
+```
+# HELP trtllm:request_success_total Count of successfully processed requests.
+# TYPE trtllm:request_success_total counter
+trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="stop"} 150.0
+trtllm:request_success_total{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm",finished_reason="length"} 5.0
+# HELP trtllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE trtllm:time_to_first_token_seconds histogram
+trtllm:time_to_first_token_seconds_bucket{le="0.01",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 0.0
+trtllm:time_to_first_token_seconds_bucket{le="0.05",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.0
+trtllm:time_to_first_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm:time_to_first_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 8.75
+# HELP trtllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE trtllm:e2e_request_latency_seconds histogram
+trtllm:e2e_request_latency_seconds_bucket{le="0.5",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 25.0
+trtllm:e2e_request_latency_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm:e2e_request_latency_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 45.2
+# HELP trtllm:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE trtllm:time_per_output_token_seconds histogram
+trtllm:time_per_output_token_seconds_bucket{le="0.1",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 120.0
+trtllm:time_per_output_token_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm:time_per_output_token_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 12.5
+# HELP trtllm:request_queue_time_seconds Histogram of time spent in WAITING phase for request.
+# TYPE trtllm:request_queue_time_seconds histogram
+trtllm:request_queue_time_seconds_bucket{le="1.0",model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 140.0
+trtllm:request_queue_time_seconds_count{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 150.0
+trtllm:request_queue_time_seconds_sum{model_name="Qwen/Qwen3-0.6B",engine_type="trtllm"} 32.1
+```
+## Implementation Details
+- **Prometheus Integration**: Uses the `MetricsCollector` class from `tensorrt_llm.metrics` (see [collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py))
+- **Dynamo Integration**: Uses `register_engine_metrics_callback()` function with `add_prefix="trtllm:"`
+- **Engine Configuration**: `return_perf_metrics` set to `True` when `--publish-events-and-metrics` is enabled
+- **Initialization**: Metrics appear after TensorRT-LLM engine initialization completes
+- **Metadata**: `MetricsCollector` initialized with model metadata (model name, engine type)
+## TensorRT-LLM Specific: Non-Prometheus Performance Metrics
+TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are **not exposed to Prometheus**.
+### Available via Code References:
+- **RequestPerfMetrics Structure**: [tensorrt_llm/executor/result.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/executor/result.py) - KV cache, timing, speculative decoding metrics
+- **Engine Statistics**: `engine.llm.get_stats_async()` - System-wide aggregate statistics
+- **KV Cache Events**: `engine.llm.get_kv_cache_events_async()` - Real-time cache operations
+### Example RequestPerfMetrics JSON Structure:
+```json
+{
+  "timing_metrics": {
+    "arrival_time": 1234567890.123,
+    "first_scheduled_time": 1234567890.135,
+    "first_token_time": 1234567890.150,
+    "last_token_time": 1234567890.300,
+    "kv_cache_size": 2048576,
+    "kv_cache_transfer_start": 1234567890.140,
+    "kv_cache_transfer_end": 1234567890.145
+  },
+  "kv_cache_metrics": {
+    "num_total_allocated_blocks": 100,
+    "num_new_allocated_blocks": 10,
+    "num_reused_blocks": 90,
+    "num_missed_blocks": 5
+  },
+  "speculative_decoding": {
+    "acceptance_rate": 0.85,
+    "total_accepted_draft_tokens": 42,
+    "total_draft_tokens": 50
+  }
+}
+```
+**Note**: These structures are valid as of the date of this documentation but are subject to change with TensorRT-LLM version updates.
+## See Also
+### TensorRT-LLM Metrics
+- See the "TensorRT-LLM Specific: Non-Prometheus Performance Metrics" section above for detailed performance data and source code references
+### Dynamo Metrics
+- **Dynamo Metrics Guide**: See [docs/guides/metrics.md](../../guides/metrics.md) for complete documentation on Dynamo runtime metrics
+- **Dynamo Runtime Metrics**: Metrics prefixed with `dynamo_*` for runtime, components, endpoints, and namespaces
+  - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
+  - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
+  - Available at the same `/metrics` endpoint alongside TensorRT-LLM metrics
+- **Integration Code**: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
--- a/docs/hidden_toctree.rst
+++ b/docs/hidden_toctree.rst
@@ -51,6 +51,7 @@
   backends/trtllm/multimodal_epd.md
   backends/trtllm/gemma3_sliding_window_attention.md
   backends/trtllm/gpt-oss.md
+   backends/trtllm/prometheus.md
   backends/sglang/multinode-examples.md
   backends/sglang/dsr1-wideep-gb200.md

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -493,7 +493,24 @@ class ModelRuntimeConfig:
    """
    A model runtime configuration is a collection of runtime information
    """
-    ...
+    total_kv_blocks: int | None
+    max_num_seqs: int | None
+    max_num_batched_tokens: int | None
+    tool_call_parser: str | None
+    reasoning_parser: str | None
+    runtime_data: dict[str, Any]
+    tensor_model_config: Any | None
+    def __init__(self) -> None: ...
+    def set_engine_specific(self, key: str, value: Any) -> None:
+        """Set an engine-specific runtime configuration value"""
+        ...
+    def get_engine_specific(self, key: str) -> Any | None:
+        """Get an engine-specific runtime configuration value"""
+        ...
 class OAIChatPreprocessor:
    """

--- a/lib/bindings/python/src/dynamo/runtime/__init__.py
+++ b/lib/bindings/python/src/dynamo/runtime/__init__.py
@@ -16,6 +16,7 @@ from dynamo._core import Context as Context
 from dynamo._core import DistributedRuntime as DistributedRuntime
 from dynamo._core import Endpoint as Endpoint
 from dynamo._core import ModelDeploymentCard as ModelDeploymentCard
+from dynamo._core import Namespace as Namespace
 from dynamo._core import OAIChatPreprocessor as OAIChatPreprocessor

--- a/lib/runtime/src/runtime.rs
+++ b/lib/runtime/src/runtime.rs
@@ -297,9 +297,9 @@ impl Runtime {
                tracker.wait_for_completion().await;
            }
-            // Phase 3: Now shutdown NATS/ETCD by cancelling the main token
+            // Phase 3: Now disconnect from NATS/ETCD by cancelling the main token
            tracing::info!(
-                "Phase 3: All graceful endpoints completed, shutting down NATS/ETCD connections"
+                "Phase 3: All endpoints ended gracefully. Connections to NATS/ETCD will now be disconnected"
            );
            main_token.cancel();
        });

--- a/tests/unit/test_prometheus_utils.py
+++ b/tests/unit/test_prometheus_utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Unit tests for Prometheus utilities."""
+from unittest.mock import Mock
+import pytest
+from dynamo.common.utils.prometheus import get_prometheus_expfmt
+# Apply the ``unit`` marker to every test in this module.
+pytestmark = [
+    pytest.mark.unit,
+]
+class TestGetPrometheusExpfmt:
+    """Test class for get_prometheus_expfmt function."""
+    @pytest.fixture
+    def vllm_registry(self):
+        """Yield a mock registry while generate_latest is stubbed with vLLM-style output.
+        The ``generate_latest`` attribute of ``dynamo.common.utils.prometheus`` is
+        replaced by a stub that returns canned exposition text, and the original
+        is put back after the test has used the fixture.
+        """
+        import dynamo.common.utils.prometheus as prometheus_utils
+        exposition_text = """# HELP python_gc_objects_collected_total Objects collected during gc
+# TYPE python_gc_objects_collected_total counter
+python_gc_objects_collected_total{generation="0"} 123.0
+# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
+# TYPE process_cpu_seconds_total counter
+process_cpu_seconds_total 45.6
+# HELP vllm:request_success_total Number of successfully finished requests
+# TYPE vllm:request_success_total counter
+vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B"} 5.0
+vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
+"""
+        canned_payload = exposition_text.encode("utf-8")
+        saved_generate_latest = prometheus_utils.generate_latest
+        # The stub ignores the registry argument and always returns the canned bytes.
+        prometheus_utils.generate_latest = lambda reg: canned_payload
+        yield Mock()
+        prometheus_utils.generate_latest = saved_generate_latest
+    @pytest.fixture
+    def sglang_registry(self):
+        """Yield a mock registry while generate_latest is stubbed with SGLang-style output.
+        The ``generate_latest`` attribute of ``dynamo.common.utils.prometheus`` is
+        replaced by a stub that returns canned exposition text, and the original
+        is put back after the test has used the fixture.
+        """
+        import dynamo.common.utils.prometheus as prometheus_utils
+        exposition_text = """# HELP python_gc_objects_collected_total Objects collected during gc
+# TYPE python_gc_objects_collected_total counter
+python_gc_objects_collected_total{generation="0"} 123.0
+# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
+# TYPE process_cpu_seconds_total counter
+process_cpu_seconds_total 45.6
+# HELP sglang:prompt_tokens_total Number of prefill tokens processed
+# TYPE sglang:prompt_tokens_total counter
+sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0
+# HELP sglang:generation_tokens_total Number of generation tokens processed
+# TYPE sglang:generation_tokens_total counter
+sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0
+# HELP sglang:cache_hit_rate The cache hit rate
+# TYPE sglang:cache_hit_rate gauge
+sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
+"""
+        canned_payload = exposition_text.encode("utf-8")
+        saved_generate_latest = prometheus_utils.generate_latest
+        # The stub ignores the registry argument and always returns the canned bytes.
+        prometheus_utils.generate_latest = lambda reg: canned_payload
+        yield Mock()
+        prometheus_utils.generate_latest = saved_generate_latest
+    @pytest.fixture
+    def trtllm_registry(self):
+        """Yield a mock registry while generate_latest is stubbed with TensorRT-LLM-style output.
+        The canned metric names carry no engine prefix. The ``generate_latest``
+        attribute of ``dynamo.common.utils.prometheus`` is replaced by a stub
+        returning that text, and the original is put back after the test has
+        used the fixture.
+        """
+        import dynamo.common.utils.prometheus as prometheus_utils
+        exposition_text = """# HELP python_gc_objects_collected_total Objects collected during gc
+# TYPE python_gc_objects_collected_total counter
+python_gc_objects_collected_total{generation="0"} 123.0
+# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds
+# TYPE process_cpu_seconds_total counter
+process_cpu_seconds_total 45.6
+# HELP request_latency_seconds Request latency in seconds
+# TYPE request_latency_seconds histogram
+request_latency_seconds_bucket{le="0.1"} 10.0
+request_latency_seconds_count 25.0
+# HELP num_requests_running Number of requests currently running
+# TYPE num_requests_running gauge
+num_requests_running 3.0
+# HELP tokens_per_second Tokens generated per second
+# TYPE tokens_per_second gauge
+tokens_per_second 245.7
+"""
+        canned_payload = exposition_text.encode("utf-8")
+        saved_generate_latest = prometheus_utils.generate_latest
+        # The stub ignores the registry argument and always returns the canned bytes.
+        prometheus_utils.generate_latest = lambda reg: canned_payload
+        yield Mock()
+        prometheus_utils.generate_latest = saved_generate_latest
+    def test_vllm_use_case(self, vllm_registry):
+        """vLLM scenario: keep only ``vllm:`` metrics while dropping python_/process_ ones."""
+        exposition = get_prometheus_expfmt(
+            vllm_registry,
+            exclude_prefixes=["python_", "process_"],
+            metric_prefix_filter="vllm:",
+        )
+        # vllm: series, their HELP comment, and their labels must survive the filter
+        must_contain = (
+            "vllm:request_success_total",
+            "vllm:time_to_first_token_seconds",
+            "# HELP vllm:request_success_total",
+            'finished_reason="stop"',
+            'model_name="meta-llama/Llama-3.1-8B"',
+        )
+        for fragment in must_contain:
+            assert fragment in exposition
+        # Metrics matched by exclude_prefixes must be gone
+        for fragment in ("python_gc_objects_collected_total", "process_cpu_seconds_total"):
+            assert fragment not in exposition
+        assert exposition.endswith("\n")
+    def test_sglang_use_case(self, sglang_registry):
+        """SGLang scenario: keep only ``sglang:`` metrics while dropping python_/process_ ones."""
+        exposition = get_prometheus_expfmt(
+            sglang_registry,
+            exclude_prefixes=["python_", "process_"],
+            metric_prefix_filter="sglang:",
+        )
+        # sglang: series, their HELP comment, labels, and sample values must survive
+        must_contain = (
+            "sglang:prompt_tokens_total",
+            "sglang:generation_tokens_total",
+            "sglang:cache_hit_rate",
+            "# HELP sglang:prompt_tokens_total",
+            'model_name="meta-llama/Llama-3.1-8B-Instruct"',
+            "8128902.0",  # prompt tokens value
+        )
+        for fragment in must_contain:
+            assert fragment in exposition
+        # Metrics matched by exclude_prefixes must be gone
+        for fragment in ("python_gc_objects_collected_total", "process_cpu_seconds_total"):
+            assert fragment not in exposition
+        assert exposition.endswith("\n")
+    def test_trtllm_use_case(self, trtllm_registry):
+        """TensorRT-LLM scenario: drop python_/process_ metrics and prepend ``trtllm:``."""
+        exposition = get_prometheus_expfmt(
+            trtllm_registry,
+            add_prefix="trtllm:",
+            exclude_prefixes=["python_", "process_"],
+        )
+        # Metrics matched by exclude_prefixes must be gone
+        for fragment in ("python_gc_objects_collected_total", "process_cpu_seconds_total"):
+            assert fragment not in exposition
+        must_contain = (
+            # Every remaining metric name carries the trtllm: prefix
+            "trtllm:request_latency_seconds",
+            "trtllm:num_requests_running",
+            "trtllm:tokens_per_second",
+            # HELP/TYPE comment lines are prefixed as well
+            "# HELP trtllm:request_latency_seconds",
+            "# TYPE trtllm:num_requests_running",
+            # Labels and sample values are preserved unchanged
+            'trtllm:request_latency_seconds_bucket{le="0.1"} 10.0',
+            "trtllm:tokens_per_second 245.7",
+        )
+        for fragment in must_contain:
+            assert fragment in exposition
+        assert exposition.endswith("\n")
+    def test_no_filtering_all_frameworks(self, trtllm_registry):
+        """With no filter, exclusion, or prefix arguments, every metric is passed through."""
+        exposition = get_prometheus_expfmt(trtllm_registry)
+        # Both the python_/process_ metrics and the engine metrics must be present
+        passthrough_names = (
+            "python_gc_objects_collected_total",
+            "process_cpu_seconds_total",
+            "request_latency_seconds",
+            "num_requests_running",
+        )
+        for metric_name in passthrough_names:
+            assert metric_name in exposition
+        assert exposition.endswith("\n")
+    def test_empty_result_handling(self, trtllm_registry):
+        """Excluding every metric prefix in the sample leaves nothing to report."""
+        every_prefix = ["python_", "process_", "request_", "num_", "tokens_"]
+        exposition = get_prometheus_expfmt(
+            trtllm_registry,
+            exclude_prefixes=every_prefix,
+        )
+        # Either an empty string or a lone newline is accepted
+        assert exposition in ("\n", "")
+    def test_error_handling(self):
+        """Test that a failure while collecting from the registry yields an empty string."""
+        # prometheus_client's generate_latest() reads metrics via
+        # registry.collect(), so the failure is injected there. A side_effect
+        # set on the mock itself would only fire if the registry object were
+        # called directly.
+        bad_registry = Mock()
+        bad_registry.collect.side_effect = Exception("Registry error")
+        result = get_prometheus_expfmt(bad_registry)
+        # Should return empty string on error
+        assert result == ""
--- a/tests/unit/test_trtllm_main_init.py
+++ b/tests/unit/test_trtllm_main_init.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""
+Simple test for TensorRT-LLM MetricsCollector import and basic functionality.
+"""
+from unittest.mock import Mock
+import pytest
+# Mark all tests in this module to run only in TensorRT-LLM container
+pytestmark = pytest.mark.trtllm
+def test_tensorrt_llm_metrics_collector_import():
+    """Test that MetricsCollector can be imported from TensorRT-LLM and constructed.
+    The test is skipped only when the import itself fails. Any error raised
+    while constructing the collector is reported as a test failure.
+    """
+    import warnings
+    # Keep the skip guard around the import alone, so that an ImportError raised
+    # later during construction is not mistaken for "TensorRT-LLM not installed".
+    try:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # Ignore warnings during import
+            from tensorrt_llm.metrics.collector import MetricsCollector
+    except ImportError as e:
+        pytest.skip(f"TensorRT-LLM not available: {e}")
+    # Test basic initialization (only once to avoid registry conflicts)
+    try:
+        metrics_collector = MetricsCollector(
+            {"model_name": "test-model-unique", "engine_type": "trtllm"}
+        )
+    except Exception as e:
+        pytest.fail(f"Failed to initialize MetricsCollector: {e}")
+    assert metrics_collector is not None
+    print("✅ MetricsCollector imported and initialized successfully")
+def test_prometheus_registry_import():
+    """Check that the default Prometheus REGISTRY is importable; skip if the client is missing."""
+    try:
+        from prometheus_client import REGISTRY
+    except ImportError as e:
+        pytest.skip(f"Prometheus client not available: {e}")
+    else:
+        assert REGISTRY is not None
+        print("✅ Prometheus REGISTRY imported successfully")
+def test_prometheus_metrics_integration():
+    """Test Prometheus metrics integration as used in main.py init() function.
+    The test is skipped only when the required modules cannot be imported.
+    Any error raised by the registration call is reported as a test failure.
+    """
+    import warnings
+    # Keep the skip guard around the imports alone, so that an ImportError raised
+    # inside the registration call is not mistaken for a missing dependency.
+    try:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")  # Ignore warnings during import
+            from prometheus_client import REGISTRY
+            from dynamo.common.utils.prometheus import register_engine_metrics_callback
+    except ImportError as e:
+        pytest.skip(f"Required modules not available: {e}")
+    # Mock endpoint for registration (simulating what init() does)
+    mock_endpoint = Mock()
+    try:
+        # Test the exact call that main.py init() makes
+        register_engine_metrics_callback(
+            endpoint=mock_endpoint,
+            registry=REGISTRY,
+            exclude_prefixes=["python_", "process_"],
+            add_prefix="trtllm:",
+        )
+    except Exception as e:
+        pytest.fail(f"Prometheus integration test failed: {e}")
+    print("✅ Prometheus metrics integration test passed")
+if __name__ == "__main__":
+    # Allow a quick manual run without pytest: call each test in order
+    for run_check in (
+        test_tensorrt_llm_metrics_collector_import,
+        test_prometheus_registry_import,
+        test_prometheus_metrics_integration,
+    ):
+        run_check()
+    print("🎉 All tests passed!")