Unverified Commit 10081929 authored by Yifan Jiang's avatar Yifan Jiang Committed by GitHub
Browse files

feat(trtllm): add additional metrics for dynamo-trtllm (#6668)


Signed-off-by: default avatarYifan Jiang <19356972+yifjiang@users.noreply.github.com>
Co-authored-by: default avatarClaude Opus 4.6 <noreply@anthropic.com>
parent 2831bfec
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Additional Prometheus metrics for dynamo-trtllm beyond what the engine provides.
The TRT-LLM engine MetricsCollector already provides 5 core metrics:
request_success_total, e2e_request_latency_seconds,
time_to_first_token_seconds, inter_token_latency_seconds,
request_queue_time_seconds
The Rust frontend (metrics.rs) provides token counters:
input_tokens_total, output_tokens_total, cached_tokens
This module adds metrics that have no engine/runtime/frontend equivalent:
- Request types (image, structured output)
- KV transfer metrics (success counter, latency, throughput, per-request bytes)
- Abort tracking
"""
import logging
from datetime import timedelta
from prometheus_client import Counter, Histogram
from dynamo.prometheus_names import trtllm_additional as metric_names
logger = logging.getLogger(__name__)
# Histogram buckets for KV cache transfer metrics
KV_TRANSFER_LATENCY_BUCKETS = (
0.001,
0.005,
0.01,
0.025,
0.05,
0.1,
0.25,
0.5,
1.0,
2.5,
5.0,
10.0,
float("inf"),
)
KV_TRANSFER_SPEED_BUCKETS = (
0.1,
0.5,
1.0,
5.0,
10.0,
25.0,
50.0,
100.0,
250.0,
500.0,
float("inf"),
)
KV_TRANSFER_BYTES_BUCKETS = (
100_000,
500_000,
1_000_000,
5_000_000,
10_000_000,
50_000_000,
100_000_000,
500_000_000,
1_000_000_000,
5_000_000_000,
float("inf"),
)
class AdditionalMetricsCollector:
"""
Additional Prometheus metrics for dynamo-trtllm.
Only creates metrics that have no engine/runtime/frontend equivalent.
Metrics are registered in the default prometheus_client.REGISTRY.
Args:
labels: Dict with keys like model_name, disaggregation_mode, engine_type.
"""
def __init__(self, labels: dict):
self._labelnames = list(labels.keys())
self._labelvalues = list(labels.values())
# --- Abort tracking ---
self.num_aborted_requests = Counter(
metric_names.NUM_ABORTED_REQUESTS_TOTAL,
"Total number of aborted/cancelled requests",
labelnames=self._labelnames,
)
# --- Request type counters ---
self.request_type_image = Counter(
metric_names.REQUEST_TYPE_IMAGE_TOTAL,
"Total number of requests containing image content",
labelnames=self._labelnames,
)
self.request_type_structured_output = Counter(
metric_names.REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL,
"Total number of requests using guided/structured decoding",
labelnames=self._labelnames,
)
# --- KV cache transfer metrics ---
self.kv_transfer_success = Counter(
metric_names.KV_TRANSFER_SUCCESS_TOTAL,
"Total number of successful KV cache transfers",
labelnames=self._labelnames,
)
self.kv_transfer_latency = Histogram(
metric_names.KV_TRANSFER_LATENCY_SECONDS,
"KV cache transfer latency per request in seconds",
labelnames=self._labelnames,
buckets=KV_TRANSFER_LATENCY_BUCKETS,
)
self.kv_transfer_bytes = Histogram(
metric_names.KV_TRANSFER_BYTES,
"KV cache transfer size per request in bytes",
labelnames=self._labelnames,
buckets=KV_TRANSFER_BYTES_BUCKETS,
)
self.kv_transfer_speed = Histogram(
metric_names.KV_TRANSFER_SPEED_GB_S,
"KV cache transfer speed per request in GB/s",
labelnames=self._labelnames,
buckets=KV_TRANSFER_SPEED_BUCKETS,
)
logger.info("AdditionalMetricsCollector initialized")
# --- Request helpers ---
def record_request_abort(self):
"""Increment aborted requests counter."""
self.num_aborted_requests.labels(*self._labelvalues).inc()
# --- Request type tracking ---
def record_request_type_image(self):
"""Increment the image request type counter."""
self.request_type_image.labels(*self._labelvalues).inc()
def record_request_type_structured_output(self):
"""Increment the structured output request type counter."""
self.request_type_structured_output.labels(*self._labelvalues).inc()
# --- KV transfer ---
def record_kv_transfer_success(self):
"""Increment the KV transfer success counter."""
self.kv_transfer_success.labels(*self._labelvalues).inc()
def record_kv_transfer_perf(self, timing_metrics) -> bool:
"""Record KV transfer performance from RequestPerfMetrics.timing_metrics.
Extracts kv_cache_transfer_start, kv_cache_transfer_end, and kv_cache_size
from TRT-LLM's TimingMetrics and records latency, bytes, and speed.
Only records when a transfer actually occurred (non-zero transfer times).
Args:
timing_metrics: TimingMetrics object from RequestPerfMetrics.
Returns:
True if a transfer was recorded, False if skipped.
"""
transfer_start = timing_metrics.kv_cache_transfer_start
transfer_end = timing_metrics.kv_cache_transfer_end
# Only record when a transfer actually happened
if transfer_end <= timedelta(0) or transfer_start <= timedelta(0):
return False
latency_s = (transfer_end - transfer_start).total_seconds()
if latency_s <= 0:
return False
kv_bytes = timing_metrics.kv_cache_size
self.kv_transfer_latency.labels(*self._labelvalues).observe(latency_s)
self.kv_transfer_bytes.labels(*self._labelvalues).observe(kv_bytes)
if kv_bytes > 0:
speed_gb_s = kv_bytes / (latency_s * 1e9)
self.kv_transfer_speed.labels(*self._labelvalues).observe(speed_gb_s)
return True
...@@ -21,7 +21,7 @@ import re ...@@ -21,7 +21,7 @@ import re
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from dataclasses import asdict, dataclass from dataclasses import asdict, dataclass
from typing import Any, Optional, Union from typing import TYPE_CHECKING, Any, Optional, Union
import torch import torch
from tensorrt_llm.executor.result import GenerationResult from tensorrt_llm.executor.result import GenerationResult
...@@ -40,6 +40,7 @@ from dynamo.runtime.logging import configure_dynamo_logging ...@@ -40,6 +40,7 @@ from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.constants import DisaggregationMode from dynamo.trtllm.constants import DisaggregationMode
from dynamo.trtllm.engine import TensorRTLLMEngine from dynamo.trtllm.engine import TensorRTLLMEngine
from dynamo.trtllm.logits_processing.adapter import create_trtllm_adapters from dynamo.trtllm.logits_processing.adapter import create_trtllm_adapters
from dynamo.trtllm.metrics import AdditionalMetricsCollector
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
from dynamo.trtllm.publisher import Publisher from dynamo.trtllm.publisher import Publisher
from dynamo.trtllm.request_handlers.base_generative_handler import BaseGenerativeHandler from dynamo.trtllm.request_handlers.base_generative_handler import BaseGenerativeHandler
...@@ -48,6 +49,11 @@ from dynamo.trtllm.utils.disagg_utils import ( ...@@ -48,6 +49,11 @@ from dynamo.trtllm.utils.disagg_utils import (
DisaggregatedParamsCodec, DisaggregatedParamsCodec,
) )
if TYPE_CHECKING:
# tensorrt_llm may use a different version that doesn't have MetricsCollector,
# so guard this import inside TYPE_CHECKING to avoid runtime import errors.
from tensorrt_llm.metrics import MetricsCollector
configure_dynamo_logging() configure_dynamo_logging()
...@@ -69,11 +75,12 @@ class RequestHandlerConfig: ...@@ -69,11 +75,12 @@ class RequestHandlerConfig:
runtime: Optional[ runtime: Optional[
DistributedRuntime DistributedRuntime
] = None # DistributedRuntime reference for graceful shutdown ] = None # DistributedRuntime reference for graceful shutdown
metrics_collector: Optional[Any] = None # TensorRT-LLM MetricsCollector metrics_collector: Optional["MetricsCollector"] = None
kv_block_size: int = 32 kv_block_size: int = 32
shutdown_event: Optional[asyncio.Event] = None shutdown_event: Optional[asyncio.Event] = None
encoder_cache_capacity_gb: float = 0 # Encoder cache capacity in GB encoder_cache_capacity_gb: float = 0 # Encoder cache capacity in GB
disable_request_abort: bool = True disable_request_abort: bool = True
additional_metrics: Optional["AdditionalMetricsCollector"] = None
class HandlerBase(BaseGenerativeHandler): class HandlerBase(BaseGenerativeHandler):
...@@ -103,6 +110,7 @@ class HandlerBase(BaseGenerativeHandler): ...@@ -103,6 +110,7 @@ class HandlerBase(BaseGenerativeHandler):
self.kv_block_size: int = config.kv_block_size self.kv_block_size: int = config.kv_block_size
self.shutdown_event = config.shutdown_event self.shutdown_event = config.shutdown_event
self.disable_request_abort = config.disable_request_abort self.disable_request_abort = config.disable_request_abort
self.additional_metrics = config.additional_metrics
def check_error(self, result: dict) -> bool: def check_error(self, result: dict) -> bool:
""" """
...@@ -621,6 +629,36 @@ class HandlerBase(BaseGenerativeHandler): ...@@ -621,6 +629,36 @@ class HandlerBase(BaseGenerativeHandler):
""" """
logging.debug(f"Request: {request}") logging.debug(f"Request: {request}")
# Additional metrics: request type detection
metrics_collector = self.additional_metrics
if metrics_collector:
try:
# Detect request types for metrics
sampling_options = request.get("sampling_options", {})
guided = sampling_options.get("guided_decoding")
if guided and isinstance(guided, dict):
has_structured_guidance = any(
guided.get(k) is not None
for k in (
"json",
"regex",
"grammar",
"json_object",
"structural_tag",
)
) or bool(guided.get("choice"))
if has_structured_guidance:
metrics_collector.record_request_type_structured_output()
if (
request.get("multi_modal_data")
or embeddings is not None
or request.get("_epd_processed_prompt") is not None
):
metrics_collector.record_request_type_image()
except Exception as e:
logging.warning("Additional metrics (request type): %s", e)
# Normalize OpenAI format to TRT-LLM internal format # Normalize OpenAI format to TRT-LLM internal format
self._normalize_request_format(request) self._normalize_request_format(request)
...@@ -822,6 +860,29 @@ class HandlerBase(BaseGenerativeHandler): ...@@ -822,6 +860,29 @@ class HandlerBase(BaseGenerativeHandler):
"Request finished with no finish reason set - this indicates a possible bug" "Request finished with no finish reason set - this indicates a possible bug"
) )
# Record additional metrics on request finish
if res.finished and metrics_collector and out.get("finish_reason"):
try:
# KV transfer metrics from request_perf_metrics
if output.request_perf_metrics is not None:
# Record KV transfer latency/bytes/speed from timing_metrics
tm = output.request_perf_metrics.timing_metrics
if tm is not None:
recorded = (
metrics_collector.record_kv_transfer_perf(tm)
)
# Only count success if a transfer actually occurred
if (
recorded
and self.disaggregation_mode
== DisaggregationMode.PREFILL
):
metrics_collector.record_kv_transfer_success()
except Exception as e:
logging.warning(
"Additional metrics (request finish): %s", e
)
# Log metrics to TensorRT-LLM MetricsCollector when request finishes # Log metrics to TensorRT-LLM MetricsCollector when request finishes
# NOTE: TRT-LLM 1.3.0rc5 (PR #11243) renamed log_metrics_dict → log_request_metrics_dict # NOTE: TRT-LLM 1.3.0rc5 (PR #11243) renamed log_metrics_dict → log_request_metrics_dict
if ( if (
...@@ -852,6 +913,11 @@ class HandlerBase(BaseGenerativeHandler): ...@@ -852,6 +913,11 @@ class HandlerBase(BaseGenerativeHandler):
except asyncio.CancelledError: except asyncio.CancelledError:
logging.debug(f"Request {request_id}: Client cancelled") logging.debug(f"Request {request_id}: Client cancelled")
# _cancellation_monitor already called abort_request # _cancellation_monitor already called abort_request
try:
if metrics_collector:
metrics_collector.record_request_abort()
except Exception as e:
logging.debug("Additional metrics (request abort): %s", e)
return # Just stop, no error response return # Just stop, no error response
# 2. Per-request errors - send to client, don't shutdown # 2. Per-request errors - send to client, don't shutdown
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Tests for AdditionalMetricsCollector and additional metrics integration."""
import ast
import inspect
import textwrap
import unittest
from datetime import timedelta
from unittest.mock import MagicMock, patch
from prometheus_client import CollectorRegistry, generate_latest
from dynamo.trtllm.metrics import AdditionalMetricsCollector
try:
from dynamo.trtllm.request_handlers.handler_base import HandlerBase
except ImportError:
# handler_base imports torch which requires CUDA libraries at import time;
# gracefully skip on CPU-only CI runners.
HandlerBase = None
class TestAdditionalMetricsCollector(unittest.TestCase):
"""Unit tests for AdditionalMetricsCollector."""
def setUp(self):
"""Create a fresh registry and collector for each test."""
self.registry = CollectorRegistry()
# Patch prometheus_client.Counter and Histogram to use our test registry
with patch("dynamo.trtllm.metrics.Counter") as MockCounter, patch(
"dynamo.trtllm.metrics.Histogram"
) as MockHistogram:
from prometheus_client import Counter, Histogram
def make_counter(name, documentation, labelnames=None, **_kw):
return Counter(
name,
documentation,
labelnames=labelnames or [],
registry=self.registry,
)
def make_histogram(
name, documentation, labelnames=None, buckets=None, **_kw
):
kwargs = {"registry": self.registry}
if buckets is not None:
kwargs["buckets"] = buckets
return Histogram(
name, documentation, labelnames=labelnames or [], **kwargs
)
MockCounter.side_effect = make_counter
MockHistogram.side_effect = make_histogram
self.collector = AdditionalMetricsCollector(
labels={
"model_name": "test-model",
"disaggregation_mode": "prefill_and_decode",
"engine_type": "trtllm",
},
)
def _get_metric_value(self, name, _labels=None):
"""Get a metric value from the registry."""
output = generate_latest(self.registry).decode()
for line in output.splitlines():
if line.startswith("#"):
continue
if line.startswith(name):
# Extract value (last token)
parts = line.split()
if len(parts) >= 2:
return float(parts[-1])
return None
def test_abort_counter(self):
"""Test abort tracking."""
self.collector.record_request_abort()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_num_aborted_requests_total", output)
def test_request_type_counters(self):
"""Test request type counters."""
self.collector.record_request_type_image()
self.collector.record_request_type_structured_output()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_request_type_image_total", output)
self.assertIn("trtllm_request_type_structured_output_total", output)
def test_kv_transfer_success_counter(self):
"""Test KV transfer success counter."""
self.collector.record_kv_transfer_success()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_kv_transfer_success_total", output)
def test_kv_transfer_perf_metrics(self):
"""Test KV transfer latency/bytes/speed from timing_metrics."""
tm = MagicMock()
tm.kv_cache_transfer_start = timedelta(seconds=1.0)
tm.kv_cache_transfer_end = timedelta(seconds=1.05)
tm.kv_cache_size = 1_000_000_000 # 1 GB
result = self.collector.record_kv_transfer_perf(tm)
self.assertTrue(result)
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_kv_transfer_latency_seconds", output)
self.assertIn("trtllm_kv_transfer_bytes_bucket", output)
self.assertIn("trtllm_kv_transfer_speed_gb_s", output)
def test_kv_transfer_perf_skipped_when_no_transfer(self):
"""Test that KV transfer perf is not recorded when no transfer occurred."""
tm = MagicMock()
tm.kv_cache_transfer_start = timedelta(0)
tm.kv_cache_transfer_end = timedelta(0)
tm.kv_cache_size = 0
result = self.collector.record_kv_transfer_perf(tm)
self.assertFalse(result)
# Histogram is defined but should have no observations
sample = self.registry.get_sample_value(
"trtllm_kv_transfer_latency_seconds_count"
)
self.assertEqual(sample, 0.0)
def test_kv_transfer_perf_return_values(self):
"""Verify record_kv_transfer_perf returns True on record, False on skip."""
# Transfer occurred
tm_ok = MagicMock()
tm_ok.kv_cache_transfer_start = timedelta(seconds=1.0)
tm_ok.kv_cache_transfer_end = timedelta(seconds=1.05)
tm_ok.kv_cache_size = 5_000_000
self.assertTrue(self.collector.record_kv_transfer_perf(tm_ok))
# No transfer (zero times)
tm_zero = MagicMock()
tm_zero.kv_cache_transfer_start = timedelta(0)
tm_zero.kv_cache_transfer_end = timedelta(0)
tm_zero.kv_cache_size = 0
self.assertFalse(self.collector.record_kv_transfer_perf(tm_zero))
# Negative latency (end before start)
tm_neg = MagicMock()
tm_neg.kv_cache_transfer_start = timedelta(seconds=2.0)
tm_neg.kv_cache_transfer_end = timedelta(seconds=1.0)
tm_neg.kv_cache_size = 1000
self.assertFalse(self.collector.record_kv_transfer_perf(tm_neg))
def test_no_duplicate_metrics(self):
"""Test that removed duplicate metrics are not present."""
output = generate_latest(self.registry).decode()
# These metrics were removed as they duplicate frontend/runtime metrics
self.assertNotIn("prompt_tokens_total", output)
self.assertNotIn("generation_tokens_total", output)
self.assertNotIn("gen_throughput", output)
self.assertNotIn("kv_cache_hit_tokens_total", output)
self.assertNotIn("handler_time_to_first_token_seconds", output)
self.assertNotIn("handler_inter_token_latency_seconds", output)
self.assertNotIn("handler_e2e_request_latency_seconds", output)
# Phase timing metrics also removed (derivable from existing trtllm_* metrics)
self.assertNotIn("request_prefill_time_seconds", output)
self.assertNotIn("request_decode_time_seconds", output)
self.assertNotIn("request_inference_time_seconds", output)
# Config info metrics removed (overlap dynamo_frontend_model_* and
# dynamo_component_model_load_time_seconds)
self.assertNotIn("model_config_info", output)
self.assertNotIn("parallel_config_info", output)
self.assertNotIn("detailed_config_info", output)
self.assertNotIn("cache_config_info", output)
self.assertNotIn("engine_startup_time", output)
# KV transfer perf metrics are now wired from request_perf_metrics.timing_metrics
# (kv_transfer_latency_seconds, kv_transfer_bytes, kv_transfer_speed_gb_s)
@unittest.skipIf(HandlerBase is None, "HandlerBase requires CUDA/GPU libraries")
class TestHandlerBaseMetricsInstrumentation(unittest.TestCase):
"""Test metrics instrumentation in handler_base.py generate_locally()."""
def test_structured_output_detection_keys(self):
"""Verify guided decoding detection keys in generate_locally match _override_sampling_params."""
# Extract detection keys from generate_locally: the tuple in
# any(guided.get(k) for k in ("json", ...))
gen_source = textwrap.dedent(inspect.getsource(HandlerBase.generate_locally))
gen_tree = ast.parse(gen_source)
detection_keys = set()
for node in ast.walk(gen_tree):
# Find: any(guided.get(k) for k in (...))
if isinstance(node, ast.Tuple) and all(
isinstance(e, (ast.Constant, ast.Str)) for e in node.elts
):
vals = {
e.value if isinstance(e, ast.Constant) else e.s for e in node.elts
}
if "json_object" in vals: # identify the right tuple
detection_keys = vals
self.assertTrue(
detection_keys, "Could not extract detection keys from generate_locally"
)
# Extract canonical keys from _override_sampling_params:
# GuidedDecodingParams(json=..., regex=..., ...)
override_source = textwrap.dedent(
inspect.getsource(HandlerBase._override_sampling_params)
)
override_tree = ast.parse(override_source)
canonical_keys = set()
for node in ast.walk(override_tree):
if (
isinstance(node, ast.Call)
and getattr(node.func, "id", None) == "GuidedDecodingParams"
):
canonical_keys = {kw.arg for kw in node.keywords}
self.assertTrue(
canonical_keys, "Could not extract keys from GuidedDecodingParams call"
)
# Detection should cover all canonical keys
missing = canonical_keys - detection_keys
self.assertFalse(
missing,
f"Keys in GuidedDecodingParams but not in metrics detection: {missing}",
)
if __name__ == "__main__":
unittest.main()
...@@ -397,6 +397,7 @@ async def init_llm_worker( ...@@ -397,6 +397,7 @@ async def init_llm_worker(
# Initialize TensorRT-LLM MetricsCollector and register with global REGISTRY # Initialize TensorRT-LLM MetricsCollector and register with global REGISTRY
# This enables exposing TRT-LLM's native Prometheus metrics (request latency, TTFT, TPOT, etc.) # This enables exposing TRT-LLM's native Prometheus metrics (request latency, TTFT, TPOT, etc.)
metrics_collector = None metrics_collector = None
additional_metrics = None
if config.publish_events_and_metrics: if config.publish_events_and_metrics:
try: try:
model_name_for_metrics = config.served_model_name or config.model model_name_for_metrics = config.served_model_name or config.model
...@@ -405,18 +406,47 @@ async def init_llm_worker( ...@@ -405,18 +406,47 @@ async def init_llm_worker(
) )
logging.info("TensorRT-LLM MetricsCollector initialized") logging.info("TensorRT-LLM MetricsCollector initialized")
# Register TRT-LLM metrics (TRT-LLM natively outputs trtllm_* metrics after traffic) # Prefix filter: all TRT-LLM metrics (engine + additional) use "trtllm_" prefix
# Auto-label injection: hierarchy labels are added automatically _metric_prefixes = ["trtllm_"]
# Additional metrics (abort tracking, request types, KV transfer perf).
# Wrapped in try/except because AdditionalMetricsCollector depends on
# prometheus_names which may not be available in all packaging variants.
try:
from dynamo.trtllm.metrics import AdditionalMetricsCollector
disagg_mode_str = (
config.disaggregation_mode.value
if hasattr(config.disaggregation_mode, "value")
else str(config.disaggregation_mode)
)
additional_metrics = AdditionalMetricsCollector(
labels={
"model_name": model_name_for_metrics,
"disaggregation_mode": disagg_mode_str,
"engine_type": "trtllm",
},
)
logging.info(
"Additional metrics initialized (disagg_mode=%s)",
disagg_mode_str,
)
except Exception as e:
logging.warning("Failed to initialize additional metrics: %s", e)
# Single callback for all Python-side metrics (trtllm_ + additional)
register_engine_metrics_callback( register_engine_metrics_callback(
endpoint=endpoint, endpoint=endpoint,
registry=REGISTRY, registry=REGISTRY,
metric_prefix_filters=["trtllm_"], metric_prefix_filters=_metric_prefixes,
namespace_name=config.namespace, namespace_name=config.namespace,
component_name=config.component, component_name=config.component,
endpoint_name="generate", endpoint_name="generate",
model_name=model_name_for_metrics, model_name=model_name_for_metrics,
) )
logging.info("TensorRT-LLM Prometheus metrics registered") logging.info(
"Prometheus metrics registered (prefixes: %s)", _metric_prefixes
)
except Exception as e: except Exception as e:
logging.warning( logging.warning(
f"Failed to initialize TensorRT-LLM Prometheus metrics: {e}" f"Failed to initialize TensorRT-LLM Prometheus metrics: {e}"
...@@ -444,6 +474,7 @@ async def init_llm_worker( ...@@ -444,6 +474,7 @@ async def init_llm_worker(
shutdown_event=shutdown_event, shutdown_event=shutdown_event,
encoder_cache_capacity_gb=config.multimodal_embedding_cache_capacity_gb, encoder_cache_capacity_gb=config.multimodal_embedding_cache_capacity_gb,
disable_request_abort=config.disable_request_abort, disable_request_abort=config.disable_request_abort,
additional_metrics=additional_metrics,
) )
# Register the model with runtime config # Register the model with runtime config
......
...@@ -134,6 +134,38 @@ These metric names and availability are subject to change with TensorRT-LLM vers ...@@ -134,6 +134,38 @@ These metric names and availability are subject to change with TensorRT-LLM vers
TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)). TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)).
### Additional Operational Metrics
Dynamo adds the following operational metrics for TensorRT-LLM workers. These complement the engine's native metrics above with request-level observability that the engine does not provide. All metrics use the `trtllm_` prefix and are automatically enabled when `--publish-events-and-metrics` is set.
Metric name constants are defined in `lib/runtime/src/metrics/prometheus_names.rs` (`trtllm_additional` module).
#### Request Type Tracking
- `trtllm_request_type_image_total` (Counter) — Total number of requests containing image/multimodal content
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_request_type_structured_output_total` (Counter) — Total number of requests using guided/structured decoding (JSON, regex, grammar, etc.)
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
#### Abort Tracking
- `trtllm_num_aborted_requests_total` (Counter) — Total number of aborted/cancelled requests
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
#### KV Cache Transfer Metrics (Disaggregated Deployments)
These metrics are only recorded in disaggregated (prefill + decode) deployments when a KV cache transfer actually occurs. They are sourced from TensorRT-LLM's `RequestPerfMetrics.timing_metrics`.
- `trtllm_kv_transfer_success_total` (Counter) — Total number of successful KV cache transfers (recorded on prefill side)
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_kv_transfer_latency_seconds` (Histogram) — KV cache transfer latency per request in seconds
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_kv_transfer_bytes` (Histogram) — KV cache transfer size per request in bytes
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- Buckets: 100KB, 500KB, 1MB, 5MB, 10MB, 50MB, 100MB, 500MB, 1GB, 5GB
- `trtllm_kv_transfer_speed_gb_s` (Histogram) — KV cache transfer speed per request in GB/s
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
## Non-Prometheus Performance Metrics ## Non-Prometheus Performance Metrics
TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are not currently exposed to Prometheus. TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are not currently exposed to Prometheus.
......
...@@ -246,6 +246,25 @@ class routing_overhead: ...@@ -246,6 +246,25 @@ class routing_overhead:
TOTAL_MS = "overhead_total_ms" TOTAL_MS = "overhead_total_ms"
class trtllm_additional:
"""Additional TRT-LLM worker metrics beyond what the engine natively provides."""
# Total number of aborted/cancelled requests
NUM_ABORTED_REQUESTS_TOTAL = "trtllm_num_aborted_requests_total"
# Total number of requests containing image content
REQUEST_TYPE_IMAGE_TOTAL = "trtllm_request_type_image_total"
# Total number of requests using guided/structured decoding
REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL = "trtllm_request_type_structured_output_total"
# Total number of successful KV cache transfers
KV_TRANSFER_SUCCESS_TOTAL = "trtllm_kv_transfer_success_total"
# KV cache transfer latency per request in seconds
KV_TRANSFER_LATENCY_SECONDS = "trtllm_kv_transfer_latency_seconds"
# KV cache transfer size per request in bytes
KV_TRANSFER_BYTES = "trtllm_kv_transfer_bytes"
# KV cache transfer speed per request in GB/s
KV_TRANSFER_SPEED_GB_S = "trtllm_kv_transfer_speed_gb_s"
class task_tracker: class task_tracker:
"""Task tracker Prometheus metric name suffixes""" """Task tracker Prometheus metric name suffixes"""
......
...@@ -484,6 +484,36 @@ pub mod kvrouter { ...@@ -484,6 +484,36 @@ pub mod kvrouter {
pub const KV_CACHE_EVENTS_APPLIED: &str = "kv_cache_events_applied"; pub const KV_CACHE_EVENTS_APPLIED: &str = "kv_cache_events_applied";
} }
/// Additional TRT-LLM worker metrics beyond what the engine natively provides.
///
/// These metrics are Python-only (registered via `prometheus_client`) and share the
/// `trtllm_` prefix so they are captured by the same prefix filter as engine metrics.
///
/// ⚠️ Python codegen: Run gen-python-prometheus-names after changes
pub mod trtllm_additional {
/// Total number of aborted/cancelled requests
pub const NUM_ABORTED_REQUESTS_TOTAL: &str = "trtllm_num_aborted_requests_total";
/// Total number of requests containing image content
pub const REQUEST_TYPE_IMAGE_TOTAL: &str = "trtllm_request_type_image_total";
/// Total number of requests using guided/structured decoding
pub const REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL: &str =
"trtllm_request_type_structured_output_total";
/// Total number of successful KV cache transfers
pub const KV_TRANSFER_SUCCESS_TOTAL: &str = "trtllm_kv_transfer_success_total";
/// KV cache transfer latency per request in seconds
pub const KV_TRANSFER_LATENCY_SECONDS: &str = "trtllm_kv_transfer_latency_seconds";
/// KV cache transfer size per request in bytes
pub const KV_TRANSFER_BYTES: &str = "trtllm_kv_transfer_bytes";
/// KV cache transfer speed per request in GB/s
pub const KV_TRANSFER_SPEED_GB_S: &str = "trtllm_kv_transfer_speed_gb_s";
}
// KV cache statistics metrics // KV cache statistics metrics
pub mod kvstats { pub mod kvstats {
/// Total number of KV cache blocks available on the worker /// Total number of KV cache blocks available on the worker
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment