Unverified Commit 10081929 authored by Yifan Jiang's avatar Yifan Jiang Committed by GitHub
Browse files

feat(trtllm): add additional metrics for dynamo-trtllm (#6668)


Signed-off-by: default avatarYifan Jiang <19356972+yifjiang@users.noreply.github.com>
Co-authored-by: default avatarClaude Opus 4.6 <noreply@anthropic.com>
parent 2831bfec
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Additional Prometheus metrics for dynamo-trtllm beyond what the engine provides.
The TRT-LLM engine MetricsCollector already provides 5 core metrics:
request_success_total, e2e_request_latency_seconds,
time_to_first_token_seconds, inter_token_latency_seconds,
request_queue_time_seconds
The Rust frontend (metrics.rs) provides token counters:
input_tokens_total, output_tokens_total, cached_tokens
This module adds metrics that have no engine/runtime/frontend equivalent:
- Request types (image, structured output)
- KV transfer metrics (success counter, latency, throughput, per-request bytes)
- Abort tracking
"""
import logging
from datetime import timedelta
from prometheus_client import Counter, Histogram
from dynamo.prometheus_names import trtllm_additional as metric_names
logger = logging.getLogger(__name__)
# Histogram buckets for KV cache transfer metrics
KV_TRANSFER_LATENCY_BUCKETS = (
0.001,
0.005,
0.01,
0.025,
0.05,
0.1,
0.25,
0.5,
1.0,
2.5,
5.0,
10.0,
float("inf"),
)
KV_TRANSFER_SPEED_BUCKETS = (
0.1,
0.5,
1.0,
5.0,
10.0,
25.0,
50.0,
100.0,
250.0,
500.0,
float("inf"),
)
KV_TRANSFER_BYTES_BUCKETS = (
100_000,
500_000,
1_000_000,
5_000_000,
10_000_000,
50_000_000,
100_000_000,
500_000_000,
1_000_000_000,
5_000_000_000,
float("inf"),
)
class AdditionalMetricsCollector:
"""
Additional Prometheus metrics for dynamo-trtllm.
Only creates metrics that have no engine/runtime/frontend equivalent.
Metrics are registered in the default prometheus_client.REGISTRY.
Args:
labels: Dict with keys like model_name, disaggregation_mode, engine_type.
"""
def __init__(self, labels: dict):
self._labelnames = list(labels.keys())
self._labelvalues = list(labels.values())
# --- Abort tracking ---
self.num_aborted_requests = Counter(
metric_names.NUM_ABORTED_REQUESTS_TOTAL,
"Total number of aborted/cancelled requests",
labelnames=self._labelnames,
)
# --- Request type counters ---
self.request_type_image = Counter(
metric_names.REQUEST_TYPE_IMAGE_TOTAL,
"Total number of requests containing image content",
labelnames=self._labelnames,
)
self.request_type_structured_output = Counter(
metric_names.REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL,
"Total number of requests using guided/structured decoding",
labelnames=self._labelnames,
)
# --- KV cache transfer metrics ---
self.kv_transfer_success = Counter(
metric_names.KV_TRANSFER_SUCCESS_TOTAL,
"Total number of successful KV cache transfers",
labelnames=self._labelnames,
)
self.kv_transfer_latency = Histogram(
metric_names.KV_TRANSFER_LATENCY_SECONDS,
"KV cache transfer latency per request in seconds",
labelnames=self._labelnames,
buckets=KV_TRANSFER_LATENCY_BUCKETS,
)
self.kv_transfer_bytes = Histogram(
metric_names.KV_TRANSFER_BYTES,
"KV cache transfer size per request in bytes",
labelnames=self._labelnames,
buckets=KV_TRANSFER_BYTES_BUCKETS,
)
self.kv_transfer_speed = Histogram(
metric_names.KV_TRANSFER_SPEED_GB_S,
"KV cache transfer speed per request in GB/s",
labelnames=self._labelnames,
buckets=KV_TRANSFER_SPEED_BUCKETS,
)
logger.info("AdditionalMetricsCollector initialized")
# --- Request helpers ---
def record_request_abort(self):
"""Increment aborted requests counter."""
self.num_aborted_requests.labels(*self._labelvalues).inc()
# --- Request type tracking ---
def record_request_type_image(self):
"""Increment the image request type counter."""
self.request_type_image.labels(*self._labelvalues).inc()
def record_request_type_structured_output(self):
"""Increment the structured output request type counter."""
self.request_type_structured_output.labels(*self._labelvalues).inc()
# --- KV transfer ---
def record_kv_transfer_success(self):
"""Increment the KV transfer success counter."""
self.kv_transfer_success.labels(*self._labelvalues).inc()
def record_kv_transfer_perf(self, timing_metrics) -> bool:
"""Record KV transfer performance from RequestPerfMetrics.timing_metrics.
Extracts kv_cache_transfer_start, kv_cache_transfer_end, and kv_cache_size
from TRT-LLM's TimingMetrics and records latency, bytes, and speed.
Only records when a transfer actually occurred (non-zero transfer times).
Args:
timing_metrics: TimingMetrics object from RequestPerfMetrics.
Returns:
True if a transfer was recorded, False if skipped.
"""
transfer_start = timing_metrics.kv_cache_transfer_start
transfer_end = timing_metrics.kv_cache_transfer_end
# Only record when a transfer actually happened
if transfer_end <= timedelta(0) or transfer_start <= timedelta(0):
return False
latency_s = (transfer_end - transfer_start).total_seconds()
if latency_s <= 0:
return False
kv_bytes = timing_metrics.kv_cache_size
self.kv_transfer_latency.labels(*self._labelvalues).observe(latency_s)
self.kv_transfer_bytes.labels(*self._labelvalues).observe(kv_bytes)
if kv_bytes > 0:
speed_gb_s = kv_bytes / (latency_s * 1e9)
self.kv_transfer_speed.labels(*self._labelvalues).observe(speed_gb_s)
return True
......@@ -21,7 +21,7 @@ import re
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from dataclasses import asdict, dataclass
from typing import Any, Optional, Union
from typing import TYPE_CHECKING, Any, Optional, Union
import torch
from tensorrt_llm.executor.result import GenerationResult
......@@ -40,6 +40,7 @@ from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.constants import DisaggregationMode
from dynamo.trtllm.engine import TensorRTLLMEngine
from dynamo.trtllm.logits_processing.adapter import create_trtllm_adapters
from dynamo.trtllm.metrics import AdditionalMetricsCollector
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
from dynamo.trtllm.publisher import Publisher
from dynamo.trtllm.request_handlers.base_generative_handler import BaseGenerativeHandler
......@@ -48,6 +49,11 @@ from dynamo.trtllm.utils.disagg_utils import (
DisaggregatedParamsCodec,
)
if TYPE_CHECKING:
# tensorrt_llm may use a different version that doesn't have MetricsCollector,
# so guard this import inside TYPE_CHECKING to avoid runtime import errors.
from tensorrt_llm.metrics import MetricsCollector
configure_dynamo_logging()
......@@ -69,11 +75,12 @@ class RequestHandlerConfig:
runtime: Optional[
DistributedRuntime
] = None # DistributedRuntime reference for graceful shutdown
metrics_collector: Optional[Any] = None # TensorRT-LLM MetricsCollector
metrics_collector: Optional["MetricsCollector"] = None
kv_block_size: int = 32
shutdown_event: Optional[asyncio.Event] = None
encoder_cache_capacity_gb: float = 0 # Encoder cache capacity in GB
disable_request_abort: bool = True
additional_metrics: Optional["AdditionalMetricsCollector"] = None
class HandlerBase(BaseGenerativeHandler):
......@@ -103,6 +110,7 @@ class HandlerBase(BaseGenerativeHandler):
self.kv_block_size: int = config.kv_block_size
self.shutdown_event = config.shutdown_event
self.disable_request_abort = config.disable_request_abort
self.additional_metrics = config.additional_metrics
def check_error(self, result: dict) -> bool:
"""
......@@ -621,6 +629,36 @@ class HandlerBase(BaseGenerativeHandler):
"""
logging.debug(f"Request: {request}")
# Additional metrics: request type detection
metrics_collector = self.additional_metrics
if metrics_collector:
try:
# Detect request types for metrics
sampling_options = request.get("sampling_options", {})
guided = sampling_options.get("guided_decoding")
if guided and isinstance(guided, dict):
has_structured_guidance = any(
guided.get(k) is not None
for k in (
"json",
"regex",
"grammar",
"json_object",
"structural_tag",
)
) or bool(guided.get("choice"))
if has_structured_guidance:
metrics_collector.record_request_type_structured_output()
if (
request.get("multi_modal_data")
or embeddings is not None
or request.get("_epd_processed_prompt") is not None
):
metrics_collector.record_request_type_image()
except Exception as e:
logging.warning("Additional metrics (request type): %s", e)
# Normalize OpenAI format to TRT-LLM internal format
self._normalize_request_format(request)
......@@ -822,6 +860,29 @@ class HandlerBase(BaseGenerativeHandler):
"Request finished with no finish reason set - this indicates a possible bug"
)
# Record additional metrics on request finish
if res.finished and metrics_collector and out.get("finish_reason"):
try:
# KV transfer metrics from request_perf_metrics
if output.request_perf_metrics is not None:
# Record KV transfer latency/bytes/speed from timing_metrics
tm = output.request_perf_metrics.timing_metrics
if tm is not None:
recorded = (
metrics_collector.record_kv_transfer_perf(tm)
)
# Only count success if a transfer actually occurred
if (
recorded
and self.disaggregation_mode
== DisaggregationMode.PREFILL
):
metrics_collector.record_kv_transfer_success()
except Exception as e:
logging.warning(
"Additional metrics (request finish): %s", e
)
# Log metrics to TensorRT-LLM MetricsCollector when request finishes
# NOTE: TRT-LLM 1.3.0rc5 (PR #11243) renamed log_metrics_dict → log_request_metrics_dict
if (
......@@ -852,6 +913,11 @@ class HandlerBase(BaseGenerativeHandler):
except asyncio.CancelledError:
logging.debug(f"Request {request_id}: Client cancelled")
# _cancellation_monitor already called abort_request
try:
if metrics_collector:
metrics_collector.record_request_abort()
except Exception as e:
logging.debug("Additional metrics (request abort): %s", e)
return # Just stop, no error response
# 2. Per-request errors - send to client, don't shutdown
......
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Tests for AdditionalMetricsCollector and additional metrics integration."""
import ast
import inspect
import textwrap
import unittest
from datetime import timedelta
from unittest.mock import MagicMock, patch
from prometheus_client import CollectorRegistry, generate_latest
from dynamo.trtllm.metrics import AdditionalMetricsCollector
try:
from dynamo.trtllm.request_handlers.handler_base import HandlerBase
except ImportError:
# handler_base imports torch which requires CUDA libraries at import time;
# gracefully skip on CPU-only CI runners.
HandlerBase = None
class TestAdditionalMetricsCollector(unittest.TestCase):
"""Unit tests for AdditionalMetricsCollector."""
def setUp(self):
"""Create a fresh registry and collector for each test."""
self.registry = CollectorRegistry()
# Patch prometheus_client.Counter and Histogram to use our test registry
with patch("dynamo.trtllm.metrics.Counter") as MockCounter, patch(
"dynamo.trtllm.metrics.Histogram"
) as MockHistogram:
from prometheus_client import Counter, Histogram
def make_counter(name, documentation, labelnames=None, **_kw):
return Counter(
name,
documentation,
labelnames=labelnames or [],
registry=self.registry,
)
def make_histogram(
name, documentation, labelnames=None, buckets=None, **_kw
):
kwargs = {"registry": self.registry}
if buckets is not None:
kwargs["buckets"] = buckets
return Histogram(
name, documentation, labelnames=labelnames or [], **kwargs
)
MockCounter.side_effect = make_counter
MockHistogram.side_effect = make_histogram
self.collector = AdditionalMetricsCollector(
labels={
"model_name": "test-model",
"disaggregation_mode": "prefill_and_decode",
"engine_type": "trtllm",
},
)
def _get_metric_value(self, name, _labels=None):
"""Get a metric value from the registry."""
output = generate_latest(self.registry).decode()
for line in output.splitlines():
if line.startswith("#"):
continue
if line.startswith(name):
# Extract value (last token)
parts = line.split()
if len(parts) >= 2:
return float(parts[-1])
return None
def test_abort_counter(self):
"""Test abort tracking."""
self.collector.record_request_abort()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_num_aborted_requests_total", output)
def test_request_type_counters(self):
"""Test request type counters."""
self.collector.record_request_type_image()
self.collector.record_request_type_structured_output()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_request_type_image_total", output)
self.assertIn("trtllm_request_type_structured_output_total", output)
def test_kv_transfer_success_counter(self):
"""Test KV transfer success counter."""
self.collector.record_kv_transfer_success()
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_kv_transfer_success_total", output)
def test_kv_transfer_perf_metrics(self):
"""Test KV transfer latency/bytes/speed from timing_metrics."""
tm = MagicMock()
tm.kv_cache_transfer_start = timedelta(seconds=1.0)
tm.kv_cache_transfer_end = timedelta(seconds=1.05)
tm.kv_cache_size = 1_000_000_000 # 1 GB
result = self.collector.record_kv_transfer_perf(tm)
self.assertTrue(result)
output = generate_latest(self.registry).decode()
self.assertIn("trtllm_kv_transfer_latency_seconds", output)
self.assertIn("trtllm_kv_transfer_bytes_bucket", output)
self.assertIn("trtllm_kv_transfer_speed_gb_s", output)
def test_kv_transfer_perf_skipped_when_no_transfer(self):
"""Test that KV transfer perf is not recorded when no transfer occurred."""
tm = MagicMock()
tm.kv_cache_transfer_start = timedelta(0)
tm.kv_cache_transfer_end = timedelta(0)
tm.kv_cache_size = 0
result = self.collector.record_kv_transfer_perf(tm)
self.assertFalse(result)
# Histogram is defined but should have no observations
sample = self.registry.get_sample_value(
"trtllm_kv_transfer_latency_seconds_count"
)
self.assertEqual(sample, 0.0)
def test_kv_transfer_perf_return_values(self):
"""Verify record_kv_transfer_perf returns True on record, False on skip."""
# Transfer occurred
tm_ok = MagicMock()
tm_ok.kv_cache_transfer_start = timedelta(seconds=1.0)
tm_ok.kv_cache_transfer_end = timedelta(seconds=1.05)
tm_ok.kv_cache_size = 5_000_000
self.assertTrue(self.collector.record_kv_transfer_perf(tm_ok))
# No transfer (zero times)
tm_zero = MagicMock()
tm_zero.kv_cache_transfer_start = timedelta(0)
tm_zero.kv_cache_transfer_end = timedelta(0)
tm_zero.kv_cache_size = 0
self.assertFalse(self.collector.record_kv_transfer_perf(tm_zero))
# Negative latency (end before start)
tm_neg = MagicMock()
tm_neg.kv_cache_transfer_start = timedelta(seconds=2.0)
tm_neg.kv_cache_transfer_end = timedelta(seconds=1.0)
tm_neg.kv_cache_size = 1000
self.assertFalse(self.collector.record_kv_transfer_perf(tm_neg))
def test_no_duplicate_metrics(self):
"""Test that removed duplicate metrics are not present."""
output = generate_latest(self.registry).decode()
# These metrics were removed as they duplicate frontend/runtime metrics
self.assertNotIn("prompt_tokens_total", output)
self.assertNotIn("generation_tokens_total", output)
self.assertNotIn("gen_throughput", output)
self.assertNotIn("kv_cache_hit_tokens_total", output)
self.assertNotIn("handler_time_to_first_token_seconds", output)
self.assertNotIn("handler_inter_token_latency_seconds", output)
self.assertNotIn("handler_e2e_request_latency_seconds", output)
# Phase timing metrics also removed (derivable from existing trtllm_* metrics)
self.assertNotIn("request_prefill_time_seconds", output)
self.assertNotIn("request_decode_time_seconds", output)
self.assertNotIn("request_inference_time_seconds", output)
# Config info metrics removed (overlap dynamo_frontend_model_* and
# dynamo_component_model_load_time_seconds)
self.assertNotIn("model_config_info", output)
self.assertNotIn("parallel_config_info", output)
self.assertNotIn("detailed_config_info", output)
self.assertNotIn("cache_config_info", output)
self.assertNotIn("engine_startup_time", output)
# KV transfer perf metrics are now wired from request_perf_metrics.timing_metrics
# (kv_transfer_latency_seconds, kv_transfer_bytes, kv_transfer_speed_gb_s)
@unittest.skipIf(HandlerBase is None, "HandlerBase requires CUDA/GPU libraries")
class TestHandlerBaseMetricsInstrumentation(unittest.TestCase):
"""Test metrics instrumentation in handler_base.py generate_locally()."""
def test_structured_output_detection_keys(self):
"""Verify guided decoding detection keys in generate_locally match _override_sampling_params."""
# Extract detection keys from generate_locally: the tuple in
# any(guided.get(k) for k in ("json", ...))
gen_source = textwrap.dedent(inspect.getsource(HandlerBase.generate_locally))
gen_tree = ast.parse(gen_source)
detection_keys = set()
for node in ast.walk(gen_tree):
# Find: any(guided.get(k) for k in (...))
if isinstance(node, ast.Tuple) and all(
isinstance(e, (ast.Constant, ast.Str)) for e in node.elts
):
vals = {
e.value if isinstance(e, ast.Constant) else e.s for e in node.elts
}
if "json_object" in vals: # identify the right tuple
detection_keys = vals
self.assertTrue(
detection_keys, "Could not extract detection keys from generate_locally"
)
# Extract canonical keys from _override_sampling_params:
# GuidedDecodingParams(json=..., regex=..., ...)
override_source = textwrap.dedent(
inspect.getsource(HandlerBase._override_sampling_params)
)
override_tree = ast.parse(override_source)
canonical_keys = set()
for node in ast.walk(override_tree):
if (
isinstance(node, ast.Call)
and getattr(node.func, "id", None) == "GuidedDecodingParams"
):
canonical_keys = {kw.arg for kw in node.keywords}
self.assertTrue(
canonical_keys, "Could not extract keys from GuidedDecodingParams call"
)
# Detection should cover all canonical keys
missing = canonical_keys - detection_keys
self.assertFalse(
missing,
f"Keys in GuidedDecodingParams but not in metrics detection: {missing}",
)
if __name__ == "__main__":
unittest.main()
......@@ -397,6 +397,7 @@ async def init_llm_worker(
# Initialize TensorRT-LLM MetricsCollector and register with global REGISTRY
# This enables exposing TRT-LLM's native Prometheus metrics (request latency, TTFT, TPOT, etc.)
metrics_collector = None
additional_metrics = None
if config.publish_events_and_metrics:
try:
model_name_for_metrics = config.served_model_name or config.model
......@@ -405,18 +406,47 @@ async def init_llm_worker(
)
logging.info("TensorRT-LLM MetricsCollector initialized")
# Register TRT-LLM metrics (TRT-LLM natively outputs trtllm_* metrics after traffic)
# Auto-label injection: hierarchy labels are added automatically
# Prefix filter: all TRT-LLM metrics (engine + additional) use "trtllm_" prefix
_metric_prefixes = ["trtllm_"]
# Additional metrics (abort tracking, request types, KV transfer perf).
# Wrapped in try/except because AdditionalMetricsCollector depends on
# prometheus_names which may not be available in all packaging variants.
try:
from dynamo.trtllm.metrics import AdditionalMetricsCollector
disagg_mode_str = (
config.disaggregation_mode.value
if hasattr(config.disaggregation_mode, "value")
else str(config.disaggregation_mode)
)
additional_metrics = AdditionalMetricsCollector(
labels={
"model_name": model_name_for_metrics,
"disaggregation_mode": disagg_mode_str,
"engine_type": "trtllm",
},
)
logging.info(
"Additional metrics initialized (disagg_mode=%s)",
disagg_mode_str,
)
except Exception as e:
logging.warning("Failed to initialize additional metrics: %s", e)
# Single callback for all Python-side metrics (trtllm_ + additional)
register_engine_metrics_callback(
endpoint=endpoint,
registry=REGISTRY,
metric_prefix_filters=["trtllm_"],
metric_prefix_filters=_metric_prefixes,
namespace_name=config.namespace,
component_name=config.component,
endpoint_name="generate",
model_name=model_name_for_metrics,
)
logging.info("TensorRT-LLM Prometheus metrics registered")
logging.info(
"Prometheus metrics registered (prefixes: %s)", _metric_prefixes
)
except Exception as e:
logging.warning(
f"Failed to initialize TensorRT-LLM Prometheus metrics: {e}"
......@@ -444,6 +474,7 @@ async def init_llm_worker(
shutdown_event=shutdown_event,
encoder_cache_capacity_gb=config.multimodal_embedding_cache_capacity_gb,
disable_request_abort=config.disable_request_abort,
additional_metrics=additional_metrics,
)
# Register the model with runtime config
......
......@@ -134,6 +134,38 @@ These metric names and availability are subject to change with TensorRT-LLM vers
TensorRT-LLM provides Prometheus metrics through the `MetricsCollector` class (see [tensorrt_llm/metrics/collector.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/metrics/collector.py)).
### Additional Operational Metrics
Dynamo adds the following operational metrics for TensorRT-LLM workers. These complement the engine's native metrics above with request-level observability that the engine does not provide. All metrics use the `trtllm_` prefix and are automatically enabled when `--publish-events-and-metrics` is set.
Metric name constants are defined in `lib/runtime/src/metrics/prometheus_names.rs` (`trtllm_additional` module).
#### Request Type Tracking
- `trtllm_request_type_image_total` (Counter) — Total number of requests containing image/multimodal content
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_request_type_structured_output_total` (Counter) — Total number of requests using guided/structured decoding (JSON, regex, grammar, etc.)
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
#### Abort Tracking
- `trtllm_num_aborted_requests_total` (Counter) — Total number of aborted/cancelled requests
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
#### KV Cache Transfer Metrics (Disaggregated Deployments)
These metrics are only recorded in disaggregated (prefill + decode) deployments when a KV cache transfer actually occurs. They are sourced from TensorRT-LLM's `RequestPerfMetrics.timing_metrics`.
- `trtllm_kv_transfer_success_total` (Counter) — Total number of successful KV cache transfers (recorded on prefill side)
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_kv_transfer_latency_seconds` (Histogram) — KV cache transfer latency per request in seconds
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- `trtllm_kv_transfer_bytes` (Histogram) — KV cache transfer size per request in bytes
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
- Buckets: 100KB, 500KB, 1MB, 5MB, 10MB, 50MB, 100MB, 500MB, 1GB, 5GB
- `trtllm_kv_transfer_speed_gb_s` (Histogram) — KV cache transfer speed per request in GB/s
- Labels: `model_name`, `disaggregation_mode`, `engine_type`
## Non-Prometheus Performance Metrics
TensorRT-LLM provides extensive performance data beyond the basic Prometheus metrics. These are not currently exposed to Prometheus.
......
......@@ -246,6 +246,25 @@ class routing_overhead:
TOTAL_MS = "overhead_total_ms"
class trtllm_additional:
"""Additional TRT-LLM worker metrics beyond what the engine natively provides."""
# Total number of aborted/cancelled requests
NUM_ABORTED_REQUESTS_TOTAL = "trtllm_num_aborted_requests_total"
# Total number of requests containing image content
REQUEST_TYPE_IMAGE_TOTAL = "trtllm_request_type_image_total"
# Total number of requests using guided/structured decoding
REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL = "trtllm_request_type_structured_output_total"
# Total number of successful KV cache transfers
KV_TRANSFER_SUCCESS_TOTAL = "trtllm_kv_transfer_success_total"
# KV cache transfer latency per request in seconds
KV_TRANSFER_LATENCY_SECONDS = "trtllm_kv_transfer_latency_seconds"
# KV cache transfer size per request in bytes
KV_TRANSFER_BYTES = "trtllm_kv_transfer_bytes"
# KV cache transfer speed per request in GB/s
KV_TRANSFER_SPEED_GB_S = "trtllm_kv_transfer_speed_gb_s"
class task_tracker:
"""Task tracker Prometheus metric name suffixes"""
......
......@@ -484,6 +484,36 @@ pub mod kvrouter {
pub const KV_CACHE_EVENTS_APPLIED: &str = "kv_cache_events_applied";
}
/// Additional TRT-LLM worker metrics beyond what the engine natively provides.
///
/// These metrics are Python-only (registered via `prometheus_client`) and share the
/// `trtllm_` prefix so they are captured by the same prefix filter as engine metrics.
///
/// ⚠️ Python codegen: Run gen-python-prometheus-names after changes
pub mod trtllm_additional {
/// Total number of aborted/cancelled requests
pub const NUM_ABORTED_REQUESTS_TOTAL: &str = "trtllm_num_aborted_requests_total";
/// Total number of requests containing image content
pub const REQUEST_TYPE_IMAGE_TOTAL: &str = "trtllm_request_type_image_total";
/// Total number of requests using guided/structured decoding
pub const REQUEST_TYPE_STRUCTURED_OUTPUT_TOTAL: &str =
"trtllm_request_type_structured_output_total";
/// Total number of successful KV cache transfers
pub const KV_TRANSFER_SUCCESS_TOTAL: &str = "trtllm_kv_transfer_success_total";
/// KV cache transfer latency per request in seconds
pub const KV_TRANSFER_LATENCY_SECONDS: &str = "trtllm_kv_transfer_latency_seconds";
/// KV cache transfer size per request in bytes
pub const KV_TRANSFER_BYTES: &str = "trtllm_kv_transfer_bytes";
/// KV cache transfer speed per request in GB/s
pub const KV_TRANSFER_SPEED_GB_S: &str = "trtllm_kv_transfer_speed_gb_s";
}
// KV cache statistics metrics
pub mod kvstats {
/// Total number of KV cache blocks available on the worker
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment