Unverified commit eb76a8b5, authored by Indrajit Bhosale, committed by GitHub
Browse files

chore: Expose new kv_cache metrics from trtllm backend (#6469)

parent 7893f268
......@@ -304,6 +304,7 @@ class Publisher:
component_gauges: LLMBackendMetrics,
zmq_endpoint: Optional[str] = None,
enable_local_indexer: bool = False,
metrics_collector=None,
):
self.endpoint = endpoint
self.engine = engine
......@@ -313,6 +314,7 @@ class Publisher:
self.metrics_labels = metrics_labels
self.component_gauges = component_gauges
self.enable_local_indexer = enable_local_indexer
self.metrics_collector = metrics_collector
self.attention_dp_size = engine.get_attention_dp_size()
# The first few kv events from the model engine are always "created" type events.
......@@ -482,6 +484,16 @@ class Publisher:
)
self.component_gauges.set_gpu_cache_usage("0", gpu_cache_usage)
# Log iteration stats to TRT-LLM MetricsCollector (PR #11243)
# This populates trtllm_kv_cache_hit_rate and trtllm_kv_cache_utilization gauges
if self.metrics_collector and hasattr(
self.metrics_collector, "log_iteration_stats"
):
try:
self.metrics_collector.log_iteration_stats(stat)
except Exception as e:
logging.warning(f"Failed to log iteration stats: {e}")
await self._polling_loop(
lambda: self.engine.llm.get_stats_async(timeout=_STATS_TIMEOUT_SEC),
handle_stat,
......@@ -766,6 +778,7 @@ async def get_publisher(
component_gauges: LLMBackendMetrics,
zmq_endpoint: Optional[str] = None,
enable_local_indexer: bool = False,
metrics_collector=None,
):
publisher = Publisher(
endpoint,
......@@ -776,6 +789,7 @@ async def get_publisher(
component_gauges=component_gauges,
zmq_endpoint=zmq_endpoint,
enable_local_indexer=enable_local_indexer,
metrics_collector=metrics_collector,
)
try:
publisher.initialize()
......
......@@ -802,13 +802,24 @@ class HandlerBase(BaseGenerativeHandler):
)
# Log metrics to TensorRT-LLM MetricsCollector when request finishes
# NOTE: TRT-LLM 1.3.0rc5 (PR #11243) renamed log_metrics_dict → log_request_metrics_dict
if (
res.finished
and self.metrics_collector
and hasattr(res, "metrics_dict")
):
try:
self.metrics_collector.log_metrics_dict(res.metrics_dict)
if hasattr(
self.metrics_collector,
"log_request_metrics_dict",
):
self.metrics_collector.log_request_metrics_dict(
res.metrics_dict
)
else:
self.metrics_collector.log_metrics_dict(
res.metrics_dict
)
except Exception as e:
logging.warning(f"Failed to log TensorRT-LLM metrics: {e}")
......
......@@ -181,6 +181,10 @@ async def init_llm_worker(
"max_beam_width": config.max_beam_width,
"max_batch_size": config.max_batch_size,
"return_perf_metrics": config.publish_events_and_metrics,
# enable_iter_perf_stats is required for PyTorch backend to compute iteration-level
# stats (KV cache utilization, hit rate). TensorRT backend always has this enabled.
# See TRT-LLM PR #11243: MetricsCollector.log_iteration_stats() needs these stats.
"enable_iter_perf_stats": config.publish_events_and_metrics,
"kv_connector_config": kv_connector_config,
}
......@@ -493,6 +497,7 @@ async def init_llm_worker(
component_gauges=component_gauges,
zmq_endpoint=trtllm_zmq_bind_endpoint,
enable_local_indexer=config.enable_local_indexer,
metrics_collector=metrics_collector,
) as publisher:
handler_config.publisher = publisher
handler = RequestHandlerFactory().get_request_handler(handler_config)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment