"lib/llm/vscode:/vscode.git/clone" did not exist on "7af49a15b2786e64de5c2cd08e1ebb38d7505b47"
Unverified commit eb76a8b5, authored by Indrajit Bhosale, committed by GitHub
Browse files

chore: Expose new kv_cache metrics from trtllm backend (#6469)

parent 7893f268
...@@ -304,6 +304,7 @@ class Publisher: ...@@ -304,6 +304,7 @@ class Publisher:
component_gauges: LLMBackendMetrics, component_gauges: LLMBackendMetrics,
zmq_endpoint: Optional[str] = None, zmq_endpoint: Optional[str] = None,
enable_local_indexer: bool = False, enable_local_indexer: bool = False,
metrics_collector=None,
): ):
self.endpoint = endpoint self.endpoint = endpoint
self.engine = engine self.engine = engine
...@@ -313,6 +314,7 @@ class Publisher: ...@@ -313,6 +314,7 @@ class Publisher:
self.metrics_labels = metrics_labels self.metrics_labels = metrics_labels
self.component_gauges = component_gauges self.component_gauges = component_gauges
self.enable_local_indexer = enable_local_indexer self.enable_local_indexer = enable_local_indexer
self.metrics_collector = metrics_collector
self.attention_dp_size = engine.get_attention_dp_size() self.attention_dp_size = engine.get_attention_dp_size()
# The first few kv events from the model engine are always "created" type events. # The first few kv events from the model engine are always "created" type events.
...@@ -482,6 +484,16 @@ class Publisher: ...@@ -482,6 +484,16 @@ class Publisher:
) )
self.component_gauges.set_gpu_cache_usage("0", gpu_cache_usage) self.component_gauges.set_gpu_cache_usage("0", gpu_cache_usage)
# Log iteration stats to TRT-LLM MetricsCollector (PR #11243)
# This populates trtllm_kv_cache_hit_rate and trtllm_kv_cache_utilization gauges
if self.metrics_collector and hasattr(
self.metrics_collector, "log_iteration_stats"
):
try:
self.metrics_collector.log_iteration_stats(stat)
except Exception as e:
logging.warning(f"Failed to log iteration stats: {e}")
await self._polling_loop( await self._polling_loop(
lambda: self.engine.llm.get_stats_async(timeout=_STATS_TIMEOUT_SEC), lambda: self.engine.llm.get_stats_async(timeout=_STATS_TIMEOUT_SEC),
handle_stat, handle_stat,
...@@ -766,6 +778,7 @@ async def get_publisher( ...@@ -766,6 +778,7 @@ async def get_publisher(
component_gauges: LLMBackendMetrics, component_gauges: LLMBackendMetrics,
zmq_endpoint: Optional[str] = None, zmq_endpoint: Optional[str] = None,
enable_local_indexer: bool = False, enable_local_indexer: bool = False,
metrics_collector=None,
): ):
publisher = Publisher( publisher = Publisher(
endpoint, endpoint,
...@@ -776,6 +789,7 @@ async def get_publisher( ...@@ -776,6 +789,7 @@ async def get_publisher(
component_gauges=component_gauges, component_gauges=component_gauges,
zmq_endpoint=zmq_endpoint, zmq_endpoint=zmq_endpoint,
enable_local_indexer=enable_local_indexer, enable_local_indexer=enable_local_indexer,
metrics_collector=metrics_collector,
) )
try: try:
publisher.initialize() publisher.initialize()
......
...@@ -802,13 +802,24 @@ class HandlerBase(BaseGenerativeHandler): ...@@ -802,13 +802,24 @@ class HandlerBase(BaseGenerativeHandler):
) )
# Log metrics to TensorRT-LLM MetricsCollector when request finishes # Log metrics to TensorRT-LLM MetricsCollector when request finishes
# NOTE: TRT-LLM 1.3.0rc5 (PR #11243) renamed log_metrics_dict → log_request_metrics_dict
if ( if (
res.finished res.finished
and self.metrics_collector and self.metrics_collector
and hasattr(res, "metrics_dict") and hasattr(res, "metrics_dict")
): ):
try: try:
self.metrics_collector.log_metrics_dict(res.metrics_dict) if hasattr(
self.metrics_collector,
"log_request_metrics_dict",
):
self.metrics_collector.log_request_metrics_dict(
res.metrics_dict
)
else:
self.metrics_collector.log_metrics_dict(
res.metrics_dict
)
except Exception as e: except Exception as e:
logging.warning(f"Failed to log TensorRT-LLM metrics: {e}") logging.warning(f"Failed to log TensorRT-LLM metrics: {e}")
......
...@@ -181,6 +181,10 @@ async def init_llm_worker( ...@@ -181,6 +181,10 @@ async def init_llm_worker(
"max_beam_width": config.max_beam_width, "max_beam_width": config.max_beam_width,
"max_batch_size": config.max_batch_size, "max_batch_size": config.max_batch_size,
"return_perf_metrics": config.publish_events_and_metrics, "return_perf_metrics": config.publish_events_and_metrics,
# enable_iter_perf_stats is required for PyTorch backend to compute iteration-level
# stats (KV cache utilization, hit rate). TensorRT backend always has this enabled.
# See TRT-LLM PR #11243: MetricsCollector.log_iteration_stats() needs these stats.
"enable_iter_perf_stats": config.publish_events_and_metrics,
"kv_connector_config": kv_connector_config, "kv_connector_config": kv_connector_config,
} }
...@@ -493,6 +497,7 @@ async def init_llm_worker( ...@@ -493,6 +497,7 @@ async def init_llm_worker(
component_gauges=component_gauges, component_gauges=component_gauges,
zmq_endpoint=trtllm_zmq_bind_endpoint, zmq_endpoint=trtllm_zmq_bind_endpoint,
enable_local_indexer=config.enable_local_indexer, enable_local_indexer=config.enable_local_indexer,
metrics_collector=metrics_collector,
) as publisher: ) as publisher:
handler_config.publisher = publisher handler_config.publisher = publisher
handler = RequestHandlerFactory().get_request_handler(handler_config) handler = RequestHandlerFactory().get_request_handler(handler_config)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment