Commit e319cd2b (unverified), authored by nihalmaddala, committed by GitHub
Browse files

fix(profiler): use final TRT-LLM paged KV cache log entry for max tokens (#7988)


Signed-off-by: nihalmaddala <nihalmaddala@gmail.com>
parent 2059f977
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests for TRT-LLM KV-cache token parsing from runtime logs."""
import pytest
from dynamo.profiler.utils.config_modifiers.trtllm import TrtllmConfigModifier
# Module-level pytest markers: pytest applies every mark in this list to each
# test function defined in this file.
pytestmark = [
pytest.mark.unit,
pytest.mark.gpu_0,
pytest.mark.pre_merge,
pytest.mark.planner,
pytest.mark.parallel,
]
def test_get_kv_cache_size_uses_last_matching_log_entry(tmp_path) -> None:
    """With several paged-KV allocation lines, the final entry's token count wins."""
    log_lines = (
        "random startup line",
        "[TensorRT-LLM][INFO] [MemUsageChange] Allocated 12.50 GiB for max tokens in paged KV cache (65536).",
        "intermediate line",
        "[TensorRT-LLM][INFO] [MemUsageChange] Allocated 43.87 GiB for max tokens in paged KV cache (229984).",
    )
    dynamo_log = tmp_path / "dynamo.log"
    dynamo_log.write_text("\n".join(log_lines))

    # Two allocation entries are present; the later one reports 229984 tokens.
    token_capacity = TrtllmConfigModifier.get_kv_cache_size_from_dynamo_log(
        str(dynamo_log)
    )
    assert token_capacity == 229984
def test_get_kv_cache_size_falls_back_when_missing(tmp_path) -> None:
    """A log with no paged-KV allocation line yields the fallback value 100000."""
    dynamo_log = tmp_path / "dynamo.log"
    dynamo_log.write_text("startup\nhealthcheck ok\n")

    # Neither line mentions the paged KV cache, so the parser must fall back.
    token_capacity = TrtllmConfigModifier.get_kv_cache_size_from_dynamo_log(
        str(dynamo_log)
    )
    assert token_capacity == 100000
......@@ -275,8 +275,24 @@ class TrtllmConfigModifier(BaseConfigModifier):
def get_kv_cache_size_from_dynamo_log(
cls, dynamo_log_fn: str, attention_dp_size: int = 1
) -> int:
"""Return TRT-LLM paged KV cache token capacity parsed from Dynamo logs.
TRT-LLM may emit multiple memory allocation lines for paged KV cache
during startup. This parser scans the full file and returns the token
value from the last matching entry, which reflects the effective
configured capacity.
Args:
dynamo_log_fn: Path to the Dynamo runtime log file.
attention_dp_size: Unused for TRT-LLM; included for interface parity.
Returns:
Parsed max token count for paged KV cache, or ``100000`` when no
matching log entry is found.
"""
# TRT-LLM log parsing for KV cache size
# Format: [TensorRT-LLM][INFO] [MemUsageChange] Allocated XX GiB for max tokens in paged KV cache (XXXXXX).
max_tokens: int | None = None
try:
with open(dynamo_log_fn, "r") as f:
for line in f:
......@@ -289,13 +305,13 @@ class TrtllmConfigModifier(BaseConfigModifier):
match = re.search(r"paged KV cache \((\d+)\)", line)
if match:
max_tokens = int(match.group(1))
logger.info(
f"Found TRT-LLM KV cache max tokens: {max_tokens}"
)
return max_tokens
except Exception as e:
logger.warning(f"Failed to parse KV cache size from log file. Error: {e}")
if max_tokens is not None:
logger.info(f"Found TRT-LLM KV cache max tokens: {max_tokens}")
return max_tokens
# Return a reasonable default if we couldn't find the KV cache size in logs
logger.warning(
"Could not find KV cache size in TRT-LLM logs, using default value of 100000"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment