fix(profiler): use final TRT-LLM paged KV cache log entry for max tokens (#7988)

Signed-off-by: nihalmaddala <nihalmaddala@gmail.com>

fix(profiler): use final TRT-LLM paged KV cache log entry for max tokens (#7988)
Signed-off-by: nihalmaddala <nihalmaddala@gmail.com>
e319cd2b · nihalmaddala · GitHub · 2059f977 · e319cd2b · e319cd2b
Unverified commit e319cd2b, authored Apr 09, 2026 by nihalmaddala; committed by GitHub on Apr 09, 2026.
2 changed files
--- a/components/src/dynamo/profiler/tests/unit/test_trtllm_kv_cache_log_parsing.py
+++ b/components/src/dynamo/profiler/tests/unit/test_trtllm_kv_cache_log_parsing.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for TRT-LLM KV-cache token parsing from runtime logs."""
+
+import pytest
+
+from dynamo.profiler.utils.config_modifiers.trtllm import TrtllmConfigModifier
+
+pytestmark = [
+    pytest.mark.unit,
+    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
+    pytest.mark.planner,
+    pytest.mark.parallel,
+]
+
+
+def test_get_kv_cache_size_uses_last_matching_log_entry(tmp_path) -> None:
+    """Parser returns the last paged-KV token value when multiple entries exist.
+
+    Writes a temporary log holding two "paged KV cache (N)" allocation lines
+    separated by unrelated lines, then checks that the token count from the
+    later line is the one returned.
+    """
+    log_path = tmp_path / "dynamo.log"
+    log_path.write_text(
+        "\n".join(
+            [
+                "random startup line",
+                # Earlier allocation entry: its value (65536) must NOT be returned.
+                "[TensorRT-LLM][INFO] [MemUsageChange] Allocated 12.50 GiB for max tokens in paged KV cache (65536).",
+                "intermediate line",
+                # Final allocation entry: its value (229984) is the expected result.
+                "[TensorRT-LLM][INFO] [MemUsageChange] Allocated 43.87 GiB for max tokens in paged KV cache (229984).",
+            ]
+        )
+    )
+
+    # Called with the path as a plain string; attention_dp_size keeps its default.
+    parsed = TrtllmConfigModifier.get_kv_cache_size_from_dynamo_log(str(log_path))
+
+    assert parsed == 229984
+
+
+def test_get_kv_cache_size_falls_back_when_missing(tmp_path) -> None:
+    """Parser returns default fallback when no paged-KV line is present.
+
+    The temporary log exists and is readable but contains no
+    "paged KV cache (N)" entry, so the parser is expected to return its
+    default value instead of a parsed one.
+    """
+    log_path = tmp_path / "dynamo.log"
+    log_path.write_text("startup\nhealthcheck ok\n")
+
+    parsed = TrtllmConfigModifier.get_kv_cache_size_from_dynamo_log(str(log_path))
+
+    # 100000 is the fallback named in the parser's docstring and warning message.
+    assert parsed == 100000
--- a/components/src/dynamo/profiler/utils/config_modifiers/trtllm.py
+++ b/components/src/dynamo/profiler/utils/config_modifiers/trtllm.py
@@ -275,8 +275,24 @@ class TrtllmConfigModifier(BaseConfigModifier):
    def get_kv_cache_size_from_dynamo_log(
        cls, dynamo_log_fn: str, attention_dp_size: int = 1
    ) -> int:
+        """Return TRT-LLM paged KV cache token capacity parsed from Dynamo logs.
+
+        TRT-LLM may emit multiple memory allocation lines for paged KV cache
+        during startup. This parser scans the full file and returns the token
+        value from the last matching entry, which reflects the effective
+        configured capacity.
+
+        Args:
+            dynamo_log_fn: Path to the Dynamo runtime log file.
+            attention_dp_size: Unused for TRT-LLM; included for interface parity.
+
+        Returns:
+            Parsed max token count for paged KV cache, or ``100000`` when no
+            matching log entry is found.
+        """
        # TRT-LLM log parsing for KV cache size
        # Format: [TensorRT-LLM][INFO] [MemUsageChange] Allocated XX GiB for max tokens in paged KV cache (XXXXXX).
+        max_tokens: int | None = None
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
@@ -289,13 +305,13 @@ class TrtllmConfigModifier(BaseConfigModifier):
                        match = re.search(r"paged KV cache \((\d+)\)", line)
                        if match:
                            max_tokens = int(match.group(1))
-                            logger.info(
-                                f"Found TRT-LLM KV cache max tokens: {max_tokens}"
-                            )
-                            return max_tokens
        except Exception as e:
            logger.warning(f"Failed to parse KV cache size from log file. Error: {e}")

+        if max_tokens is not None:
+            logger.info(f"Found TRT-LLM KV cache max tokens: {max_tokens}")
+            return max_tokens
+
        # Return a reasonable default if we couldn't find the KV cache size in logs
        logger.warning(
            "Could not find KV cache size in TRT-LLM logs, using default value of 100000"