fix: read block size from vllm at runtime (#5713)

Signed-off-by: Neal Vaidya <nealv@nvidia.com>

fix: read block size from vllm at runtime (#5713)
Signed-off-by: Neal Vaidya <nealv@nvidia.com>
3d7182b8 · Neal Vaidya · GitHub · 2c3066bd · 3d7182b8
Unverified commit 3d7182b8, authored Feb 07, 2026 by Neal Vaidya; committed by GitHub on Feb 07, 2026.
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 1 deletion

components/src/dynamo/vllm/main.py (+3 -1)

No files found.
--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -556,7 +556,7 @@ async def register_vllm_model(
        generate_endpoint,
        config.model,
        config.served_model_name,
-        kv_cache_block_size=config.engine_args.block_size,
+        kv_cache_block_size=runtime_values["block_size"],
        runtime_config=runtime_config,
        custom_template_path=config.custom_jinja_template,
        media_decoder=media_decoder,
@@ -860,6 +860,7 @@ def get_engine_cache_info(engine: AsyncLLM):
        # Get values directly from vllm_config instead of collective_rpc
        cache_values = {
            "num_gpu_blocks": engine.vllm_config.cache_config.num_gpu_blocks,
+            "block_size": engine.vllm_config.cache_config.block_size,
        }
        scheduler_values = {
@@ -871,6 +872,7 @@ def get_engine_cache_info(engine: AsyncLLM):
        logging.info(f"Scheduler config values: {scheduler_values}")
        return {
            "num_gpu_blocks": cache_values["num_gpu_blocks"],
+            "block_size": cache_values["block_size"],
            "max_num_seqs": scheduler_values["max_num_seqs"],
            "max_num_batched_tokens": scheduler_values["max_num_batched_tokens"],
        }