fix: dynamo-run pass proper args using register-llm (#1230)

cc40af70 · Alec · GitHub · e450c2c7 · cc40af70 · cc40af70
Unverified Commit cc40af70 authored May 28, 2025 by Alec Committed by GitHub May 28, 2025
Showing with 11 additions and 4 deletions

launch/dynamo-run/src/subprocess/vllm_inc.py +10 -3

lib/bindings/python/src/dynamo/_core.pyi +1 -1

No files found.
--- a/launch/dynamo-run/src/subprocess/vllm_inc.py
+++ b/launch/dynamo-run/src/subprocess/vllm_inc.py
@@ -157,7 +157,7 @@ async def init(runtime: DistributedRuntime, config: Config):
        # KV routing relies on logging KV metrics
        "disable_log_stats": False,
    }
-    if config.kv_block_size:
+    assert config.kv_block_size > 0, "Must use non-negative integer for KV Block Size"
    arg_map["block_size"] = config.kv_block_size
    if config.context_length:
@@ -201,7 +201,14 @@ async def init(runtime: DistributedRuntime, config: Config):
    engine_client = await engine_context.__aenter__()
    await register_llm(
-        ModelType.Backend, endpoint, config.model_path, config.model_name
+        ModelType.Backend,
+        endpoint,
+        config.model_path,
+        config.model_name,
+        context_length=arg_map.get(
+            "max_model_len", None
+        ),  # if None, takes length from tokenizer
+        kv_cache_block_size=arg_map["block_size"],
    )
    handler = RequestHandler(component, engine_client, default_sampling_params)
    handler.setup_kv_metrics()

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -603,7 +603,7 @@ class ModelType:
    """What type of request this model needs: Chat, Component or Backend (pre-processed)"""
    ...
-async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: str, model_name: Optional[str]) -> None:
+async def register_llm(model_type: ModelType, endpoint: Endpoint, model_path: str, model_name: Optional[str] = None, context_length: Optional[int] = None, kv_cache_block_size: Optional[int] = None) -> None:
    """Attach the model at path to the given endpoint, and advertise it as model_type"""
    ...