fix(vllmsglang): Give sglang/vllm the HF model name not the full path (#5274)

Signed-off-by: Graham King <grahamk@nvidia.com>

fix(vllmsglang): Give sglang/vllm the HF model name not the full path (#5274)
Signed-off-by: Graham King <grahamk@nvidia.com>
fcc4a60f · Graham King · GitHub · 55c26654 · fcc4a60f · fcc4a60f
Unverified Commit fcc4a60f authored Jan 08, 2026 by Graham King Committed by GitHub Jan 08, 2026
Show whitespace changes
Inline Side-by-side

Showing with 25 additions and 7 deletions

components/src/dynamo/sglang/args.py components/src/dynamo/sglang/args.py +12 -3

components/src/dynamo/vllm/main.py components/src/dynamo/vllm/main.py +13 -4

No files found.
--- a/components/src/dynamo/sglang/args.py
+++ b/components/src/dynamo/sglang/args.py
@@ -490,14 +490,23 @@ async def parse_args(args: list[str]) -> Config:
    )
    logging.debug(f"Dynamo args: {dynamo_args}")

-    # TODO: sglang downloads the model in `from_cli_args`, so we need to do it here.
-    # That's unfortunate because `parse_args` isn't the right place for this. Fix.
    model_path = parsed_args.model_path
+    # Name the model: default the served model name to the model path / HF name.
    if not parsed_args.served_model_name:
        parsed_args.served_model_name = model_path
+    # Download the model if necessary using modelexpress.
+    # We don't set `parsed_args.model_path` to the local path fetch_llm returns
+    # because sglang will send `model_path` to its pipeline-parallel workers, which
+    # may not have that local path.
+    # sglang will attempt to download the model again, but will find it in the HF cache.
+    # For non-HF models, use a path instead of an HF name, and ensure all workers have
+    # that path (ideally via a shared folder).
    if not os.path.exists(model_path):
-        parsed_args.model_path = await fetch_llm(model_path)
+        await fetch_llm(model_path)

+    # TODO: sglang downloads the model in `from_cli_args`, which means we have to
+    # call fetch_llm (download the model) here, in `parse_args`. `parse_args` should
+    # not contain code to download a model; it should only parse the args.
    server_args = ServerArgs.from_cli_args(parsed_args)

    if parsed_args.use_sglang_tokenizer:

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -79,13 +79,22 @@ async def worker():

    dump_config(config.dump_config_to, config)

-    # Download the model if necessary.
-    # register_llm would do this for us, but we want it on disk before we start vllm.
-    # Ensure the original HF name (e.g. "Qwen/Qwen3-0.6B") is used as the served_model_name.
+    # Name the model. Use either the full path (vllm and sglang do the same),
+    # or the HF name (e.g. "Qwen/Qwen3-0.6B"), depending on cmd line params.
    if not config.served_model_name:
        config.served_model_name = config.engine_args.served_model_name = config.model
+
+    # Download the model if necessary using modelexpress.
+    # We want it on disk before we start vllm to avoid downloading from HuggingFace.
+    #
+    # We don't set `config.engine_args.model` to the local path fetch_llm returns
+    # because vllm will send that name to its Ray pipeline-parallel workers, which
+    # may not have the local path.
+    # vllm will attempt to download the model again, but will find it in the HF cache.
+    # For non-HF models, use a path instead of an HF name, and ensure all workers have
+    # that path (ideally via a shared folder).
    if not os.path.exists(config.model):
-        config.model = config.engine_args.model = await fetch_llm(config.model)
+        await fetch_llm(config.model)

    # Route to appropriate initialization based on config flags
    if config.vllm_native_encoder_worker: