Unverified commit fcc4a60f, authored by Graham King, committed by GitHub
Browse files

fix(vllmsglang): Give sglang/vllm the HF model name not the full path (#5274)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 55c26654
...@@ -490,14 +490,23 @@ async def parse_args(args: list[str]) -> Config: ...@@ -490,14 +490,23 @@ async def parse_args(args: list[str]) -> Config:
) )
logging.debug(f"Dynamo args: {dynamo_args}") logging.debug(f"Dynamo args: {dynamo_args}")
# TODO: sglang downloads the model in `from_cli_args`, so we need to do it here.
# That's unfortunate because `parse_args` isn't the right place for this. Fix.
model_path = parsed_args.model_path model_path = parsed_args.model_path
# Name the model
if not parsed_args.served_model_name: if not parsed_args.served_model_name:
parsed_args.served_model_name = model_path parsed_args.served_model_name = model_path
# Download the model if necessary using modelexpress.
# We don't set `parsed_args.model_path` to the local path fetch_llm returns
# because sglang will send this to its pipeline-parallel workers, which may
# not have the local path.
# sglang will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder).
if not os.path.exists(model_path): if not os.path.exists(model_path):
parsed_args.model_path = await fetch_llm(model_path) await fetch_llm(model_path)
# TODO: sglang downloads the model in `from_cli_args`, which means we had to
# fetch_llm (download the model) here, in `parse_args`. `parse_args` should not
# contain code to download a model, it should only parse the args.
server_args = ServerArgs.from_cli_args(parsed_args) server_args = ServerArgs.from_cli_args(parsed_args)
if parsed_args.use_sglang_tokenizer: if parsed_args.use_sglang_tokenizer:
......
...@@ -79,13 +79,22 @@ async def worker(): ...@@ -79,13 +79,22 @@ async def worker():
dump_config(config.dump_config_to, config) dump_config(config.dump_config_to, config)
# Download the model if necessary. # Name the model. Use either the full path (vllm and sglang do the same),
# register_llm would do this for us, but we want it on disk before we start vllm. # or the HF name (e.g. "Qwen/Qwen3-0.6B"), depending on cmd line params.
# Ensure the original HF name (e.g. "Qwen/Qwen3-0.6B") is used as the served_model_name.
if not config.served_model_name: if not config.served_model_name:
config.served_model_name = config.engine_args.served_model_name = config.model config.served_model_name = config.engine_args.served_model_name = config.model
# Download the model if necessary using modelexpress.
# We want it on disk before we start vllm to avoid downloading from HuggingFace.
#
# We don't set `config.engine_args.model` to the local path fetch_llm returns
# because vllm will send that name to its Ray pipeline-parallel workers, which
# may not have the local path.
# vllm will attempt to download the model again, but find it in the HF cache.
# For non-HF models use a path instead of an HF name, and ensure all workers have
# that path (ideally via a shared folder).
if not os.path.exists(config.model): if not os.path.exists(config.model):
config.model = config.engine_args.model = await fetch_llm(config.model) await fetch_llm(config.model)
# Route to appropriate initialization based on config flags # Route to appropriate initialization based on config flags
if config.vllm_native_encoder_worker: if config.vllm_native_encoder_worker:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment