Unverified commit d215d1ef, authored by Harry Mellor, committed by GitHub
Browse files

[Mypy] Better fixes for the `mypy` issues in `vllm/config` (#37902)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 34d317dc
......@@ -42,7 +42,6 @@ details.
import random
import time
from dataclasses import fields
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
......@@ -124,7 +123,7 @@ def main(args):
# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
......
......@@ -32,7 +32,6 @@ import dataclasses
import json
import random
import time
from dataclasses import fields
from transformers import PreTrainedTokenizerBase
......@@ -197,7 +196,7 @@ def main(args):
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0,
......
......@@ -6,7 +6,6 @@ import argparse
import json
import random
import time
from dataclasses import fields
from transformers import AutoTokenizer, PreTrainedTokenizerBase
......@@ -79,7 +78,7 @@ def run_vllm(
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
......
......@@ -9,7 +9,6 @@ on HuggingFace model repository.
"""
import os
from dataclasses import asdict
from typing import Any, NamedTuple
from huggingface_hub import snapshot_download
......@@ -633,7 +632,7 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
engine_args = vars(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
......
......@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
import os
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
......@@ -91,13 +90,12 @@ def main(args):
req_data = model_example_map[model]()
# Disable other modalities to save memory
engine_args = req_data.engine_args
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
engine_args.limit_mm_per_prompt = limit_mm_per_prompt
engine_args.seed = args.seed
llm = LLM.from_engine_args(engine_args)
prompts = req_data.prompts
......
......@@ -20,8 +20,6 @@ python load_sharded_state.py \
--max-tokens 50
"""
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
......@@ -64,7 +62,7 @@ def main():
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
# Load the model using engine args
llm = LLM(**dataclasses.asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
# Prepare sampling parameters
sampling_params = SamplingParams(
......
......@@ -21,7 +21,6 @@ llm = LLM(
)
"""
import dataclasses
import os
import shutil
from pathlib import Path
......@@ -60,7 +59,7 @@ def main(args):
if not Path(model_path).is_dir():
raise ValueError("model path must be a local directory")
# Create LLM instance from arguments
llm = LLM(**dataclasses.asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
# Prepare output directory
Path(args.output).mkdir(exist_ok=True)
# Dump worker states to output directory
......
......@@ -11,7 +11,6 @@ on HuggingFace model repository.
import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple
from huggingface_hub import snapshot_download
......@@ -2434,13 +2433,13 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {
"seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
}
engine_args = req_data.engine_args
engine_args.seed = args.seed
mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
engine_args.mm_processor_cache_gb = mm_processor_cache_gb
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = args.tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = (
......
......@@ -8,7 +8,6 @@ using the chat template defined by the model.
import os
from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple
from huggingface_hub import snapshot_download
......@@ -1481,10 +1480,11 @@ def run_generate(
):
req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
......@@ -1521,10 +1521,11 @@ def run_chat(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args)
engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM.from_engine_args(engine_args)
sampling_params = (
SamplingParams(
......
......@@ -10,12 +10,11 @@ on HuggingFace model repository.
"""
import argparse
from dataclasses import asdict
from pathlib import Path
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm import LLM
from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings
......@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}
def run_clip(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
......@@ -53,15 +51,14 @@ def run_clip(seed: int):
def run_e5_v(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:")
......@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
)
default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args) | {"seed": seed})
llm = LLM(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
seed=seed,
)
print("Text embedding output:")
outputs = llm.embed(prompt_text, use_tqdm=False)
......@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):
def run_siglip(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
......@@ -174,16 +170,15 @@ def run_siglip(seed: int):
def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs(
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>"
print("Text embedding output:")
......@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
processor.save_pretrained(merged_path)
print("Done!")
engine_args = EngineArgs(
llm = LLM(
model=merged_path,
runner="pooling",
max_model_len=4096,
......@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int):
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
seed=seed,
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>"
print("Text embedding output:")
......
......@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).
from argparse import Namespace
from collections.abc import Callable
from dataclasses import asdict
from pathlib import Path
from typing import NamedTuple
......@@ -125,7 +124,7 @@ def main(args: Namespace):
model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
print("Query: string & Document: string")
outputs = llm.score(query, document)
......
......@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
ctx,
patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
):
kwargs = {}
if cudagraph_capture_sizes is not None:
kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
if max_cudagraph_capture_size is not None:
kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
compilation_config = CompilationConfig(
cudagraph_capture_sizes=cudagraph_capture_sizes,
max_cudagraph_capture_size=max_cudagraph_capture_size,
pass_config=PassConfig(
enable_sp=enable_sp,
fuse_norm_quant=True,
......@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init(
sp_min_token_num=512 if enable_sp else None,
),
cudagraph_mode=cudagraph_mode,
**kwargs,
)
engine_args = EngineArgs(
model="facebook/opt-125m",
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib
import sys
......@@ -12,7 +11,6 @@ import urllib3
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
MODEL_CONFIGS = [
{
......@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
# Need to re-import huggingface_hub
# and friends to set up offline mode
_re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args))
LLM(model="facebook/opt-125m")
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
......@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images]
engine_args = EngineArgs(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
......@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str):
messages, tokenize=False, add_generation_prompt=True
)
engine_args = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_args)
llm = LLM(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
seed=42,
)
sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=None
......
......@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""
from dataclasses import asdict
from typing import Any
import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm import LLM, SamplingParams
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
......@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
# Create engine
engine_args = EngineArgs(
llm = LLM(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
......@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
seed=42,
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.generate(
......@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
messages = build_dots_ocr_prompt([stop_sign_image], config)
# Create engine
engine_args = EngineArgs(
llm = LLM(
model=config["model_name"],
trust_remote_code=True,
max_model_len=config["max_model_len"],
......@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides,
load_format="dummy",
seed=42,
)
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate using chat
sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.chat(messages=messages, sampling_params=sampling_params)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib
from dataclasses import asdict
import pytest
import pytest_asyncio
......@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
@pytest.fixture
def engine():
engine_args = EngineArgs(**ENGINE_CONFIG)
llm = LLM(**asdict(engine_args))
llm = LLM.from_engine_args(engine_args)
try:
yield llm
finally:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple
import pytest
from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.config import AttentionConfig, KVTransferConfig
from vllm.multimodal.utils import encode_image_url
......@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
# Using tmp_path as the storage path to store KV
print(f"KV storage path at: {str(tmp_path)}")
# Configure the ExampleConnector
kv_transfer_config = KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
)
engine_args = EngineArgs(
model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=kv_transfer_config,
limit_mm_per_prompt={"image": 2},
)
# don't put this import at the top level
# it will call torch.accelerator.device_count()
from transformers import AutoProcessor
......@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
assert image_1 != image_2, "The images should not be identical"
# Create the LLM instance
engine_args = asdict(engine_args)
llm = LLM(**engine_args)
llm = LLM(
model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
),
limit_mm_per_prompt={"image": 2},
)
# Prepare the input cases
input_cases = [
......
......@@ -6,7 +6,6 @@ import argparse
import json
import os
import time
from dataclasses import fields
from typing import Any
import numpy as np
......@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len
), (
......
......@@ -17,7 +17,6 @@ import argparse
import json
import time
from collections import defaultdict
from dataclasses import fields
from datetime import datetime
from typing import TYPE_CHECKING, Any, Literal
......@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
args.seed = 0
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
tokenizer = llm.get_tokenizer()
requests = get_requests(args, tokenizer)
......
......@@ -16,7 +16,6 @@ import shutil
import tempfile
import time
from contextlib import contextmanager
from dataclasses import fields
from typing import Any
import numpy as np
......@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
# Measure total startup time
start_time = time.perf_counter()
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
llm = LLM.from_engine_args(engine_args)
total_startup_time = time.perf_counter() - start_time
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment