Unverified commit d215d1ef, authored by Harry Mellor and committed by GitHub
Browse files

[Mypy] Better fixes for the `mypy` issues in `vllm/config` (#37902)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 34d317dc
...@@ -42,7 +42,6 @@ details. ...@@ -42,7 +42,6 @@ details.
import random import random
import time import time
from dataclasses import fields
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
...@@ -124,7 +123,7 @@ def main(args): ...@@ -124,7 +123,7 @@ def main(args):
# Create the LLM engine # Create the LLM engine
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------") print("------warm up------")
......
...@@ -32,7 +32,6 @@ import dataclasses ...@@ -32,7 +32,6 @@ import dataclasses
import json import json
import random import random
import time import time
from dataclasses import fields
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
...@@ -197,7 +196,7 @@ def main(args): ...@@ -197,7 +196,7 @@ def main(args):
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
......
...@@ -6,7 +6,6 @@ import argparse ...@@ -6,7 +6,6 @@ import argparse
import json import json
import random import random
import time import time
from dataclasses import fields
from transformers import AutoTokenizer, PreTrainedTokenizerBase from transformers import AutoTokenizer, PreTrainedTokenizerBase
...@@ -79,7 +78,7 @@ def run_vllm( ...@@ -79,7 +78,7 @@ def run_vllm(
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
assert all( assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2]) llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
......
...@@ -9,7 +9,6 @@ on HuggingFace model repository. ...@@ -9,7 +9,6 @@ on HuggingFace model repository.
""" """
import os import os
from dataclasses import asdict
from typing import Any, NamedTuple from typing import Any, NamedTuple
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -633,7 +632,7 @@ def main(args): ...@@ -633,7 +632,7 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {} req_data.engine_args.limit_mm_per_prompt or {}
) )
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = vars(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None: if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
......
...@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation. ...@@ -8,7 +8,6 @@ the explicit/implicit prompt format on enc-dec LMMs for text generation.
import os import os
import time import time
from collections.abc import Sequence from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams from vllm import LLM, EngineArgs, PromptType, SamplingParams
...@@ -91,13 +90,12 @@ def main(args): ...@@ -91,13 +90,12 @@ def main(args):
req_data = model_example_map[model]() req_data = model_example_map[model]()
# Disable other modalities to save memory # Disable other modalities to save memory
engine_args = req_data.engine_args
default_limits = {"image": 0, "video": 0, "audio": 0} default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict( limit_mm_per_prompt = default_limits | (engine_args.limit_mm_per_prompt or {})
req_data.engine_args.limit_mm_per_prompt or {} engine_args.limit_mm_per_prompt = limit_mm_per_prompt
) engine_args.seed = args.seed
llm = LLM.from_engine_args(engine_args)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
prompts = req_data.prompts prompts = req_data.prompts
......
...@@ -20,8 +20,6 @@ python load_sharded_state.py \ ...@@ -20,8 +20,6 @@ python load_sharded_state.py \
--max-tokens 50 --max-tokens 50
""" """
import dataclasses
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
...@@ -64,7 +62,7 @@ def main(): ...@@ -64,7 +62,7 @@ def main():
print(f"Tensor parallel size: {engine_args.tensor_parallel_size}") print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
# Load the model using engine args # Load the model using engine args
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM.from_engine_args(engine_args)
# Prepare sampling parameters # Prepare sampling parameters
sampling_params = SamplingParams( sampling_params = SamplingParams(
......
...@@ -21,7 +21,6 @@ llm = LLM( ...@@ -21,7 +21,6 @@ llm = LLM(
) )
""" """
import dataclasses
import os import os
import shutil import shutil
from pathlib import Path from pathlib import Path
...@@ -60,7 +59,7 @@ def main(args): ...@@ -60,7 +59,7 @@ def main(args):
if not Path(model_path).is_dir(): if not Path(model_path).is_dir():
raise ValueError("model path must be a local directory") raise ValueError("model path must be a local directory")
# Create LLM instance from arguments # Create LLM instance from arguments
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM.from_engine_args(engine_args)
# Prepare output directory # Prepare output directory
Path(args.output).mkdir(exist_ok=True) Path(args.output).mkdir(exist_ok=True)
# Dump worker states to output directory # Dump worker states to output directory
......
...@@ -11,7 +11,6 @@ on HuggingFace model repository. ...@@ -11,7 +11,6 @@ on HuggingFace model repository.
import os import os
import random import random
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple from typing import NamedTuple
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -2434,13 +2433,13 @@ def main(args): ...@@ -2434,13 +2433,13 @@ def main(args):
req_data.engine_args.limit_mm_per_prompt or {} req_data.engine_args.limit_mm_per_prompt or {}
) )
engine_args = asdict(req_data.engine_args) | { engine_args = req_data.engine_args
"seed": args.seed, engine_args.seed = args.seed
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, mm_processor_cache_gb = 0 if args.disable_mm_processor_cache else 4
} engine_args.mm_processor_cache_gb = mm_processor_cache_gb
if args.tensor_parallel_size is not None: if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size engine_args.tensor_parallel_size = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM.from_engine_args(engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = ( prompts = (
......
...@@ -8,7 +8,6 @@ using the chat template defined by the model. ...@@ -8,7 +8,6 @@ using the chat template defined by the model.
import os import os
from argparse import Namespace from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple from typing import NamedTuple
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
...@@ -1481,10 +1480,11 @@ def run_generate( ...@@ -1481,10 +1480,11 @@ def run_generate(
): ):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None: if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM.from_engine_args(engine_args)
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
...@@ -1521,10 +1521,11 @@ def run_chat( ...@@ -1521,10 +1521,11 @@ def run_chat(
req_data.engine_args.limit_mm_per_prompt or {} req_data.engine_args.limit_mm_per_prompt or {}
) )
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = req_data.engine_args
engine_args.seed = seed
if tensor_parallel_size is not None: if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size engine_args.tensor_parallel_size = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM.from_engine_args(engine_args)
sampling_params = ( sampling_params = (
SamplingParams( SamplingParams(
......
...@@ -10,12 +10,11 @@ on HuggingFace model repository. ...@@ -10,12 +10,11 @@ on HuggingFace model repository.
""" """
import argparse import argparse
from dataclasses import asdict
from pathlib import Path from pathlib import Path
from PIL.Image import Image from PIL.Image import Image
from vllm import LLM, EngineArgs from vllm import LLM
from vllm.multimodal.utils import fetch_image from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings from vllm.utils.print_utils import print_embeddings
...@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)} ...@@ -28,14 +27,13 @@ multi_modal_data = {"image": fetch_image(image_url)}
def run_clip(seed: int): def run_clip(seed: int):
engine_args = EngineArgs( llm = LLM(
model="openai/clip-vit-base-patch32", model="openai/clip-vit-base-patch32",
runner="pooling", runner="pooling",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
seed=seed,
) )
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:") print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False) outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding) print_embeddings(outputs[0].outputs.embedding)
...@@ -53,15 +51,14 @@ def run_clip(seed: int): ...@@ -53,15 +51,14 @@ def run_clip(seed: int):
def run_e5_v(seed: int): def run_e5_v(seed: int):
engine_args = EngineArgs( llm = LLM(
model="royokong/e5-v", model="royokong/e5-v",
runner="pooling", runner="pooling",
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
seed=seed,
) )
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:") print("Text embedding output:")
...@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int): ...@@ -108,20 +105,20 @@ def run_qwen3_vl(seed: int):
multi_modal_data["image"] = post_process_image(multi_modal_data["image"]) multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
)
default_instruction = "Represent the user's input." default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>" image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n" prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n" prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n" prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args) | {"seed": seed}) llm = LLM(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
seed=seed,
)
print("Text embedding output:") print("Text embedding output:")
outputs = llm.embed(prompt_text, use_tqdm=False) outputs = llm.embed(prompt_text, use_tqdm=False)
...@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int): ...@@ -149,14 +146,13 @@ def run_qwen3_vl(seed: int):
def run_siglip(seed: int): def run_siglip(seed: int):
engine_args = EngineArgs( llm = LLM(
model="google/siglip-base-patch16-224", model="google/siglip-base-patch16-224",
runner="pooling", runner="pooling",
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
seed=seed,
) )
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:") print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False) outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding) print_embeddings(outputs[0].outputs.embedding)
...@@ -174,16 +170,15 @@ def run_siglip(seed: int): ...@@ -174,16 +170,15 @@ def run_siglip(seed: int):
def run_vlm2vec_phi3v(seed: int): def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs( llm = LLM(
model="TIGER-Lab/VLM2Vec-Full", model="TIGER-Lab/VLM2Vec-Full",
runner="pooling", runner="pooling",
max_model_len=4096, max_model_len=4096,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4}, mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
seed=seed,
) )
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>" image_token = "<|image_1|>"
print("Text embedding output:") print("Text embedding output:")
...@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int): ...@@ -259,7 +254,7 @@ def run_vlm2vec_qwen2vl(seed: int):
processor.save_pretrained(merged_path) processor.save_pretrained(merged_path)
print("Done!") print("Done!")
engine_args = EngineArgs( llm = LLM(
model=merged_path, model=merged_path,
runner="pooling", runner="pooling",
max_model_len=4096, max_model_len=4096,
...@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int): ...@@ -268,9 +263,8 @@ def run_vlm2vec_qwen2vl(seed: int):
"max_pixels": 12845056, "max_pixels": 12845056,
}, },
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
seed=seed,
) )
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>" image_token = "<|image_pad|>"
print("Text embedding output:") print("Text embedding output:")
......
...@@ -10,7 +10,6 @@ multimodal documents (text + images/videos). ...@@ -10,7 +10,6 @@ multimodal documents (text + images/videos).
from argparse import Namespace from argparse import Namespace
from collections.abc import Callable from collections.abc import Callable
from dataclasses import asdict
from pathlib import Path from pathlib import Path
from typing import NamedTuple from typing import NamedTuple
...@@ -125,7 +124,7 @@ def main(args: Namespace): ...@@ -125,7 +124,7 @@ def main(args: Namespace):
model_request = model_example_map[args.model_name]() model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args)) llm = LLM.from_engine_args(engine_args)
print("Query: string & Document: string") print("Query: string & Document: string")
outputs = llm.score(query, document) outputs = llm.score(query, document)
......
...@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init( ...@@ -414,9 +414,12 @@ def test_cudagraph_sizes_post_init(
ctx, ctx,
patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size), patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size),
): ):
kwargs = {}
if cudagraph_capture_sizes is not None:
kwargs["cudagraph_capture_sizes"] = cudagraph_capture_sizes
if max_cudagraph_capture_size is not None:
kwargs["max_cudagraph_capture_size"] = max_cudagraph_capture_size
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
cudagraph_capture_sizes=cudagraph_capture_sizes,
max_cudagraph_capture_size=max_cudagraph_capture_size,
pass_config=PassConfig( pass_config=PassConfig(
enable_sp=enable_sp, enable_sp=enable_sp,
fuse_norm_quant=True, fuse_norm_quant=True,
...@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init( ...@@ -425,6 +428,7 @@ def test_cudagraph_sizes_post_init(
sp_min_token_num=512 if enable_sp else None, sp_min_token_num=512 if enable_sp else None,
), ),
cudagraph_mode=cudagraph_mode, cudagraph_mode=cudagraph_mode,
**kwargs,
) )
engine_args = EngineArgs( engine_args = EngineArgs(
model="facebook/opt-125m", model="facebook/opt-125m",
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode""" """Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib import importlib
import sys import sys
...@@ -12,7 +11,6 @@ import urllib3 ...@@ -12,7 +11,6 @@ import urllib3
from vllm import LLM from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
MODEL_CONFIGS = [ MODEL_CONFIGS = [
{ {
...@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): ...@@ -160,8 +158,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
# Need to re-import huggingface_hub # Need to re-import huggingface_hub
# and friends to set up offline mode # and friends to set up offline mode
_re_import_modules() _re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m") LLM(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args))
finally: finally:
# Reset the environment after the test # Reset the environment after the test
# NB: Assuming tests are run in online mode # NB: Assuming tests are run in online mode
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple from typing import NamedTuple
import pytest import pytest
...@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str): ...@@ -29,14 +28,6 @@ def test_keye_vl(image_assets, question: str):
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images] image_urls = [encode_image_url(image) for image in images]
engine_args = EngineArgs(
model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = [{"type": "image", "image": url} for url in image_urls] placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [ messages = [
{ {
...@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str): ...@@ -54,8 +45,14 @@ def test_keye_vl(image_assets, question: str):
messages, tokenize=False, add_generation_prompt=True messages, tokenize=False, add_generation_prompt=True
) )
engine_args = asdict(engine_args) | {"seed": 42} llm = LLM(
llm = LLM(**engine_args) model=MODEL_NAME,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
seed=42,
)
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.0, max_tokens=256, stop_token_ids=None temperature=0.0, max_tokens=256, stop_token_ids=None
......
...@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs ...@@ -7,13 +7,12 @@ This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend. using different ViT attention backends. Tests are parametrized by model and backend.
""" """
from dataclasses import asdict
from typing import Any from typing import Any
import pytest import pytest
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.multimodal.utils import encode_image_url from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets): ...@@ -274,7 +273,7 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)}) limit_mm_per_prompt = config.get("limit_mm_per_prompt", {"image": len(images)})
# Create engine # Create engine
engine_args = EngineArgs( llm = LLM(
model=config["model_name"], model=config["model_name"],
trust_remote_code=True, trust_remote_code=True,
max_model_len=config["max_model_len"], max_model_len=config["max_model_len"],
...@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets): ...@@ -283,11 +282,9 @@ def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend, mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides, hf_overrides=dummy_hf_overrides,
load_format="dummy", load_format="dummy",
seed=42,
) )
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate # Generate
sampling_params = SamplingParams(**config["sampling_params"]) sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.generate( outputs = llm.generate(
...@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets): ...@@ -318,7 +315,7 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
messages = build_dots_ocr_prompt([stop_sign_image], config) messages = build_dots_ocr_prompt([stop_sign_image], config)
# Create engine # Create engine
engine_args = EngineArgs( llm = LLM(
model=config["model_name"], model=config["model_name"],
trust_remote_code=True, trust_remote_code=True,
max_model_len=config["max_model_len"], max_model_len=config["max_model_len"],
...@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets): ...@@ -327,11 +324,9 @@ def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
mm_encoder_attn_backend=mm_encoder_attn_backend, mm_encoder_attn_backend=mm_encoder_attn_backend,
hf_overrides=dummy_hf_overrides, hf_overrides=dummy_hf_overrides,
load_format="dummy", load_format="dummy",
seed=42,
) )
engine_dict = asdict(engine_args) | {"seed": 42}
llm = LLM(**engine_dict)
# Generate using chat # Generate using chat
sampling_params = SamplingParams(**config["sampling_params"]) sampling_params = SamplingParams(**config["sampling_params"])
outputs = llm.chat(messages=messages, sampling_params=sampling_params) outputs = llm.chat(messages=messages, sampling_params=sampling_params)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import contextlib import contextlib
from dataclasses import asdict
import pytest import pytest
import pytest_asyncio import pytest_asyncio
...@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer: ...@@ -75,7 +74,7 @@ def tokenizer() -> MistralTokenizer:
@pytest.fixture @pytest.fixture
def engine(): def engine():
engine_args = EngineArgs(**ENGINE_CONFIG) engine_args = EngineArgs(**ENGINE_CONFIG)
llm = LLM(**asdict(engine_args)) llm = LLM.from_engine_args(engine_args)
try: try:
yield llm yield llm
finally: finally:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import asdict
from typing import NamedTuple from typing import NamedTuple
import pytest import pytest
from PIL import Image from PIL import Image
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.config import AttentionConfig, KVTransferConfig from vllm.config import AttentionConfig, KVTransferConfig
from vllm.multimodal.utils import encode_image_url from vllm.multimodal.utils import encode_image_url
...@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend): ...@@ -129,24 +128,6 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
# Using tmp_path as the storage path to store KV # Using tmp_path as the storage path to store KV
print(f"KV storage path at: {str(tmp_path)}") print(f"KV storage path at: {str(tmp_path)}")
# Configure the ExampleConnector
kv_transfer_config = KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
)
engine_args = EngineArgs(
model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=kv_transfer_config,
limit_mm_per_prompt={"image": 2},
)
# don't put this import at the top level # don't put this import at the top level
# it will call torch.accelerator.device_count() # it will call torch.accelerator.device_count()
from transformers import AutoProcessor from transformers import AutoProcessor
...@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend): ...@@ -163,8 +144,20 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend):
assert image_1 != image_2, "The images should not be identical" assert image_1 != image_2, "The images should not be identical"
# Create the LLM instance # Create the LLM instance
engine_args = asdict(engine_args) llm = LLM(
llm = LLM(**engine_args) model=MODEL_NAME,
max_model_len=8192,
max_num_seqs=1,
gpu_memory_utilization=0.4,
attention_config=AttentionConfig(backend=attn_backend),
enforce_eager=True,
kv_transfer_config=KVTransferConfig(
kv_connector="ExampleConnector",
kv_role="kv_both",
kv_connector_extra_config={"shared_storage_path": str(tmp_path)},
),
limit_mm_per_prompt={"image": 2},
)
# Prepare the input cases # Prepare the input cases
input_cases = [ input_cases = [
......
...@@ -6,7 +6,6 @@ import argparse ...@@ -6,7 +6,6 @@ import argparse
import json import json
import os import os
import time import time
from dataclasses import fields
from typing import Any from typing import Any
import numpy as np import numpy as np
...@@ -85,7 +84,7 @@ def main(args: argparse.Namespace): ...@@ -85,7 +84,7 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch, # NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches. # the engine will automatically process the request in multiple batches.
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
assert llm.llm_engine.model_config.max_model_len >= ( assert llm.llm_engine.model_config.max_model_len >= (
args.input_len + args.output_len args.input_len + args.output_len
), ( ), (
......
...@@ -17,7 +17,6 @@ import argparse ...@@ -17,7 +17,6 @@ import argparse
import json import json
import time import time
from collections import defaultdict from collections import defaultdict
from dataclasses import fields
from datetime import datetime from datetime import datetime
from typing import TYPE_CHECKING, Any, Literal from typing import TYPE_CHECKING, Any, Literal
...@@ -225,7 +224,7 @@ def benchmark_multimodal_processor( ...@@ -225,7 +224,7 @@ def benchmark_multimodal_processor(
args.seed = 0 args.seed = 0
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
tokenizer = llm.get_tokenizer() tokenizer = llm.get_tokenizer()
requests = get_requests(args, tokenizer) requests = get_requests(args, tokenizer)
......
...@@ -16,7 +16,6 @@ import shutil ...@@ -16,7 +16,6 @@ import shutil
import tempfile import tempfile
import time import time
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import fields
from typing import Any from typing import Any
import numpy as np import numpy as np
...@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue): ...@@ -67,7 +66,7 @@ def run_startup_in_subprocess(engine_args, result_queue):
# Measure total startup time # Measure total startup time
start_time = time.perf_counter() start_time = time.perf_counter()
llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)}) llm = LLM.from_engine_args(engine_args)
total_startup_time = time.perf_counter() - start_time total_startup_time = time.perf_counter() - start_time
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment