"docs/vscode:/vscode.git/clone" did not exist on "1e6709dbd46b11ef43ad6e664cfa12d4e8e88f37"
Unverified commit 86ae693f, authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
...@@ -92,7 +92,7 @@ def _run_test( ...@@ -92,7 +92,7 @@ def _run_test(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, with vllm_runner(model,
task="embed", runner="pooling",
dtype=dtype, dtype=dtype,
enforce_eager=True, enforce_eager=True,
max_model_len=8192) as vllm_model: max_model_len=8192) as vllm_model:
......
...@@ -49,7 +49,7 @@ def vllm_reranker( ...@@ -49,7 +49,7 @@ def vllm_reranker(
with vllm_runner( with vllm_runner(
model_name, model_name,
task="score", runner="pooling",
dtype=dtype, dtype=dtype,
max_num_seqs=2, max_num_seqs=2,
max_model_len=2048, max_model_len=2048,
......
...@@ -64,7 +64,7 @@ def _run_test( ...@@ -64,7 +64,7 @@ def _run_test(
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, with vllm_runner(model,
task="embed", runner="pooling",
dtype=dtype, dtype=dtype,
max_model_len=4096, max_model_len=4096,
enforce_eager=True) as vllm_model: enforce_eager=True) as vllm_model:
......
...@@ -44,7 +44,7 @@ def _run_test( ...@@ -44,7 +44,7 @@ def _run_test(
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, task="embed", dtype=dtype, with vllm_runner(model, runner="pooling", dtype=dtype,
enforce_eager=True) as vllm_model: enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images) vllm_outputs = vllm_model.embed(input_texts, images=input_images)
......
...@@ -34,7 +34,7 @@ def _run_test( ...@@ -34,7 +34,7 @@ def _run_test(
set_default_torch_num_threads(1), set_default_torch_num_threads(1),
vllm_runner( vllm_runner(
model, model,
task="embed", runner="pooling",
dtype=torch.float16, dtype=torch.float16,
enforce_eager=True, enforce_eager=True,
skip_tokenizer_init=True, skip_tokenizer_init=True,
......
...@@ -58,13 +58,10 @@ def _test_processing_correctness( ...@@ -58,13 +58,10 @@ def _test_processing_correctness(
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="auto",
tokenizer=model_info.tokenizer or model_id, tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="auto",
revision=model_info.revision, revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
......
...@@ -54,13 +54,10 @@ def test_hf_model_weights_mapper(model_arch: str): ...@@ -54,13 +54,10 @@ def test_hf_model_weights_mapper(model_arch: str):
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="auto",
tokenizer=model_info.tokenizer or model_id, tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="auto",
revision=None,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
......
...@@ -172,7 +172,7 @@ def test_4bit_bnb_embedding_model( ...@@ -172,7 +172,7 @@ def test_4bit_bnb_embedding_model(
# Inflight 4bit quantization # Inflight 4bit quantization
with vllm_runner(model_name, with vllm_runner(model_name,
task="embed", runner="pooling",
dtype=dtype, dtype=dtype,
gpu_memory_utilization=0.5, gpu_memory_utilization=0.5,
quantization="bitsandbytes") as vllm_model: quantization="bitsandbytes") as vllm_model:
......
...@@ -7,13 +7,15 @@ import pytest ...@@ -7,13 +7,15 @@ import pytest
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm import LLM from vllm import LLM
from vllm.config import ModelImpl
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.engine.core import EngineCore as V1EngineCore from vllm.v1.engine.core import EngineCore as V1EngineCore
from ..utils import create_new_process_for_each_test from ..utils import create_new_process_for_each_test
from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS, HfExampleModels)
@create_new_process_for_each_test() @create_new_process_for_each_test()
...@@ -126,6 +128,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, ...@@ -126,6 +128,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# these tests seem to produce leftover memory # these tests seem to produce leftover memory
gpu_memory_utilization=0.80, gpu_memory_utilization=0.80,
load_format="dummy", load_format="dummy",
model_impl=ModelImpl.TRANSFORMERS
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
) )
......
...@@ -24,11 +24,9 @@ from .registry import HF_EXAMPLE_MODELS ...@@ -24,11 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch): def test_registry_imports(model_arch):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip")
# Ensure all model classes can be imported successfully # Ensure all model classes can be imported successfully
model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) model_cls = ModelRegistry._try_load_model_cls(model_arch)
assert model_cls is not None
if model_arch in _SPECULATIVE_DECODING_MODELS: if model_arch in _SPECULATIVE_DECODING_MODELS:
return # Ignore these models which do not have a unified format return # Ignore these models which do not have a unified format
...@@ -56,14 +54,16 @@ def test_registry_imports(model_arch): ...@@ -56,14 +54,16 @@ def test_registry_imports(model_arch):
("XLMRobertaForSequenceClassification", False, False, True), ("XLMRobertaForSequenceClassification", False, False, True),
]) ])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce): def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
assert ModelRegistry.is_cross_encoder_model(model_arch) is is_ce assert model_info.supports_multimodal is is_mm
assert model_info.supports_cross_encoding is is_ce
if init_cuda and current_platform.is_cuda_alike(): if init_cuda and current_platform.is_cuda_alike():
assert not torch.cuda.is_initialized() assert not torch.cuda.is_initialized()
ModelRegistry.resolve_model_cls(model_arch) ModelRegistry._try_load_model_cls(model_arch)
if not torch.cuda.is_initialized(): if not torch.cuda.is_initialized():
warnings.warn( warnings.warn(
"This model no longer initializes CUDA on import. " "This model no longer initializes CUDA on import. "
...@@ -82,12 +82,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce): ...@@ -82,12 +82,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
("Qwen2VLForConditionalGeneration", True, True), ("Qwen2VLForConditionalGeneration", True, True),
]) ])
def test_registry_is_pp(model_arch, is_pp, init_cuda): def test_registry_is_pp(model_arch, is_pp, init_cuda):
assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
assert model_info.supports_pp is is_pp
if init_cuda and current_platform.is_cuda_alike(): if init_cuda and current_platform.is_cuda_alike():
assert not torch.cuda.is_initialized() assert not torch.cuda.is_initialized()
ModelRegistry.resolve_model_cls(model_arch) ModelRegistry._try_load_model_cls(model_arch)
if not torch.cuda.is_initialized(): if not torch.cuda.is_initialized():
warnings.warn( warnings.warn(
"This model no longer initializes CUDA on import. " "This model no longer initializes CUDA on import. "
......
...@@ -33,6 +33,10 @@ def check_implementation( ...@@ -33,6 +33,10 @@ def check_implementation(
args = (example_prompts, max_tokens, num_logprobs) args = (example_prompts, max_tokens, num_logprobs)
with runner_test(model, **kwargs_test, **kwargs) as model_test: with runner_test(model, **kwargs_test, **kwargs) as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
outputs_test = model_test.generate_greedy_logprobs(*args) outputs_test = model_test.generate_greedy_logprobs(*args)
with runner_ref(model, **kwargs_ref) as model_ref: with runner_ref(model, **kwargs_ref) as model_ref:
...@@ -130,8 +134,13 @@ def test_quantization( ...@@ -130,8 +134,13 @@ def test_quantization(
model_impl="transformers", model_impl="transformers",
enforce_eager=True, enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type] **quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
transformers_outputs = vllm_model.generate_greedy_logprobs( transformers_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs) example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=transformers_outputs, outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs, outputs_1_lst=vllm_outputs,
...@@ -151,7 +160,6 @@ def test_classify( ...@@ -151,7 +160,6 @@ def test_classify(
example_prompts, example_prompts,
model: str, model: str,
dtype: str, dtype: str,
monkeypatch,
) -> None: ) -> None:
import torch import torch
from transformers import AutoModelForSequenceClassification from transformers import AutoModelForSequenceClassification
...@@ -160,6 +168,10 @@ def test_classify( ...@@ -160,6 +168,10 @@ def test_classify(
max_model_len=512, max_model_len=512,
dtype=dtype, dtype=dtype,
model_impl="transformers") as vllm_model: model_impl="transformers") as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
vllm_outputs = vllm_model.classify(example_prompts) vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model, with hf_runner(model,
......
...@@ -8,7 +8,7 @@ from typing import Any, NamedTuple, Optional, Union ...@@ -8,7 +8,7 @@ from typing import Any, NamedTuple, Optional, Union
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm.config import ModelConfig, TaskOption from vllm.config import ModelConfig, RunnerOption
from vllm.inputs import InputContext from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
...@@ -255,7 +255,7 @@ def check_logprobs_close( ...@@ -255,7 +255,7 @@ def check_logprobs_close(
def build_model_context( def build_model_context(
model_id: str, model_id: str,
task: TaskOption = "auto", runner: RunnerOption = "auto",
dtype: Union[str, torch.dtype] = "auto", dtype: Union[str, torch.dtype] = "auto",
model_config_kwargs: Optional[dict[str, Any]] = None, model_config_kwargs: Optional[dict[str, Any]] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None,
...@@ -280,9 +280,10 @@ def build_model_context( ...@@ -280,9 +280,10 @@ def build_model_context(
model_config_kwargs = model_config_kwargs or {} model_config_kwargs = model_config_kwargs or {}
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task=task, runner=runner,
tokenizer=model_info.tokenizer or model_id, tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
dtype=dtype, dtype=dtype,
seed=0, seed=0,
......
...@@ -954,13 +954,6 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ...@@ -954,13 +954,6 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
model_config = ModelConfig( model_config = ModelConfig(
model=model_id, model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
...@@ -993,13 +986,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): ...@@ -993,13 +986,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
model_config = ModelConfig( model_config = ModelConfig(
model=model_id, model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) )
...@@ -1061,16 +1047,7 @@ class _ProcessorProxy: ...@@ -1061,16 +1047,7 @@ class _ProcessorProxy:
) )
# yapf: enable # yapf: enable
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs): def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
model_config = ModelConfig( model_config = ModelConfig(model_id)
model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
)
processor = MULTIMODAL_REGISTRY.create_processor(model_config) processor = MULTIMODAL_REGISTRY.create_processor(model_config)
orig_get_hf_processor = processor.info.get_hf_processor orig_get_hf_processor = processor.info.get_hf_processor
......
...@@ -57,15 +57,7 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None: ...@@ -57,15 +57,7 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
model_path, quantization_arg, expected_type = model_arg_exptype model_path, quantization_arg, expected_type = model_arg_exptype
try: try:
model_config = ModelConfig(model_path, model_config = ModelConfig(model_path, quantization=quantization_arg)
task="auto",
tokenizer=model_path,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
quantization=quantization_arg)
found_quantization_type = model_config.quantization found_quantization_type = model_config.quantization
except ValueError: except ValueError:
found_quantization_type = "ERROR" found_quantization_type = "ERROR"
......
...@@ -74,115 +74,116 @@ def test_update_config(): ...@@ -74,115 +74,116 @@ def test_update_config():
new_config3 = update_config(config3, {"a": "new_value"}) new_config3 = update_config(config3, {"a": "new_value"})
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"), ("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[ [
("distilbert/distilgpt2", "generate", "generate"), ("distilbert/distilgpt2", "generate", "none", "generate"),
("intfloat/multilingual-e5-small", "pooling", "embed"), ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"), ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none",
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), "classify"),
("openai/whisper-small", "generate", "transcription"), ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
("openai/whisper-small", "generate", "none", "transcription"),
], ],
) )
def test_auto_task(model_id, expected_runner_type, expected_task): def test_auto_task(model_id, expected_runner_type, expected_convert_type,
config = ModelConfig( expected_task):
model_id, config = ModelConfig(model_id, task="auto")
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
assert config.runner_type == expected_runner_type assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
if config.runner_type == "pooling":
assert config.task == expected_task
else:
assert expected_task in config.supported_tasks
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify",
"classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
("openai/whisper-small", "pooling", "embed", "embed"),
],
)
def test_score_task(model_id, expected_runner_type, expected_convert_type,
expected_task):
config = ModelConfig(model_id, task="score")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"), ("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[ [
("distilbert/distilgpt2", "pooling", "embed"), ("openai/whisper-small", "generate", "none", "transcription"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
("openai/whisper-small", "pooling", "embed"),
], ],
) )
def test_score_task(model_id, expected_runner_type, expected_task): def test_transcription_task(model_id, expected_runner_type,
config = ModelConfig( expected_convert_type, expected_task):
model_id, config = ModelConfig(model_id, task="transcription")
task="score",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
assert config.runner_type == expected_runner_type assert config.runner_type == expected_runner_type
assert config.task == expected_task assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
@pytest.mark.parametrize(("model_id", "expected_runner_type", "expected_task"), @pytest.mark.parametrize(
[ ("model_id", "expected_runner_type", "expected_convert_type"),
("Qwen/Qwen2.5-1.5B-Instruct", "draft", "auto"), [
]) ("distilbert/distilgpt2", "generate", "none"),
def test_draft_task(model_id, expected_runner_type, expected_task): ("intfloat/multilingual-e5-small", "pooling", "none"),
config = ModelConfig( ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
model_id, ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
runner="draft", ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
tokenizer=model_id, ("openai/whisper-small", "generate", "none"),
seed=0, ],
dtype="float16", )
) def test_auto_runner(model_id, expected_runner_type, expected_convert_type):
config = ModelConfig(model_id, runner="auto")
assert config.runner_type == expected_runner_type assert config.runner_type == expected_runner_type
assert config.task == expected_task assert config.convert_type == expected_convert_type
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"), ("model_id", "expected_runner_type", "expected_convert_type"),
[ [
("openai/whisper-small", "generate", "transcription"), ("distilbert/distilgpt2", "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "none"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
("openai/whisper-small", "pooling", "embed"),
], ],
) )
def test_transcription_task(model_id, expected_runner_type, expected_task): def test_pooling_runner(model_id, expected_runner_type, expected_convert_type):
config = ModelConfig( config = ModelConfig(model_id, runner="pooling")
model_id,
task="transcription",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
assert config.runner_type == expected_runner_type assert config.runner_type == expected_runner_type
assert config.task == expected_task assert config.convert_type == expected_convert_type
@pytest.mark.parametrize(("model_id", "bad_task"), [ @pytest.mark.parametrize(
("Qwen/Qwen2.5-Math-RM-72B", "generate"), ("model_id", "expected_runner_type", "expected_convert_type"),
("Qwen/Qwen3-0.6B", "transcription"), [
]) ("Qwen/Qwen2.5-1.5B-Instruct", "draft", "none"),
def test_incorrect_task(model_id, bad_task): ],
with pytest.raises(ValueError, match=r"does not support task=.*"): )
ModelConfig( def test_draft_runner(model_id, expected_runner_type, expected_convert_type):
model_id, config = ModelConfig(model_id, runner="draft")
task=bad_task,
tokenizer=model_id, assert config.runner_type == expected_runner_type
tokenizer_mode="auto", assert config.convert_type == expected_convert_type
trust_remote_code=False,
seed=0,
dtype="float16",
)
MODEL_IDS_EXPECTED = [ MODEL_IDS_EXPECTED = [
...@@ -195,17 +196,7 @@ MODEL_IDS_EXPECTED = [ ...@@ -195,17 +196,7 @@ MODEL_IDS_EXPECTED = [
@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED) @pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
def test_disable_sliding_window(model_id_expected): def test_disable_sliding_window(model_id_expected):
model_id, expected = model_id_expected model_id, expected = model_id_expected
model_config = ModelConfig( model_config = ModelConfig(model_id, disable_sliding_window=True)
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
disable_sliding_window=True,
)
assert model_config.max_model_len == expected assert model_config.max_model_len == expected
...@@ -214,16 +205,7 @@ def test_get_sliding_window(): ...@@ -214,16 +205,7 @@ def test_get_sliding_window():
# Test that the sliding window is correctly computed. # Test that the sliding window is correctly computed.
# For Qwen1.5/Qwen2, get_sliding_window() should be None # For Qwen1.5/Qwen2, get_sliding_window() should be None
# when use_sliding_window is False. # when use_sliding_window is False.
qwen2_model_config = ModelConfig( qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B")
"Qwen/Qwen1.5-7B",
task="auto",
tokenizer="Qwen/Qwen1.5-7B",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
qwen2_model_config.hf_config.use_sliding_window = False qwen2_model_config.hf_config.use_sliding_window = False
qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
...@@ -232,16 +214,7 @@ def test_get_sliding_window(): ...@@ -232,16 +214,7 @@ def test_get_sliding_window():
qwen2_model_config.hf_config.use_sliding_window = True qwen2_model_config.hf_config.use_sliding_window = True
assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
mistral_model_config = ModelConfig( mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1")
"mistralai/Mistral-7B-v0.1",
task="auto",
tokenizer="mistralai/Mistral-7B-v0.1",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
mistral_model_config.hf_config.sliding_window = None mistral_model_config.hf_config.sliding_window = None
assert mistral_model_config.get_sliding_window() is None assert mistral_model_config.get_sliding_window() is None
...@@ -253,16 +226,7 @@ def test_get_sliding_window(): ...@@ -253,16 +226,7 @@ def test_get_sliding_window():
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config(): def test_get_pooling_config():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig( model_config = ModelConfig(model_id)
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
pooling_config = model_config._init_pooler_config() pooling_config = model_config._init_pooler_config()
assert pooling_config is not None assert pooling_config is not None
...@@ -275,14 +239,7 @@ def test_get_pooling_config(): ...@@ -275,14 +239,7 @@ def test_get_pooling_config():
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config_from_args(): def test_get_pooling_config_from_args():
model_id = "sentence-transformers/all-MiniLM-L12-v2" model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig(model_id, model_config = ModelConfig(model_id)
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None)
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True) override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
model_config.override_pooler_config = override_pooler_config model_config.override_pooler_config = override_pooler_config
...@@ -295,16 +252,8 @@ def test_get_pooling_config_from_args(): ...@@ -295,16 +252,8 @@ def test_get_pooling_config_from_args():
@pytest.mark.skipif(current_platform.is_rocm(), @pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.") reason="Xformers backend is not supported on ROCm.")
def test_get_bert_tokenization_sentence_transformer_config(): def test_get_bert_tokenization_sentence_transformer_config():
bge_model_config = ModelConfig( model_id = "BAAI/bge-base-en-v1.5"
model="BAAI/bge-base-en-v1.5", bge_model_config = ModelConfig(model_id)
task="auto",
tokenizer="BAAI/bge-base-en-v1.5",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
bert_bge_model_config = bge_model_config._get_encoder_config() bert_bge_model_config = bge_model_config._get_encoder_config()
...@@ -317,27 +266,13 @@ def test_rope_customization(): ...@@ -317,27 +266,13 @@ def test_rope_customization():
TEST_ROPE_THETA = 16_000_000.0 TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0} LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
llama_model_config = ModelConfig( llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
"meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000 assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
assert llama_model_config.max_model_len == 8192 assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig( llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
hf_overrides={ hf_overrides={
"rope_scaling": TEST_ROPE_SCALING, "rope_scaling": TEST_ROPE_SCALING,
"rope_theta": TEST_ROPE_THETA, "rope_theta": TEST_ROPE_THETA,
...@@ -349,15 +284,7 @@ def test_rope_customization(): ...@@ -349,15 +284,7 @@ def test_rope_customization():
None) == TEST_ROPE_THETA None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384 assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig( longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
"lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
assert all( assert all(
longchat_model_config.hf_config.rope_scaling.get(key) == value longchat_model_config.hf_config.rope_scaling.get(key) == value
...@@ -366,12 +293,6 @@ def test_rope_customization(): ...@@ -366,12 +293,6 @@ def test_rope_customization():
longchat_model_config = ModelConfig( longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k", "lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
hf_overrides={ hf_overrides={
"rope_scaling": TEST_ROPE_SCALING, "rope_scaling": TEST_ROPE_SCALING,
}, },
...@@ -390,15 +311,7 @@ def test_rope_customization(): ...@@ -390,15 +311,7 @@ def test_rope_customization():
("meta-llama/Llama-3.2-11B-Vision", True), ("meta-llama/Llama-3.2-11B-Vision", True),
]) ])
def test_is_encoder_decoder(model_id, is_encoder_decoder): def test_is_encoder_decoder(model_id, is_encoder_decoder):
config = ModelConfig( config = ModelConfig(model_id)
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
assert config.is_encoder_decoder == is_encoder_decoder assert config.is_encoder_decoder == is_encoder_decoder
...@@ -408,15 +321,7 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder): ...@@ -408,15 +321,7 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
("Qwen/Qwen2-VL-2B-Instruct", True), ("Qwen/Qwen2-VL-2B-Instruct", True),
]) ])
def test_uses_mrope(model_id, uses_mrope): def test_uses_mrope(model_id, uses_mrope):
config = ModelConfig( config = ModelConfig(model_id)
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
assert config.uses_mrope == uses_mrope assert config.uses_mrope == uses_mrope
...@@ -426,26 +331,12 @@ def test_generation_config_loading(): ...@@ -426,26 +331,12 @@ def test_generation_config_loading():
# When set generation_config to "vllm", the default generation config # When set generation_config to "vllm", the default generation config
# will not be loaded. # will not be loaded.
model_config = ModelConfig(model_id, model_config = ModelConfig(model_id, generation_config="vllm")
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="vllm")
assert model_config.get_diff_sampling_param() == {} assert model_config.get_diff_sampling_param() == {}
# When set generation_config to "auto", the default generation config # When set generation_config to "auto", the default generation config
# should be loaded. # should be loaded.
model_config = ModelConfig(model_id, model_config = ModelConfig(model_id, generation_config="auto")
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="auto")
correct_generation_config = { correct_generation_config = {
"repetition_penalty": 1.1, "repetition_penalty": 1.1,
...@@ -461,12 +352,6 @@ def test_generation_config_loading(): ...@@ -461,12 +352,6 @@ def test_generation_config_loading():
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="auto", generation_config="auto",
override_generation_config=override_generation_config) override_generation_config=override_generation_config)
...@@ -479,12 +364,6 @@ def test_generation_config_loading(): ...@@ -479,12 +364,6 @@ def test_generation_config_loading():
# is set, the override_generation_config should be used directly. # is set, the override_generation_config should be used directly.
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="vllm", generation_config="vllm",
override_generation_config=override_generation_config) override_generation_config=override_generation_config)
...@@ -515,16 +394,7 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ...@@ -515,16 +394,7 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len, def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
should_raise): should_raise):
"""Test get_and_verify_max_len with different configurations.""" """Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig( model_config = ModelConfig(model_id)
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
if should_raise: if should_raise:
with pytest.raises(ValueError): with pytest.raises(ValueError):
......
...@@ -21,13 +21,8 @@ def test_max_tokens_none(): ...@@ -21,13 +21,8 @@ def test_max_tokens_none():
def model_config(): def model_config():
return ModelConfig( return ModelConfig(
MODEL_NAME, MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0, seed=0,
dtype="float16", dtype="float16",
revision=None,
) )
......
...@@ -695,11 +695,7 @@ def test_estimate_max_model_len(model_id, max_model_len, ...@@ -695,11 +695,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
# Create a VllmConfig # Create a VllmConfig
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="generate", runner="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16", dtype="float16",
max_model_len=max_model_len, max_model_len=max_model_len,
) )
...@@ -733,11 +729,7 @@ def test_get_max_concurrency_for_kv_cache_config(): ...@@ -733,11 +729,7 @@ def test_get_max_concurrency_for_kv_cache_config():
max_model_len = 16384 max_model_len = 16384
model_config = ModelConfig( model_config = ModelConfig(
model_id, model_id,
task="generate", runner="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16", dtype="float16",
max_model_len=max_model_len, max_model_len=max_model_len,
) )
......
...@@ -1248,9 +1248,6 @@ def create_scheduler_with_priority( ...@@ -1248,9 +1248,6 @@ def create_scheduler_with_priority(
) )
model_config = ModelConfig( model_config = ModelConfig(
model=model, model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
seed=42, seed=42,
......
...@@ -59,9 +59,6 @@ def create_scheduler( ...@@ -59,9 +59,6 @@ def create_scheduler(
) )
model_config = ModelConfig( model_config = ModelConfig(
model=model, model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
seed=42, seed=42,
......
...@@ -68,9 +68,6 @@ def create_vllm_config( ...@@ -68,9 +68,6 @@ def create_vllm_config(
) )
model_config = ModelConfig( model_config = ModelConfig(
model=model, model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="float16", dtype="float16",
seed=42, seed=42,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment