"tests/vscode:/vscode.git/clone" did not exist on "48f589e18b8b6758dbfb6bb23b2994430893b477"
Unverified commit 86ae693f, authored by Cyrus Leung and committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
......@@ -92,7 +92,7 @@ def _run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
task="embed",
runner="pooling",
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
......
......@@ -49,7 +49,7 @@ def vllm_reranker(
with vllm_runner(
model_name,
task="score",
runner="pooling",
dtype=dtype,
max_num_seqs=2,
max_model_len=2048,
......
......@@ -64,7 +64,7 @@ def _run_test(
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
task="embed",
runner="pooling",
dtype=dtype,
max_model_len=4096,
enforce_eager=True) as vllm_model:
......
......@@ -44,7 +44,7 @@ def _run_test(
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model, task="embed", dtype=dtype,
with vllm_runner(model, runner="pooling", dtype=dtype,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
......
......@@ -34,7 +34,7 @@ def _run_test(
set_default_torch_num_threads(1),
vllm_runner(
model,
task="embed",
runner="pooling",
dtype=torch.float16,
enforce_eager=True,
skip_tokenizer_init=True,
......
......@@ -58,13 +58,10 @@ def _test_processing_correctness(
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="auto",
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
)
......
......@@ -54,13 +54,10 @@ def test_hf_model_weights_mapper(model_arch: str):
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="auto",
revision=None,
hf_overrides=model_info.hf_overrides,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
......
......@@ -172,7 +172,7 @@ def test_4bit_bnb_embedding_model(
# Inflight 4bit quantization
with vllm_runner(model_name,
task="embed",
runner="pooling",
dtype=dtype,
gpu_memory_utilization=0.5,
quantization="bitsandbytes") as vllm_model:
......
......@@ -7,13 +7,15 @@ import pytest
from transformers import PretrainedConfig
from vllm import LLM
from vllm.config import ModelImpl
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.engine.core import EngineCore as V1EngineCore
from ..utils import create_new_process_for_each_test
from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
HF_EXAMPLE_MODELS, HfExampleModels)
@create_new_process_for_each_test()
......@@ -126,6 +128,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# these tests seem to produce leftover memory
gpu_memory_utilization=0.80,
load_format="dummy",
model_impl=ModelImpl.TRANSFORMERS
if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
hf_overrides=hf_overrides,
)
......
......@@ -24,11 +24,9 @@ from .registry import HF_EXAMPLE_MODELS
@pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
def test_registry_imports(model_arch):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_transformers_version(on_fail="skip")
# Ensure all model classes can be imported successfully
model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
model_cls = ModelRegistry._try_load_model_cls(model_arch)
assert model_cls is not None
if model_arch in _SPECULATIVE_DECODING_MODELS:
return # Ignore these models which do not have a unified format
......@@ -56,14 +54,16 @@ def test_registry_imports(model_arch):
("XLMRobertaForSequenceClassification", False, False, True),
])
def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
assert ModelRegistry.is_cross_encoder_model(model_arch) is is_ce
assert model_info.supports_multimodal is is_mm
assert model_info.supports_cross_encoding is is_ce
if init_cuda and current_platform.is_cuda_alike():
assert not torch.cuda.is_initialized()
ModelRegistry.resolve_model_cls(model_arch)
ModelRegistry._try_load_model_cls(model_arch)
if not torch.cuda.is_initialized():
warnings.warn(
"This model no longer initializes CUDA on import. "
......@@ -82,12 +82,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
("Qwen2VLForConditionalGeneration", True, True),
])
def test_registry_is_pp(model_arch, is_pp, init_cuda):
assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
model_info = ModelRegistry._try_inspect_model_cls(model_arch)
assert model_info is not None
assert model_info.supports_pp is is_pp
if init_cuda and current_platform.is_cuda_alike():
assert not torch.cuda.is_initialized()
ModelRegistry.resolve_model_cls(model_arch)
ModelRegistry._try_load_model_cls(model_arch)
if not torch.cuda.is_initialized():
warnings.warn(
"This model no longer initializes CUDA on import. "
......
......@@ -33,6 +33,10 @@ def check_implementation(
args = (example_prompts, max_tokens, num_logprobs)
with runner_test(model, **kwargs_test, **kwargs) as model_test:
model_config = model_test.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
outputs_test = model_test.generate_greedy_logprobs(*args)
with runner_ref(model, **kwargs_ref) as model_ref:
......@@ -130,8 +134,13 @@ def test_quantization(
model_impl="transformers",
enforce_eager=True,
**quantization_kwargs) as vllm_model: # type: ignore[arg-type]
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
transformers_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
check_logprobs_close(
outputs_0_lst=transformers_outputs,
outputs_1_lst=vllm_outputs,
......@@ -151,7 +160,6 @@ def test_classify(
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
import torch
from transformers import AutoModelForSequenceClassification
......@@ -160,6 +168,10 @@ def test_classify(
max_model_len=512,
dtype=dtype,
model_impl="transformers") as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.architecture == (
model_config._get_transformers_backend_cls())
vllm_outputs = vllm_model.classify(example_prompts)
with hf_runner(model,
......
......@@ -8,7 +8,7 @@ from typing import Any, NamedTuple, Optional, Union
import torch
import torch.nn.functional as F
from vllm.config import ModelConfig, TaskOption
from vllm.config import ModelConfig, RunnerOption
from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
......@@ -255,7 +255,7 @@ def check_logprobs_close(
def build_model_context(
model_id: str,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
dtype: Union[str, torch.dtype] = "auto",
model_config_kwargs: Optional[dict[str, Any]] = None,
mm_processor_kwargs: Optional[dict[str, Any]] = None,
......@@ -280,9 +280,10 @@ def build_model_context(
model_config_kwargs = model_config_kwargs or {}
model_config = ModelConfig(
model_id,
task=task,
runner=runner,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
dtype=dtype,
seed=0,
......
......@@ -954,13 +954,6 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
model_config = ModelConfig(
model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
......@@ -993,13 +986,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
model_config = ModelConfig(
model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
......@@ -1061,16 +1047,7 @@ class _ProcessorProxy:
)
# yapf: enable
def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
model_config = ModelConfig(
model=model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="auto",
revision=None,
)
model_config = ModelConfig(model_id)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
orig_get_hf_processor = processor.info.get_hf_processor
......
......@@ -57,15 +57,7 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
model_path, quantization_arg, expected_type = model_arg_exptype
try:
model_config = ModelConfig(model_path,
task="auto",
tokenizer=model_path,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
quantization=quantization_arg)
model_config = ModelConfig(model_path, quantization=quantization_arg)
found_quantization_type = model_config.quantization
except ValueError:
found_quantization_type = "ERROR"
......
......@@ -74,115 +74,116 @@ def test_update_config():
new_config3 = update_config(config3, {"a": "new_value"})
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[
("distilbert/distilgpt2", "generate", "generate"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
("openai/whisper-small", "generate", "transcription"),
("distilbert/distilgpt2", "generate", "none", "generate"),
("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none",
"classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_auto_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
def test_auto_task(model_id, expected_runner_type, expected_convert_type,
expected_task):
config = ModelConfig(model_id, task="auto")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
if config.runner_type == "pooling":
assert config.task == expected_task
else:
assert expected_task in config.supported_tasks
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify",
"classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
("openai/whisper-small", "pooling", "embed", "embed"),
],
)
def test_score_task(model_id, expected_runner_type, expected_convert_type,
expected_task):
config = ModelConfig(model_id, task="score")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
# Can remove once --task option is fully deprecated
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
("model_id", "expected_runner_type", "expected_convert_type",
"expected_task"),
[
("distilbert/distilgpt2", "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "embed"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
("openai/whisper-small", "pooling", "embed"),
("openai/whisper-small", "generate", "none", "transcription"),
],
)
def test_score_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
task="score",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
def test_transcription_task(model_id, expected_runner_type,
expected_convert_type, expected_task):
config = ModelConfig(model_id, task="transcription")
assert config.runner_type == expected_runner_type
assert config.task == expected_task
assert config.convert_type == expected_convert_type
assert expected_task in config.supported_tasks
@pytest.mark.parametrize(("model_id", "expected_runner_type", "expected_task"),
[
("Qwen/Qwen2.5-1.5B-Instruct", "draft", "auto"),
])
def test_draft_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
runner="draft",
tokenizer=model_id,
seed=0,
dtype="float16",
)
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type"),
[
("distilbert/distilgpt2", "generate", "none"),
("intfloat/multilingual-e5-small", "pooling", "none"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
("openai/whisper-small", "generate", "none"),
],
)
def test_auto_runner(model_id, expected_runner_type, expected_convert_type):
config = ModelConfig(model_id, runner="auto")
assert config.runner_type == expected_runner_type
assert config.task == expected_task
assert config.convert_type == expected_convert_type
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
("model_id", "expected_runner_type", "expected_convert_type"),
[
("openai/whisper-small", "generate", "transcription"),
("distilbert/distilgpt2", "pooling", "embed"),
("intfloat/multilingual-e5-small", "pooling", "none"),
("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
("openai/whisper-small", "pooling", "embed"),
],
)
def test_transcription_task(model_id, expected_runner_type, expected_task):
config = ModelConfig(
model_id,
task="transcription",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
def test_pooling_runner(model_id, expected_runner_type, expected_convert_type):
config = ModelConfig(model_id, runner="pooling")
assert config.runner_type == expected_runner_type
assert config.task == expected_task
assert config.convert_type == expected_convert_type
@pytest.mark.parametrize(("model_id", "bad_task"), [
("Qwen/Qwen2.5-Math-RM-72B", "generate"),
("Qwen/Qwen3-0.6B", "transcription"),
])
def test_incorrect_task(model_id, bad_task):
with pytest.raises(ValueError, match=r"does not support task=.*"):
ModelConfig(
model_id,
task=bad_task,
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
)
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_convert_type"),
[
("Qwen/Qwen2.5-1.5B-Instruct", "draft", "none"),
],
)
def test_draft_runner(model_id, expected_runner_type, expected_convert_type):
config = ModelConfig(model_id, runner="draft")
assert config.runner_type == expected_runner_type
assert config.convert_type == expected_convert_type
MODEL_IDS_EXPECTED = [
......@@ -195,17 +196,7 @@ MODEL_IDS_EXPECTED = [
@pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
def test_disable_sliding_window(model_id_expected):
model_id, expected = model_id_expected
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
disable_sliding_window=True,
)
model_config = ModelConfig(model_id, disable_sliding_window=True)
assert model_config.max_model_len == expected
......@@ -214,16 +205,7 @@ def test_get_sliding_window():
# Test that the sliding window is correctly computed.
# For Qwen1.5/Qwen2, get_sliding_window() should be None
# when use_sliding_window is False.
qwen2_model_config = ModelConfig(
"Qwen/Qwen1.5-7B",
task="auto",
tokenizer="Qwen/Qwen1.5-7B",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B")
qwen2_model_config.hf_config.use_sliding_window = False
qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
......@@ -232,16 +214,7 @@ def test_get_sliding_window():
qwen2_model_config.hf_config.use_sliding_window = True
assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW
mistral_model_config = ModelConfig(
"mistralai/Mistral-7B-v0.1",
task="auto",
tokenizer="mistralai/Mistral-7B-v0.1",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1")
mistral_model_config.hf_config.sliding_window = None
assert mistral_model_config.get_sliding_window() is None
......@@ -253,16 +226,7 @@ def test_get_sliding_window():
reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config():
model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
model_config = ModelConfig(model_id)
pooling_config = model_config._init_pooler_config()
assert pooling_config is not None
......@@ -275,14 +239,7 @@ def test_get_pooling_config():
reason="Xformers backend is not supported on ROCm.")
def test_get_pooling_config_from_args():
model_id = "sentence-transformers/all-MiniLM-L12-v2"
model_config = ModelConfig(model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None)
model_config = ModelConfig(model_id)
override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
model_config.override_pooler_config = override_pooler_config
......@@ -295,16 +252,8 @@ def test_get_pooling_config_from_args():
@pytest.mark.skipif(current_platform.is_rocm(),
reason="Xformers backend is not supported on ROCm.")
def test_get_bert_tokenization_sentence_transformer_config():
bge_model_config = ModelConfig(
model="BAAI/bge-base-en-v1.5",
task="auto",
tokenizer="BAAI/bge-base-en-v1.5",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
model_id = "BAAI/bge-base-en-v1.5"
bge_model_config = ModelConfig(model_id)
bert_bge_model_config = bge_model_config._get_encoder_config()
......@@ -317,27 +266,13 @@ def test_rope_customization():
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
assert llama_model_config.max_model_len == 8192
llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
task="auto",
tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
"rope_theta": TEST_ROPE_THETA,
......@@ -349,15 +284,7 @@ def test_rope_customization():
None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
# Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
assert all(
longchat_model_config.hf_config.rope_scaling.get(key) == value
......@@ -366,12 +293,6 @@ def test_rope_customization():
longchat_model_config = ModelConfig(
"lmsys/longchat-13b-16k",
task="auto",
tokenizer="lmsys/longchat-13b-16k",
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
hf_overrides={
"rope_scaling": TEST_ROPE_SCALING,
},
......@@ -390,15 +311,7 @@ def test_rope_customization():
("meta-llama/Llama-3.2-11B-Vision", True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):
config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
config = ModelConfig(model_id)
assert config.is_encoder_decoder == is_encoder_decoder
......@@ -408,15 +321,7 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
("Qwen/Qwen2-VL-2B-Instruct", True),
])
def test_uses_mrope(model_id, uses_mrope):
config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="float16",
seed=0,
)
config = ModelConfig(model_id)
assert config.uses_mrope == uses_mrope
......@@ -426,26 +331,12 @@ def test_generation_config_loading():
# When set generation_config to "vllm", the default generation config
# will not be loaded.
model_config = ModelConfig(model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="vllm")
model_config = ModelConfig(model_id, generation_config="vllm")
assert model_config.get_diff_sampling_param() == {}
# When set generation_config to "auto", the default generation config
# should be loaded.
model_config = ModelConfig(model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="auto")
model_config = ModelConfig(model_id, generation_config="auto")
correct_generation_config = {
"repetition_penalty": 1.1,
......@@ -461,12 +352,6 @@ def test_generation_config_loading():
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="auto",
override_generation_config=override_generation_config)
......@@ -479,12 +364,6 @@ def test_generation_config_loading():
# is set, the override_generation_config should be used directly.
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
generation_config="vllm",
override_generation_config=override_generation_config)
......@@ -515,16 +394,7 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
should_raise):
"""Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig(
model_id,
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
model_config = ModelConfig(model_id)
if should_raise:
with pytest.raises(ValueError):
......
......@@ -21,13 +21,8 @@ def test_max_tokens_none():
def model_config():
return ModelConfig(
MODEL_NAME,
task="auto",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
)
......
......@@ -695,11 +695,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
# Create a VllmConfig
model_config = ModelConfig(
model_id,
task="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
runner="generate",
dtype="float16",
max_model_len=max_model_len,
)
......@@ -733,11 +729,7 @@ def test_get_max_concurrency_for_kv_cache_config():
max_model_len = 16384
model_config = ModelConfig(
model_id,
task="generate",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
runner="generate",
dtype="float16",
max_model_len=max_model_len,
)
......
......@@ -1248,9 +1248,6 @@ def create_scheduler_with_priority(
)
model_config = ModelConfig(
model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
seed=42,
......
......@@ -59,9 +59,6 @@ def create_scheduler(
)
model_config = ModelConfig(
model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
seed=42,
......
......@@ -68,9 +68,6 @@ def create_vllm_config(
)
model_config = ModelConfig(
model=model,
task="auto",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="float16",
seed=42,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment