[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
86ae693f · Cyrus Leung · GitHub · 8f605ee3 · 86ae693f · 86ae693f
Unverified Commit 86ae693f authored Jul 28, 2025 by Cyrus Leung Committed by GitHub Jul 27, 2025
20 changed files
--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -92,7 +92,7 @@ def _run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
-                     task="embed",
+                     runner="pooling",
                     dtype=dtype,
                     enforce_eager=True,
                     max_model_len=8192) as vllm_model:

--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
@@ -49,7 +49,7 @@ def vllm_reranker(

    with vllm_runner(
            model_name,
-            task="score",
+            runner="pooling",
            dtype=dtype,
            max_num_seqs=2,
            max_model_len=2048,

--- a/tests/models/multimodal/pooling/test_llava_next.py
+++ b/tests/models/multimodal/pooling/test_llava_next.py
@@ -64,7 +64,7 @@ def _run_test(
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
-                     task="embed",
+                     runner="pooling",
                     dtype=dtype,
                     max_model_len=4096,
                     enforce_eager=True) as vllm_model:

--- a/tests/models/multimodal/pooling/test_phi3v.py
+++ b/tests/models/multimodal/pooling/test_phi3v.py
@@ -44,7 +44,7 @@ def _run_test(
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    with vllm_runner(model, task="embed", dtype=dtype,
+    with vllm_runner(model, runner="pooling", dtype=dtype,
                     enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)


--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -34,7 +34,7 @@ def _run_test(
            set_default_torch_num_threads(1),
            vllm_runner(
                model,
-                task="embed",
+                runner="pooling",
                dtype=torch.float16,
                enforce_eager=True,
                skip_tokenizer_init=True,

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -58,13 +58,10 @@ def _test_processing_correctness(

    model_config = ModelConfig(
        model_id,
-        task="auto",
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=model_info.tokenizer_mode,
-        trust_remote_code=model_info.trust_remote_code,
-        seed=0,
-        dtype="auto",
        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
    )


--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -54,13 +54,10 @@ def test_hf_model_weights_mapper(model_arch: str):

    model_config = ModelConfig(
        model_id,
-        task="auto",
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
-        seed=0,
-        dtype="auto",
-        revision=None,
        hf_overrides=model_info.hf_overrides,
    )
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@@ -172,7 +172,7 @@ def test_4bit_bnb_embedding_model(

    # Inflight 4bit quantization
    with vllm_runner(model_name,
-                     task="embed",
+                     runner="pooling",
                     dtype=dtype,
                     gpu_memory_utilization=0.5,
                     quantization="bitsandbytes") as vllm_model:

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -7,13 +7,15 @@ import pytest
 from transformers import PretrainedConfig

 from vllm import LLM
+from vllm.config import ModelImpl
 from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
 from vllm.utils import GiB_bytes
 from vllm.v1.core.kv_cache_utils import get_kv_cache_config
 from vllm.v1.engine.core import EngineCore as V1EngineCore

 from ..utils import create_new_process_for_each_test
-from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
+from .registry import (_TRANSFORMERS_BACKEND_MODELS, AUTO_EXAMPLE_MODELS,
+                       HF_EXAMPLE_MODELS, HfExampleModels)


 @create_new_process_for_each_test()
@@ -126,6 +128,8 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            # these tests seem to produce leftover memory
            gpu_memory_utilization=0.80,
            load_format="dummy",
+            model_impl=ModelImpl.TRANSFORMERS
+            if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
            hf_overrides=hf_overrides,
        )


--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -24,11 +24,9 @@ from .registry import HF_EXAMPLE_MODELS

 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
-    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
-    model_info.check_transformers_version(on_fail="skip")
-
    # Ensure all model classes can be imported successfully
-    model_cls, _ = ModelRegistry.resolve_model_cls(model_arch)
+    model_cls = ModelRegistry._try_load_model_cls(model_arch)
+    assert model_cls is not None

    if model_arch in _SPECULATIVE_DECODING_MODELS:
        return  # Ignore these models which do not have a unified format
@@ -56,14 +54,16 @@ def test_registry_imports(model_arch):
    ("XLMRobertaForSequenceClassification", False, False, True),
 ])
 def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
-    assert ModelRegistry.is_multimodal_model(model_arch) is is_mm
+    model_info = ModelRegistry._try_inspect_model_cls(model_arch)
+    assert model_info is not None

-    assert ModelRegistry.is_cross_encoder_model(model_arch) is is_ce
+    assert model_info.supports_multimodal is is_mm
+    assert model_info.supports_cross_encoding is is_ce

    if init_cuda and current_platform.is_cuda_alike():
        assert not torch.cuda.is_initialized()

-        ModelRegistry.resolve_model_cls(model_arch)
+        ModelRegistry._try_load_model_cls(model_arch)
        if not torch.cuda.is_initialized():
            warnings.warn(
                "This model no longer initializes CUDA on import. "
@@ -82,12 +82,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
        ("Qwen2VLForConditionalGeneration", True, True),
    ])
 def test_registry_is_pp(model_arch, is_pp, init_cuda):
-    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp
+    model_info = ModelRegistry._try_inspect_model_cls(model_arch)
+    assert model_info is not None
+
+    assert model_info.supports_pp is is_pp

    if init_cuda and current_platform.is_cuda_alike():
        assert not torch.cuda.is_initialized()

-        ModelRegistry.resolve_model_cls(model_arch)
+        ModelRegistry._try_load_model_cls(model_arch)
        if not torch.cuda.is_initialized():
            warnings.warn(
                "This model no longer initializes CUDA on import. "

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -33,6 +33,10 @@ def check_implementation(
    args = (example_prompts, max_tokens, num_logprobs)

    with runner_test(model, **kwargs_test, **kwargs) as model_test:
+        model_config = model_test.llm.llm_engine.model_config
+        assert model_config.architecture == (
+            model_config._get_transformers_backend_cls())
+
        outputs_test = model_test.generate_greedy_logprobs(*args)

    with runner_ref(model, **kwargs_ref) as model_ref:
@@ -130,8 +134,13 @@ def test_quantization(
            model_impl="transformers",
            enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
+        model_config = vllm_model.llm.llm_engine.model_config
+        assert model_config.architecture == (
+            model_config._get_transformers_backend_cls())
+
        transformers_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
+
    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
@@ -151,7 +160,6 @@ def test_classify(
    example_prompts,
    model: str,
    dtype: str,
-    monkeypatch,
 ) -> None:
    import torch
    from transformers import AutoModelForSequenceClassification
@@ -160,6 +168,10 @@ def test_classify(
                     max_model_len=512,
                     dtype=dtype,
                     model_impl="transformers") as vllm_model:
+        model_config = vllm_model.llm.llm_engine.model_config
+        assert model_config.architecture == (
+            model_config._get_transformers_backend_cls())
+
        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(model,

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -8,7 +8,7 @@ from typing import Any, NamedTuple, Optional, Union
 import torch
 import torch.nn.functional as F

-from vllm.config import ModelConfig, TaskOption
+from vllm.config import ModelConfig, RunnerOption
 from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs

@@ -255,7 +255,7 @@ def check_logprobs_close(

 def build_model_context(
    model_id: str,
-    task: TaskOption = "auto",
+    runner: RunnerOption = "auto",
    dtype: Union[str, torch.dtype] = "auto",
    model_config_kwargs: Optional[dict[str, Any]] = None,
    mm_processor_kwargs: Optional[dict[str, Any]] = None,
@@ -280,9 +280,10 @@ def build_model_context(
    model_config_kwargs = model_config_kwargs or {}
    model_config = ModelConfig(
        model_id,
-        task=task,
+        runner=runner,
        tokenizer=model_info.tokenizer or model_id,
        tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        dtype=dtype,
        seed=0,

--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -954,13 +954,6 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):

    model_config = ModelConfig(
        model=model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="auto",
-        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

@@ -993,13 +986,6 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):

    model_config = ModelConfig(
        model=model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="auto",
-        revision=None,
        limit_mm_per_prompt=limit_mm_per_prompt,
    )

@@ -1061,16 +1047,7 @@ class _ProcessorProxy:
 )
 # yapf: enable
 def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
-    model_config = ModelConfig(
-        model=model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="auto",
-        revision=None,
-    )
+    model_config = ModelConfig(model_id)

    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
    orig_get_hf_processor = processor.info.get_hf_processor

--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -57,15 +57,7 @@ def test_auto_gptq(model_arg_exptype: tuple[str, None, str]) -> None:
    model_path, quantization_arg, expected_type = model_arg_exptype

    try:
-        model_config = ModelConfig(model_path,
-                                   task="auto",
-                                   tokenizer=model_path,
-                                   tokenizer_mode="auto",
-                                   trust_remote_code=False,
-                                   seed=0,
-                                   dtype="float16",
-                                   revision=None,
-                                   quantization=quantization_arg)
+        model_config = ModelConfig(model_path, quantization=quantization_arg)
        found_quantization_type = model_config.quantization
    except ValueError:
        found_quantization_type = "ERROR"

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -74,115 +74,116 @@ def test_update_config():
        new_config3 = update_config(config3, {"a": "new_value"})


+# Can remove once --task option is fully deprecated
 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type",
+     "expected_task"),
    [
-        ("distilbert/distilgpt2", "generate", "generate"),
-        ("intfloat/multilingual-e5-small", "pooling", "embed"),
-        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
-        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
-        ("openai/whisper-small", "generate", "transcription"),
+        ("distilbert/distilgpt2", "generate", "none", "generate"),
+        ("intfloat/multilingual-e5-small", "pooling", "none", "embed"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none",
+         "classify"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "reward"),
+        ("openai/whisper-small", "generate", "none", "transcription"),
    ],
 )
-def test_auto_task(model_id, expected_runner_type, expected_task):
-    config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-    )
+def test_auto_task(model_id, expected_runner_type, expected_convert_type,
+                   expected_task):
+    config = ModelConfig(model_id, task="auto")

    assert config.runner_type == expected_runner_type
+    assert config.convert_type == expected_convert_type
+    assert expected_task in config.supported_tasks

-    if config.runner_type == "pooling":
-        assert config.task == expected_task
-    else:
-        assert expected_task in config.supported_tasks

+# Can remove once --task option is fully deprecated
+@pytest.mark.parametrize(
+    ("model_id", "expected_runner_type", "expected_convert_type",
+     "expected_task"),
+    [
+        ("distilbert/distilgpt2", "pooling", "embed", "embed"),
+        ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify",
+         "classify"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"),
+        ("openai/whisper-small", "pooling", "embed", "embed"),
+    ],
+)
+def test_score_task(model_id, expected_runner_type, expected_convert_type,
+                    expected_task):
+    config = ModelConfig(model_id, task="score")

+    assert config.runner_type == expected_runner_type
+    assert config.convert_type == expected_convert_type
+    assert expected_task in config.supported_tasks
+
+
+# Can remove once --task option is fully deprecated
 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type",
+     "expected_task"),
    [
-        ("distilbert/distilgpt2", "pooling", "embed"),
-        ("intfloat/multilingual-e5-small", "pooling", "embed"),
-        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
-        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
-        ("openai/whisper-small", "pooling", "embed"),
+        ("openai/whisper-small", "generate", "none", "transcription"),
    ],
 )
-def test_score_task(model_id, expected_runner_type, expected_task):
-    config = ModelConfig(
-        model_id,
-        task="score",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-    )
+def test_transcription_task(model_id, expected_runner_type,
+                            expected_convert_type, expected_task):
+    config = ModelConfig(model_id, task="transcription")

    assert config.runner_type == expected_runner_type
-    assert config.task == expected_task
+    assert config.convert_type == expected_convert_type
+    assert expected_task in config.supported_tasks


-@pytest.mark.parametrize(("model_id", "expected_runner_type", "expected_task"),
-                         [
-                             ("Qwen/Qwen2.5-1.5B-Instruct", "draft", "auto"),
-                         ])
-def test_draft_task(model_id, expected_runner_type, expected_task):
-    config = ModelConfig(
-        model_id,
-        runner="draft",
-        tokenizer=model_id,
-        seed=0,
-        dtype="float16",
-    )
+@pytest.mark.parametrize(
+    ("model_id", "expected_runner_type", "expected_convert_type"),
+    [
+        ("distilbert/distilgpt2", "generate", "none"),
+        ("intfloat/multilingual-e5-small", "pooling", "none"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
+        ("openai/whisper-small", "generate", "none"),
+    ],
+)
+def test_auto_runner(model_id, expected_runner_type, expected_convert_type):
+    config = ModelConfig(model_id, runner="auto")

    assert config.runner_type == expected_runner_type
-    assert config.task == expected_task
+    assert config.convert_type == expected_convert_type


 @pytest.mark.parametrize(
-    ("model_id", "expected_runner_type", "expected_task"),
+    ("model_id", "expected_runner_type", "expected_convert_type"),
    [
-        ("openai/whisper-small", "generate", "transcription"),
+        ("distilbert/distilgpt2", "pooling", "embed"),
+        ("intfloat/multilingual-e5-small", "pooling", "none"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none"),
+        ("openai/whisper-small", "pooling", "embed"),
    ],
 )
-def test_transcription_task(model_id, expected_runner_type, expected_task):
-    config = ModelConfig(
-        model_id,
-        task="transcription",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-    )
+def test_pooling_runner(model_id, expected_runner_type, expected_convert_type):
+    config = ModelConfig(model_id, runner="pooling")

    assert config.runner_type == expected_runner_type
-    assert config.task == expected_task
+    assert config.convert_type == expected_convert_type


-@pytest.mark.parametrize(("model_id", "bad_task"), [
-    ("Qwen/Qwen2.5-Math-RM-72B", "generate"),
-    ("Qwen/Qwen3-0.6B", "transcription"),
-])
-def test_incorrect_task(model_id, bad_task):
-    with pytest.raises(ValueError, match=r"does not support task=.*"):
-        ModelConfig(
-            model_id,
-            task=bad_task,
-            tokenizer=model_id,
-            tokenizer_mode="auto",
-            trust_remote_code=False,
-            seed=0,
-            dtype="float16",
-        )
+@pytest.mark.parametrize(
+    ("model_id", "expected_runner_type", "expected_convert_type"),
+    [
+        ("Qwen/Qwen2.5-1.5B-Instruct", "draft", "none"),
+    ],
+)
+def test_draft_runner(model_id, expected_runner_type, expected_convert_type):
+    config = ModelConfig(model_id, runner="draft")
+
+    assert config.runner_type == expected_runner_type
+    assert config.convert_type == expected_convert_type


 MODEL_IDS_EXPECTED = [
@@ -195,17 +196,7 @@ MODEL_IDS_EXPECTED = [
 @pytest.mark.parametrize("model_id_expected", MODEL_IDS_EXPECTED)
 def test_disable_sliding_window(model_id_expected):
    model_id, expected = model_id_expected
-    model_config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-        disable_sliding_window=True,
-    )
+    model_config = ModelConfig(model_id, disable_sliding_window=True)
    assert model_config.max_model_len == expected


@@ -214,16 +205,7 @@ def test_get_sliding_window():
    # Test that the sliding window is correctly computed.
    # For Qwen1.5/Qwen2, get_sliding_window() should be None
    # when use_sliding_window is False.
-    qwen2_model_config = ModelConfig(
-        "Qwen/Qwen1.5-7B",
-        task="auto",
-        tokenizer="Qwen/Qwen1.5-7B",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-    )
+    qwen2_model_config = ModelConfig("Qwen/Qwen1.5-7B")

    qwen2_model_config.hf_config.use_sliding_window = False
    qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW
@@ -232,16 +214,7 @@ def test_get_sliding_window():
    qwen2_model_config.hf_config.use_sliding_window = True
    assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW

-    mistral_model_config = ModelConfig(
-        "mistralai/Mistral-7B-v0.1",
-        task="auto",
-        tokenizer="mistralai/Mistral-7B-v0.1",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-    )
+    mistral_model_config = ModelConfig("mistralai/Mistral-7B-v0.1")
    mistral_model_config.hf_config.sliding_window = None
    assert mistral_model_config.get_sliding_window() is None

@@ -253,16 +226,7 @@ def test_get_sliding_window():
                    reason="Xformers backend is not supported on ROCm.")
 def test_get_pooling_config():
    model_id = "sentence-transformers/all-MiniLM-L12-v2"
-    model_config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-    )
+    model_config = ModelConfig(model_id)

    pooling_config = model_config._init_pooler_config()
    assert pooling_config is not None
@@ -275,14 +239,7 @@ def test_get_pooling_config():
                    reason="Xformers backend is not supported on ROCm.")
 def test_get_pooling_config_from_args():
    model_id = "sentence-transformers/all-MiniLM-L12-v2"
-    model_config = ModelConfig(model_id,
-                               task="auto",
-                               tokenizer=model_id,
-                               tokenizer_mode="auto",
-                               trust_remote_code=False,
-                               seed=0,
-                               dtype="float16",
-                               revision=None)
+    model_config = ModelConfig(model_id)

    override_pooler_config = PoolerConfig(pooling_type='CLS', normalize=True)
    model_config.override_pooler_config = override_pooler_config
@@ -295,16 +252,8 @@ def test_get_pooling_config_from_args():
 @pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
 def test_get_bert_tokenization_sentence_transformer_config():
-    bge_model_config = ModelConfig(
-        model="BAAI/bge-base-en-v1.5",
-        task="auto",
-        tokenizer="BAAI/bge-base-en-v1.5",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-    )
+    model_id = "BAAI/bge-base-en-v1.5"
+    bge_model_config = ModelConfig(model_id)

    bert_bge_model_config = bge_model_config._get_encoder_config()

@@ -317,27 +266,13 @@ def test_rope_customization():
    TEST_ROPE_THETA = 16_000_000.0
    LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}

-    llama_model_config = ModelConfig(
-        "meta-llama/Meta-Llama-3-8B-Instruct",
-        task="auto",
-        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
+    llama_model_config = ModelConfig("meta-llama/Meta-Llama-3-8B-Instruct")
    assert getattr(llama_model_config.hf_config, "rope_scaling", None) is None
    assert getattr(llama_model_config.hf_config, "rope_theta", None) == 500_000
    assert llama_model_config.max_model_len == 8192

    llama_model_config = ModelConfig(
        "meta-llama/Meta-Llama-3-8B-Instruct",
-        task="auto",
-        tokenizer="meta-llama/Meta-Llama-3-8B-Instruct",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
        hf_overrides={
            "rope_scaling": TEST_ROPE_SCALING,
            "rope_theta": TEST_ROPE_THETA,
@@ -349,15 +284,7 @@ def test_rope_customization():
                   None) == TEST_ROPE_THETA
    assert llama_model_config.max_model_len == 16384

-    longchat_model_config = ModelConfig(
-        "lmsys/longchat-13b-16k",
-        task="auto",
-        tokenizer="lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
+    longchat_model_config = ModelConfig("lmsys/longchat-13b-16k")
    # Check if LONGCHAT_ROPE_SCALING entries are in longchat_model_config
    assert all(
        longchat_model_config.hf_config.rope_scaling.get(key) == value
@@ -366,12 +293,6 @@ def test_rope_customization():

    longchat_model_config = ModelConfig(
        "lmsys/longchat-13b-16k",
-        task="auto",
-        tokenizer="lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
        hf_overrides={
            "rope_scaling": TEST_ROPE_SCALING,
        },
@@ -390,15 +311,7 @@ def test_rope_customization():
    ("meta-llama/Llama-3.2-11B-Vision", True),
 ])
 def test_is_encoder_decoder(model_id, is_encoder_decoder):
-    config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
+    config = ModelConfig(model_id)

    assert config.is_encoder_decoder == is_encoder_decoder

@@ -408,15 +321,7 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
    ("Qwen/Qwen2-VL-2B-Instruct", True),
 ])
 def test_uses_mrope(model_id, uses_mrope):
-    config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
+    config = ModelConfig(model_id)

    assert config.uses_mrope == uses_mrope

@@ -426,26 +331,12 @@ def test_generation_config_loading():

    # When set generation_config to "vllm", the default generation config
    # will not be loaded.
-    model_config = ModelConfig(model_id,
-                               task="auto",
-                               tokenizer=model_id,
-                               tokenizer_mode="auto",
-                               trust_remote_code=False,
-                               seed=0,
-                               dtype="float16",
-                               generation_config="vllm")
+    model_config = ModelConfig(model_id, generation_config="vllm")
    assert model_config.get_diff_sampling_param() == {}

    # When set generation_config to "auto", the default generation config
    # should be loaded.
-    model_config = ModelConfig(model_id,
-                               task="auto",
-                               tokenizer=model_id,
-                               tokenizer_mode="auto",
-                               trust_remote_code=False,
-                               seed=0,
-                               dtype="float16",
-                               generation_config="auto")
+    model_config = ModelConfig(model_id, generation_config="auto")

    correct_generation_config = {
        "repetition_penalty": 1.1,
@@ -461,12 +352,6 @@ def test_generation_config_loading():

    model_config = ModelConfig(
        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
        generation_config="auto",
        override_generation_config=override_generation_config)

@@ -479,12 +364,6 @@ def test_generation_config_loading():
    # is set, the override_generation_config should be used directly.
    model_config = ModelConfig(
        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
        generation_config="vllm",
        override_generation_config=override_generation_config)

@@ -515,16 +394,7 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
 def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
                                should_raise):
    """Test get_and_verify_max_len with different configurations."""
-    model_config = ModelConfig(
-        model_id,
-        task="auto",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype="float16",
-        revision=None,
-    )
+    model_config = ModelConfig(model_id)

    if should_raise:
        with pytest.raises(ValueError):

--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@@ -21,13 +21,8 @@ def test_max_tokens_none():
 def model_config():
    return ModelConfig(
        MODEL_NAME,
-        task="auto",
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
        seed=0,
        dtype="float16",
-        revision=None,
    )



--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -695,11 +695,7 @@ def test_estimate_max_model_len(model_id, max_model_len,
    # Create a VllmConfig
    model_config = ModelConfig(
        model_id,
-        task="generate",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
+        runner="generate",
        dtype="float16",
        max_model_len=max_model_len,
    )
@@ -733,11 +729,7 @@ def test_get_max_concurrency_for_kv_cache_config():
    max_model_len = 16384
    model_config = ModelConfig(
        model_id,
-        task="generate",
-        tokenizer=model_id,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
+        runner="generate",
        dtype="float16",
        max_model_len=max_model_len,
    )

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -1248,9 +1248,6 @@ def create_scheduler_with_priority(
    )
    model_config = ModelConfig(
        model=model,
-        task="auto",
-        tokenizer=model,
-        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
        seed=42,

--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -59,9 +59,6 @@ def create_scheduler(
    )
    model_config = ModelConfig(
        model=model,
-        task="auto",
-        tokenizer=model,
-        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
        seed=42,

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -68,9 +68,6 @@ def create_vllm_config(
    )
    model_config = ModelConfig(
        model=model,
-        task="auto",
-        tokenizer=model,
-        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
        seed=42,