[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
86ae693f · Cyrus Leung · GitHub · 8f605ee3 · 86ae693f · 86ae693f
Unverified commit 86ae693f, authored Jul 28, 2025 by Cyrus Leung; committed by GitHub on Jul 27, 2025.
20 changed files
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -62,8 +62,8 @@ class TestSetting:
        TestSetting(
            model="BAAI/bge-multilingual-gemma2",
            model_args=[
-                "--task", "embed", "--dtype", "bfloat16", "--max-model-len",
+                "--runner", "pooling", "--dtype", "bfloat16",
-                "2048"
+                "--max-model-len", "2048"
            ],
            pp_size=1,
            tp_size=1,
@@ -75,7 +75,7 @@ class TestSetting:
        # # encoder-based embedding model (BERT)
        # TestSetting(
        #     model="BAAI/bge-base-en-v1.5",
-        #     model_args=["--task", "embed"],
+        #     model_args=["--runner", "pooling"],
        #     pp_size=1,
        #     tp_size=1,
        #     attn_backend="XFORMERS",

--- a/tests/compile/test_fusion_all_reduce.py
+++ b/tests/compile/test_fusion_all_reduce.py
@@ -125,9 +125,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
-                                           task="auto",
-                                           tokenizer=model_name,
-                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -250,9 +250,6 @@ def sequence_parallelism_pass_on_test_model(
    # in the vllm_config, it's not really used.
    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
    vllm_config.model_config = ModelConfig(model=model_name,
-                                           task="auto",
-                                           tokenizer=model_name,
-                                           tokenizer_mode="auto",
                                           trust_remote_code=True,
                                           dtype=dtype,
                                           seed=42)

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TaskOption, _get_and_verify_dtype
+from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
 from vllm.connections import global_http_connection
 from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -769,7 +769,8 @@ class VllmRunner:
    def __init__(
        self,
        model_name: str,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
+        convert: ConvertOption = "auto",
        tokenizer_name: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
@@ -786,7 +787,8 @@ class VllmRunner:
    ) -> None:
        self.llm = LLM(
            model=model_name,
-            task=task,
+            runner=runner,
+            convert=convert,
            tokenizer=tokenizer_name,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,

--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -6,7 +6,7 @@ from typing import Literal, NamedTuple, Optional
 import pytest
-from vllm.config import TaskOption
+from vllm.config import RunnerOption
 from vllm.logger import init_logger
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -31,14 +31,14 @@ class EPTestOptions(NamedTuple):
 class EPTestSettings:
    parallel_setups: list[ParallelSetup]
    distributed_backends: list[str]
-    task: TaskOption
+    runner: RunnerOption
    test_options: EPTestOptions
    @staticmethod
    def detailed(
        *,
        tp_base: int = 2,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
@@ -63,7 +63,7 @@ class EPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
-            task=task,
+            runner=runner,
            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode,
                                       load_format=load_format,
@@ -74,7 +74,7 @@ class EPTestSettings:
    def fast(
        *,
        tp_base: int = 2,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
        load_format: Optional[str] = None,
@@ -87,7 +87,7 @@ class EPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
-            task=task,
+            runner=runner,
            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
                                       tokenizer_mode=tokenizer_mode,
                                       load_format=load_format,
@@ -100,7 +100,7 @@ class EPTestSettings:
        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
-                       self.task, opts)
+                       self.runner, opts)
 # NOTE: You can adjust tp_base locally to fit the model in GPU
@@ -118,7 +118,7 @@ def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: EPTestOptions,
    num_gpus_available: int,
    *,
@@ -154,8 +154,8 @@ def _compare_tp(
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
-    if task != "auto":
+    if runner != "auto":
-        common_args.extend(["--task", task])
+        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
@@ -203,7 +203,7 @@ def _compare_tp(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
+    ("model_name", "parallel_setup", "distributed_backend", "runner",
     "test_options"),
    [
        params for model_name, settings in TEST_MODELS.items()
@@ -215,14 +215,14 @@ def test_ep(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: EPTestOptions,
    num_gpus_available,
 ):
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
-                task,
+                runner,
                test_options,
                num_gpus_available,
                method="generate")
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
 import pytest
-from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import get_config
@@ -60,7 +60,7 @@ class PPTestSettings:
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
-    task: TaskOption
+    runner: RunnerOption
    test_options: PPTestOptions
    def __post_init__(self):
@@ -76,7 +76,7 @@ class PPTestSettings:
        tp_base: int = 1,
        pp_base: int = 2,
        multi_node_only: bool = False,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ):
        return PPTestSettings(
@@ -104,7 +104,7 @@ class PPTestSettings:
            ],
            distributed_backends=["mp", "mp", "ray", "ray"],
            vllm_major_versions=["0", "1", "0", "1"],
-            task=task,
+            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )
@@ -114,7 +114,7 @@ class PPTestSettings:
        *,
        tp_base: int = 1,
        pp_base: int = 2,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
@@ -127,7 +127,7 @@ class PPTestSettings:
            ],
            distributed_backends=["mp"],
            vllm_major_versions=["0"],
-            task=task,
+            runner=runner,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )
@@ -139,7 +139,7 @@ class PPTestSettings:
            for backend, vllm_major_version in zip(self.distributed_backends,
                                                   self.vllm_major_versions):
                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.task, opts)
+                       self.runner, opts)
 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
@@ -211,10 +211,10 @@ TEXT_GENERATION_MODELS = {
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
    # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
-        load_format="dummy", task="embed"
+        load_format="dummy", runner="pooling"
    ),
 }
@@ -269,7 +269,7 @@ def _compare_tp(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
@@ -335,8 +335,8 @@ def _compare_tp(
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
-    if task != "auto":
+    if runner != "auto":
-        common_args.extend(["--task", task])
+        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
@@ -415,7 +415,7 @@ def _compare_tp(
 @pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "task", "test_options"),
+     "runner", "test_options"),
    [
        params for model_id, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@@ -427,7 +427,7 @@ def test_tp_language_generation(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
@@ -435,7 +435,7 @@ def test_tp_language_generation(
                parallel_setup,
                distributed_backend,
                vllm_major_version,
-                task,
+                runner,
                test_options,
                num_gpus_available,
                method="generate",
@@ -444,7 +444,7 @@ def test_tp_language_generation(
 @pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "task", "test_options"),
+     "runner", "test_options"),
    [
        params for model_id, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@@ -456,7 +456,7 @@ def test_tp_language_embedding(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
@@ -464,7 +464,7 @@ def test_tp_language_embedding(
                parallel_setup,
                distributed_backend,
                vllm_major_version,
-                task,
+                runner,
                test_options,
                num_gpus_available,
                method="encode",
@@ -473,7 +473,7 @@ def test_tp_language_embedding(
 @pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "task", "test_options"),
+     "runner", "test_options"),
    [
        params for model_id, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
@@ -485,7 +485,7 @@ def test_tp_multimodal_generation(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: PPTestOptions,
    num_gpus_available,
 ):
@@ -493,7 +493,7 @@ def test_tp_multimodal_generation(
                parallel_setup,
                distributed_backend,
                vllm_major_version,
-                task,
+                runner,
                test_options,
                num_gpus_available,
                method="generate",

--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
 import pytest
-from vllm.config import TaskOption
+from vllm.config import RunnerOption
 from vllm.logger import init_logger
 from ..models.registry import HF_EXAMPLE_MODELS
@@ -48,7 +48,7 @@ class SPTestSettings:
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
-    task: TaskOption
+    runner: RunnerOption
    test_options: SPTestOptions
    def __post_init__(self):
@@ -64,7 +64,7 @@ class SPTestSettings:
        tp_base: int = 2,
        pp_base: int = 1,
        multi_node_only: bool = False,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ):
        parallel_setups = []
@@ -81,7 +81,7 @@ class SPTestSettings:
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
-            task=task,
+            runner=runner,
            test_options=SPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )
@@ -91,7 +91,7 @@ class SPTestSettings:
        *,
        tp_base: int = 2,
        pp_base: int = 1,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
@@ -109,7 +109,7 @@ class SPTestSettings:
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
-            task=task,
+            runner=runner,
            test_options=SPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )
@@ -119,7 +119,7 @@ class SPTestSettings:
        *,
        tp_base: int = 2,
        pp_base: int = 1,
-        task: TaskOption = "auto",
+        runner: RunnerOption = "auto",
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
@@ -135,7 +135,7 @@ class SPTestSettings:
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
-            task=task,
+            runner=runner,
            test_options=SPTestOptions(multi_node_only=multi_node_only,
                                       load_format=load_format),
        )
@@ -147,7 +147,7 @@ class SPTestSettings:
            for backend, vllm_major_version in zip(self.distributed_backends,
                                                   self.vllm_major_versions):
                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.task, opts)
+                       self.runner, opts)
 def _compare_sp(
@@ -155,7 +155,7 @@ def _compare_sp(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available: int,
    *,
@@ -217,8 +217,8 @@ def _compare_sp(
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
-    if task != "auto":
+    if runner != "auto":
-        common_args.extend(["--task", task])
+        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
@@ -298,7 +298,7 @@ SP_TEST_MODELS = [
 @pytest.mark.parametrize(
    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "task", "test_options"),
+     "runner", "test_options"),
    [
        params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id)
@@ -311,7 +311,7 @@ def test_tp_sp_generation(
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
-    task: TaskOption,
+    runner: RunnerOption,
    test_options: SPTestOptions,
    num_gpus_available,
 ):
@@ -319,7 +319,7 @@ def test_tp_sp_generation(
                parallel_setup,
                distributed_backend,
                vllm_major_version,
-                task,
+                runner,
                test_options,
                num_gpus_available,
                method="generate",

--- a/tests/entrypoints/openai/correctness/test_mteb_embed.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_embed.py
@@ -19,7 +19,8 @@ MAIN_SCORE = 0.7422994752439667
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
+        "--runner", "pooling", "--enforce-eager",
+        "--disable-uvicorn-access-log"
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

--- a/tests/entrypoints/openai/correctness/test_mteb_score.py
+++ b/tests/entrypoints/openai/correctness/test_mteb_score.py
@@ -21,7 +21,8 @@ MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
+        "--runner", "pooling", "--enforce-eager",
+        "--disable-uvicorn-access-log"
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:

--- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py
+++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py
@@ -15,10 +15,6 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
 def get_vocab_size(model_name):
    config = ModelConfig(
        model=model_name,
-        task="auto",
-        tokenizer=model_name,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
        seed=0,
        dtype="bfloat16",
    )

--- a/tests/entrypoints/openai/test_chat_template.py
+++ b/tests/entrypoints/openai/test_chat_template.py
@@ -102,6 +102,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
        tokenizer=model_info.tokenizer or model,
        tokenizer_mode=model_info.tokenizer_mode,
        trust_remote_code=model_info.trust_remote_code,
+        revision=model_info.revision,
        hf_overrides=model_info.hf_overrides,
    )

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -33,8 +33,8 @@ def v1(run_with_both_engines):
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
-        "embed",
+        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,

--- a/tests/entrypoints/openai/test_embedding_dimensions.py
+++ b/tests/entrypoints/openai/test_embedding_dimensions.py
@@ -42,8 +42,8 @@ def dtype(request):
 @pytest.fixture(scope="module")
 def server(model_info, dtype: str):
    args = [
-        "--task",
+        "--runner",
-        "embed",
+        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        dtype,

--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -21,7 +21,7 @@ LONG_TIMEOUT_SECONDS: Final[int] = 60
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
        "generate",
        "--max-model-len",
        "2048",

--- a/tests/entrypoints/openai/test_optional_middleware.py
+++ b/tests/entrypoints/openai/test_optional_middleware.py
@@ -27,8 +27,8 @@ def server(request: pytest.FixtureRequest):
        passed_params = [passed_params]
    args = [
-        "--task",
+        "--runner",
-        "embed",
+        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",

--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
@@ -20,8 +20,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
-        "reward",
+        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",

--- a/tests/entrypoints/openai/test_skip_tokenizer.py
+++ b/tests/entrypoints/openai/test_skip_tokenizer.py
@@ -26,8 +26,8 @@ def v1(run_with_both_engines):
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
-        "embed",
+        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        DTYPE,

--- a/tests/entrypoints/openai/test_truncation.py
+++ b/tests/entrypoints/openai/test_truncation.py
@@ -29,8 +29,8 @@ input = """Immerse yourself in the enchanting chronicle of calculus, a
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
-        "embed",
+        "pooling",
        "--dtype",
        "bfloat16",
        "--enforce-eager",

--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -25,7 +25,7 @@ TEST_VIDEO_URLS = [
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
        "generate",
        "--max-model-len",
        "32768",

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -48,7 +48,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
 @pytest.fixture(scope="module")
 def server():
    args = [
-        "--task",
+        "--runner",
        "generate",
        "--max-model-len",
        "2048",