Unverified commit 86ae693f, authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
...@@ -62,8 +62,8 @@ class TestSetting: ...@@ -62,8 +62,8 @@ class TestSetting:
TestSetting( TestSetting(
model="BAAI/bge-multilingual-gemma2", model="BAAI/bge-multilingual-gemma2",
model_args=[ model_args=[
"--task", "embed", "--dtype", "bfloat16", "--max-model-len", "--runner", "pooling", "--dtype", "bfloat16",
"2048" "--max-model-len", "2048"
], ],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
...@@ -75,7 +75,7 @@ class TestSetting: ...@@ -75,7 +75,7 @@ class TestSetting:
# # encoder-based embedding model (BERT) # # encoder-based embedding model (BERT)
# TestSetting( # TestSetting(
# model="BAAI/bge-base-en-v1.5", # model="BAAI/bge-base-en-v1.5",
# model_args=["--task", "embed"], # model_args=["--runner", "pooling"],
# pp_size=1, # pp_size=1,
# tp_size=1, # tp_size=1,
# attn_backend="XFORMERS", # attn_backend="XFORMERS",
......
...@@ -125,9 +125,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int, ...@@ -125,9 +125,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
# in the vllm_config, it's not really used. # in the vllm_config, it's not really used.
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config.model_config = ModelConfig(model=model_name, vllm_config.model_config = ModelConfig(model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
seed=42) seed=42)
......
...@@ -250,9 +250,6 @@ def sequence_parallelism_pass_on_test_model( ...@@ -250,9 +250,6 @@ def sequence_parallelism_pass_on_test_model(
# in the vllm_config, it's not really used. # in the vllm_config, it's not really used.
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e" model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config.model_config = ModelConfig(model=model_name, vllm_config.model_config = ModelConfig(model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
seed=42) seed=42)
......
...@@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams ...@@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, _get_and_verify_dtype from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory, from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
...@@ -769,7 +769,8 @@ class VllmRunner: ...@@ -769,7 +769,8 @@ class VllmRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
task: TaskOption = "auto", runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto", tokenizer_mode: str = "auto",
trust_remote_code: bool = True, trust_remote_code: bool = True,
...@@ -786,7 +787,8 @@ class VllmRunner: ...@@ -786,7 +787,8 @@ class VllmRunner:
) -> None: ) -> None:
self.llm = LLM( self.llm = LLM(
model=model_name, model=model_name,
task=task, runner=runner,
convert=convert,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
......
...@@ -6,7 +6,7 @@ from typing import Literal, NamedTuple, Optional ...@@ -6,7 +6,7 @@ from typing import Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import TaskOption from vllm.config import RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..utils import compare_two_settings, create_new_process_for_each_test from ..utils import compare_two_settings, create_new_process_for_each_test
...@@ -31,14 +31,14 @@ class EPTestOptions(NamedTuple): ...@@ -31,14 +31,14 @@ class EPTestOptions(NamedTuple):
class EPTestSettings: class EPTestSettings:
parallel_setups: list[ParallelSetup] parallel_setups: list[ParallelSetup]
distributed_backends: list[str] distributed_backends: list[str]
task: TaskOption runner: RunnerOption
test_options: EPTestOptions test_options: EPTestOptions
@staticmethod @staticmethod
def detailed( def detailed(
*, *,
tp_base: int = 2, tp_base: int = 2,
task: TaskOption = "auto", runner: RunnerOption = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
...@@ -63,7 +63,7 @@ class EPTestSettings: ...@@ -63,7 +63,7 @@ class EPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
task=task, runner=runner,
test_options=EPTestOptions(trust_remote_code=trust_remote_code, test_options=EPTestOptions(trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
load_format=load_format, load_format=load_format,
...@@ -74,7 +74,7 @@ class EPTestSettings: ...@@ -74,7 +74,7 @@ class EPTestSettings:
def fast( def fast(
*, *,
tp_base: int = 2, tp_base: int = 2,
task: TaskOption = "auto", runner: RunnerOption = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
...@@ -87,7 +87,7 @@ class EPTestSettings: ...@@ -87,7 +87,7 @@ class EPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp"], distributed_backends=["mp"],
task=task, runner=runner,
test_options=EPTestOptions(trust_remote_code=trust_remote_code, test_options=EPTestOptions(trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode, tokenizer_mode=tokenizer_mode,
load_format=load_format, load_format=load_format,
...@@ -100,7 +100,7 @@ class EPTestSettings: ...@@ -100,7 +100,7 @@ class EPTestSettings:
for parallel_setup in self.parallel_setups: for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends: for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend, yield (model_name, parallel_setup, distributed_backend,
self.task, opts) self.runner, opts)
# NOTE: You can adjust tp_base locally to fit the model in GPU # NOTE: You can adjust tp_base locally to fit the model in GPU
...@@ -118,7 +118,7 @@ def _compare_tp( ...@@ -118,7 +118,7 @@ def _compare_tp(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption, runner: RunnerOption,
test_options: EPTestOptions, test_options: EPTestOptions,
num_gpus_available: int, num_gpus_available: int,
*, *,
...@@ -154,8 +154,8 @@ def _compare_tp( ...@@ -154,8 +154,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill") common_args.append("--enable-chunked-prefill")
if eager_mode: if eager_mode:
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
if task != "auto": if runner != "auto":
common_args.extend(["--task", task]) common_args.extend(["--runner", runner])
if trust_remote_code: if trust_remote_code:
common_args.append("--trust-remote-code") common_args.append("--trust-remote-code")
if tokenizer_mode: if tokenizer_mode:
...@@ -203,7 +203,7 @@ def _compare_tp( ...@@ -203,7 +203,7 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task", ("model_name", "parallel_setup", "distributed_backend", "runner",
"test_options"), "test_options"),
[ [
params for model_name, settings in TEST_MODELS.items() params for model_name, settings in TEST_MODELS.items()
...@@ -215,14 +215,14 @@ def test_ep( ...@@ -215,14 +215,14 @@ def test_ep(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
task: TaskOption, runner: RunnerOption,
test_options: EPTestOptions, test_options: EPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
task, runner,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate")
...@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional ...@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config from vllm.transformers_utils.config import get_config
...@@ -60,7 +60,7 @@ class PPTestSettings: ...@@ -60,7 +60,7 @@ class PPTestSettings:
distributed_backends: list[str] distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1 # vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str] vllm_major_versions: list[str]
task: TaskOption runner: RunnerOption
test_options: PPTestOptions test_options: PPTestOptions
def __post_init__(self): def __post_init__(self):
...@@ -76,7 +76,7 @@ class PPTestSettings: ...@@ -76,7 +76,7 @@ class PPTestSettings:
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
multi_node_only: bool = False, multi_node_only: bool = False,
task: TaskOption = "auto", runner: RunnerOption = "auto",
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
return PPTestSettings( return PPTestSettings(
...@@ -104,7 +104,7 @@ class PPTestSettings: ...@@ -104,7 +104,7 @@ class PPTestSettings:
], ],
distributed_backends=["mp", "mp", "ray", "ray"], distributed_backends=["mp", "mp", "ray", "ray"],
vllm_major_versions=["0", "1", "0", "1"], vllm_major_versions=["0", "1", "0", "1"],
task=task, runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format), load_format=load_format),
) )
...@@ -114,7 +114,7 @@ class PPTestSettings: ...@@ -114,7 +114,7 @@ class PPTestSettings:
*, *,
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto", runner: RunnerOption = "auto",
multi_node_only: bool = False, multi_node_only: bool = False,
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
...@@ -127,7 +127,7 @@ class PPTestSettings: ...@@ -127,7 +127,7 @@ class PPTestSettings:
], ],
distributed_backends=["mp"], distributed_backends=["mp"],
vllm_major_versions=["0"], vllm_major_versions=["0"],
task=task, runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format), load_format=load_format),
) )
...@@ -139,7 +139,7 @@ class PPTestSettings: ...@@ -139,7 +139,7 @@ class PPTestSettings:
for backend, vllm_major_version in zip(self.distributed_backends, for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions): self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version, yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts) self.runner, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...@@ -211,10 +211,10 @@ TEXT_GENERATION_MODELS = { ...@@ -211,10 +211,10 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated] EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only] # [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"), "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"), "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast( "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
load_format="dummy", task="embed" load_format="dummy", runner="pooling"
), ),
} }
...@@ -269,7 +269,7 @@ def _compare_tp( ...@@ -269,7 +269,7 @@ def _compare_tp(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available: int, num_gpus_available: int,
*, *,
...@@ -335,8 +335,8 @@ def _compare_tp( ...@@ -335,8 +335,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill") common_args.append("--enable-chunked-prefill")
if eager_mode: if eager_mode:
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
if task != "auto": if runner != "auto":
common_args.extend(["--task", task]) common_args.extend(["--runner", runner])
if trust_remote_code: if trust_remote_code:
common_args.append("--trust-remote-code") common_args.append("--trust-remote-code")
if tokenizer_mode: if tokenizer_mode:
...@@ -415,7 +415,7 @@ def _compare_tp( ...@@ -415,7 +415,7 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"), "runner", "test_options"),
[ [
params for model_id, settings in TEXT_GENERATION_MODELS.items() params for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS for params in settings.iter_params(model_id) if model_id in TEST_MODELS
...@@ -427,7 +427,7 @@ def test_tp_language_generation( ...@@ -427,7 +427,7 @@ def test_tp_language_generation(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
...@@ -435,7 +435,7 @@ def test_tp_language_generation( ...@@ -435,7 +435,7 @@ def test_tp_language_generation(
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, runner,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate", method="generate",
...@@ -444,7 +444,7 @@ def test_tp_language_generation( ...@@ -444,7 +444,7 @@ def test_tp_language_generation(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"), "runner", "test_options"),
[ [
params for model_id, settings in EMBEDDING_MODELS.items() params for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS for params in settings.iter_params(model_id) if model_id in TEST_MODELS
...@@ -456,7 +456,7 @@ def test_tp_language_embedding( ...@@ -456,7 +456,7 @@ def test_tp_language_embedding(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
...@@ -464,7 +464,7 @@ def test_tp_language_embedding( ...@@ -464,7 +464,7 @@ def test_tp_language_embedding(
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, runner,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="encode", method="encode",
...@@ -473,7 +473,7 @@ def test_tp_language_embedding( ...@@ -473,7 +473,7 @@ def test_tp_language_embedding(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"), "runner", "test_options"),
[ [
params for model_id, settings in MULTIMODAL_MODELS.items() params for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS for params in settings.iter_params(model_id) if model_id in TEST_MODELS
...@@ -485,7 +485,7 @@ def test_tp_multimodal_generation( ...@@ -485,7 +485,7 @@ def test_tp_multimodal_generation(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
...@@ -493,7 +493,7 @@ def test_tp_multimodal_generation( ...@@ -493,7 +493,7 @@ def test_tp_multimodal_generation(
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, runner,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate", method="generate",
......
...@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional ...@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import TaskOption from vllm.config import RunnerOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS from ..models.registry import HF_EXAMPLE_MODELS
...@@ -48,7 +48,7 @@ class SPTestSettings: ...@@ -48,7 +48,7 @@ class SPTestSettings:
distributed_backends: list[str] distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1 # vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str] vllm_major_versions: list[str]
task: TaskOption runner: RunnerOption
test_options: SPTestOptions test_options: SPTestOptions
def __post_init__(self): def __post_init__(self):
...@@ -64,7 +64,7 @@ class SPTestSettings: ...@@ -64,7 +64,7 @@ class SPTestSettings:
tp_base: int = 2, tp_base: int = 2,
pp_base: int = 1, pp_base: int = 1,
multi_node_only: bool = False, multi_node_only: bool = False,
task: TaskOption = "auto", runner: RunnerOption = "auto",
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
parallel_setups = [] parallel_setups = []
...@@ -81,7 +81,7 @@ class SPTestSettings: ...@@ -81,7 +81,7 @@ class SPTestSettings:
parallel_setups=parallel_setups, parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"], vllm_major_versions=["1", "1"],
task=task, runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only, test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format), load_format=load_format),
) )
...@@ -91,7 +91,7 @@ class SPTestSettings: ...@@ -91,7 +91,7 @@ class SPTestSettings:
*, *,
tp_base: int = 2, tp_base: int = 2,
pp_base: int = 1, pp_base: int = 1,
task: TaskOption = "auto", runner: RunnerOption = "auto",
multi_node_only: bool = False, multi_node_only: bool = False,
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
...@@ -109,7 +109,7 @@ class SPTestSettings: ...@@ -109,7 +109,7 @@ class SPTestSettings:
parallel_setups=parallel_setups, parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"], vllm_major_versions=["1", "1"],
task=task, runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only, test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format), load_format=load_format),
) )
...@@ -119,7 +119,7 @@ class SPTestSettings: ...@@ -119,7 +119,7 @@ class SPTestSettings:
*, *,
tp_base: int = 2, tp_base: int = 2,
pp_base: int = 1, pp_base: int = 1,
task: TaskOption = "auto", runner: RunnerOption = "auto",
multi_node_only: bool = False, multi_node_only: bool = False,
load_format: Optional[str] = None, load_format: Optional[str] = None,
): ):
...@@ -135,7 +135,7 @@ class SPTestSettings: ...@@ -135,7 +135,7 @@ class SPTestSettings:
parallel_setups=parallel_setups, parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"], vllm_major_versions=["1", "1"],
task=task, runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only, test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format), load_format=load_format),
) )
...@@ -147,7 +147,7 @@ class SPTestSettings: ...@@ -147,7 +147,7 @@ class SPTestSettings:
for backend, vllm_major_version in zip(self.distributed_backends, for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions): self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version, yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts) self.runner, opts)
def _compare_sp( def _compare_sp(
...@@ -155,7 +155,7 @@ def _compare_sp( ...@@ -155,7 +155,7 @@ def _compare_sp(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: SPTestOptions, test_options: SPTestOptions,
num_gpus_available: int, num_gpus_available: int,
*, *,
...@@ -217,8 +217,8 @@ def _compare_sp( ...@@ -217,8 +217,8 @@ def _compare_sp(
common_args.append("--enable-chunked-prefill") common_args.append("--enable-chunked-prefill")
if eager_mode: if eager_mode:
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
if task != "auto": if runner != "auto":
common_args.extend(["--task", task]) common_args.extend(["--runner", runner])
if trust_remote_code: if trust_remote_code:
common_args.append("--trust-remote-code") common_args.append("--trust-remote-code")
if tokenizer_mode: if tokenizer_mode:
...@@ -298,7 +298,7 @@ SP_TEST_MODELS = [ ...@@ -298,7 +298,7 @@ SP_TEST_MODELS = [
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"), "runner", "test_options"),
[ [
params for model_id, settings in SP_TEXT_GENERATION_MODELS.items() params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id) for params in settings.iter_params(model_id)
...@@ -311,7 +311,7 @@ def test_tp_sp_generation( ...@@ -311,7 +311,7 @@ def test_tp_sp_generation(
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
task: TaskOption, runner: RunnerOption,
test_options: SPTestOptions, test_options: SPTestOptions,
num_gpus_available, num_gpus_available,
): ):
...@@ -319,7 +319,7 @@ def test_tp_sp_generation( ...@@ -319,7 +319,7 @@ def test_tp_sp_generation(
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, runner,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate", method="generate",
......
...@@ -19,7 +19,8 @@ MAIN_SCORE = 0.7422994752439667 ...@@ -19,7 +19,8 @@ MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log" "--runner", "pooling", "--enforce-eager",
"--disable-uvicorn-access-log"
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
...@@ -21,7 +21,8 @@ MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" ...@@ -21,7 +21,8 @@ MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "score", "--enforce-eager", "--disable-uvicorn-access-log" "--runner", "pooling", "--enforce-eager",
"--disable-uvicorn-access-log"
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
...@@ -15,10 +15,6 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" ...@@ -15,10 +15,6 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
def get_vocab_size(model_name): def get_vocab_size(model_name):
config = ModelConfig( config = ModelConfig(
model=model_name, model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0, seed=0,
dtype="bfloat16", dtype="bfloat16",
) )
......
...@@ -102,6 +102,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -102,6 +102,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
tokenizer=model_info.tokenizer or model, tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
......
...@@ -33,8 +33,8 @@ def v1(run_with_both_engines): ...@@ -33,8 +33,8 @@ def v1(run_with_both_engines):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"embed", "pooling",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
DTYPE, DTYPE,
......
...@@ -42,8 +42,8 @@ def dtype(request): ...@@ -42,8 +42,8 @@ def dtype(request):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(model_info, dtype: str): def server(model_info, dtype: str):
args = [ args = [
"--task", "--runner",
"embed", "pooling",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
dtype, dtype,
......
...@@ -21,7 +21,7 @@ LONG_TIMEOUT_SECONDS: Final[int] = 60 ...@@ -21,7 +21,7 @@ LONG_TIMEOUT_SECONDS: Final[int] = 60
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"generate", "generate",
"--max-model-len", "--max-model-len",
"2048", "2048",
......
...@@ -27,8 +27,8 @@ def server(request: pytest.FixtureRequest): ...@@ -27,8 +27,8 @@ def server(request: pytest.FixtureRequest):
passed_params = [passed_params] passed_params = [passed_params]
args = [ args = [
"--task", "--runner",
"embed", "pooling",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"float16", "float16",
......
...@@ -20,8 +20,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + ...@@ -20,8 +20,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"reward", "pooling",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
"bfloat16", "bfloat16",
......
...@@ -26,8 +26,8 @@ def v1(run_with_both_engines): ...@@ -26,8 +26,8 @@ def v1(run_with_both_engines):
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"embed", "pooling",
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
DTYPE, DTYPE,
......
...@@ -29,8 +29,8 @@ input = """Immerse yourself in the enchanting chronicle of calculus, a ...@@ -29,8 +29,8 @@ input = """Immerse yourself in the enchanting chronicle of calculus, a
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"embed", "pooling",
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--enforce-eager", "--enforce-eager",
......
...@@ -25,7 +25,7 @@ TEST_VIDEO_URLS = [ ...@@ -25,7 +25,7 @@ TEST_VIDEO_URLS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"generate", "generate",
"--max-model-len", "--max-model-len",
"32768", "32768",
......
...@@ -48,7 +48,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [ ...@@ -48,7 +48,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"generate", "generate",
"--max-model-len", "--max-model-len",
"2048", "2048",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment