Unverified commit 86ae693f, authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
......@@ -62,8 +62,8 @@ class TestSetting:
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=[
"--task", "embed", "--dtype", "bfloat16", "--max-model-len",
"2048"
"--runner", "pooling", "--dtype", "bfloat16",
"--max-model-len", "2048"
],
pp_size=1,
tp_size=1,
......@@ -75,7 +75,7 @@ class TestSetting:
# # encoder-based embedding model (BERT)
# TestSetting(
# model="BAAI/bge-base-en-v1.5",
# model_args=["--task", "embed"],
# model_args=["--runner", "pooling"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
......
......@@ -125,9 +125,6 @@ def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
# in the vllm_config, it's not really used.
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config.model_config = ModelConfig(model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=True,
dtype=dtype,
seed=42)
......
......@@ -250,9 +250,6 @@ def sequence_parallelism_pass_on_test_model(
# in the vllm_config, it's not really used.
model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
vllm_config.model_config = ModelConfig(model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=True,
dtype=dtype,
seed=42)
......
......@@ -23,7 +23,7 @@ from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, _get_and_verify_dtype
from vllm.config import ConvertOption, RunnerOption, _get_and_verify_dtype
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
......@@ -769,7 +769,8 @@ class VllmRunner:
def __init__(
self,
model_name: str,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
convert: ConvertOption = "auto",
tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = True,
......@@ -786,7 +787,8 @@ class VllmRunner:
) -> None:
self.llm = LLM(
model=model_name,
task=task,
runner=runner,
convert=convert,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
......
......@@ -6,7 +6,7 @@ from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import TaskOption
from vllm.config import RunnerOption
from vllm.logger import init_logger
from ..utils import compare_two_settings, create_new_process_for_each_test
......@@ -31,14 +31,14 @@ class EPTestOptions(NamedTuple):
class EPTestSettings:
parallel_setups: list[ParallelSetup]
distributed_backends: list[str]
task: TaskOption
runner: RunnerOption
test_options: EPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 2,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
......@@ -63,7 +63,7 @@ class EPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
task=task,
runner=runner,
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
......@@ -74,7 +74,7 @@ class EPTestSettings:
def fast(
*,
tp_base: int = 2,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
......@@ -87,7 +87,7 @@ class EPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp"],
task=task,
runner=runner,
test_options=EPTestOptions(trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
......@@ -100,7 +100,7 @@ class EPTestSettings:
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
self.task, opts)
self.runner, opts)
# NOTE: You can adjust tp_base locally to fit the model in GPU
......@@ -118,7 +118,7 @@ def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
runner: RunnerOption,
test_options: EPTestOptions,
num_gpus_available: int,
*,
......@@ -154,8 +154,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
......@@ -203,7 +203,7 @@ def _compare_tp(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", "task",
("model_name", "parallel_setup", "distributed_backend", "runner",
"test_options"),
[
params for model_name, settings in TEST_MODELS.items()
......@@ -215,14 +215,14 @@ def test_ep(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
task: TaskOption,
runner: RunnerOption,
test_options: EPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
task,
runner,
test_options,
num_gpus_available,
method="generate")
......@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, RunnerOption
from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
......@@ -60,7 +60,7 @@ class PPTestSettings:
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
task: TaskOption
runner: RunnerOption
test_options: PPTestOptions
def __post_init__(self):
......@@ -76,7 +76,7 @@ class PPTestSettings:
tp_base: int = 1,
pp_base: int = 2,
multi_node_only: bool = False,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
):
return PPTestSettings(
......@@ -104,7 +104,7 @@ class PPTestSettings:
],
distributed_backends=["mp", "mp", "ray", "ray"],
vllm_major_versions=["0", "1", "0", "1"],
task=task,
runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
......@@ -114,7 +114,7 @@ class PPTestSettings:
*,
tp_base: int = 1,
pp_base: int = 2,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
......@@ -127,7 +127,7 @@ class PPTestSettings:
],
distributed_backends=["mp"],
vllm_major_versions=["0"],
task=task,
runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
......@@ -139,7 +139,7 @@ class PPTestSettings:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts)
self.runner, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
......@@ -211,10 +211,10 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(runner="pooling"),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
load_format="dummy", task="embed"
load_format="dummy", runner="pooling"
),
}
......@@ -269,7 +269,7 @@ def _compare_tp(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available: int,
*,
......@@ -335,8 +335,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
......@@ -415,7 +415,7 @@ def _compare_tp(
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
"runner", "test_options"),
[
params for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
......@@ -427,7 +427,7 @@ def test_tp_language_generation(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
......@@ -435,7 +435,7 @@ def test_tp_language_generation(
parallel_setup,
distributed_backend,
vllm_major_version,
task,
runner,
test_options,
num_gpus_available,
method="generate",
......@@ -444,7 +444,7 @@ def test_tp_language_generation(
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
"runner", "test_options"),
[
params for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
......@@ -456,7 +456,7 @@ def test_tp_language_embedding(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
......@@ -464,7 +464,7 @@ def test_tp_language_embedding(
parallel_setup,
distributed_backend,
vllm_major_version,
task,
runner,
test_options,
num_gpus_available,
method="encode",
......@@ -473,7 +473,7 @@ def test_tp_language_embedding(
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
"runner", "test_options"),
[
params for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
......@@ -485,7 +485,7 @@ def test_tp_multimodal_generation(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: PPTestOptions,
num_gpus_available,
):
......@@ -493,7 +493,7 @@ def test_tp_multimodal_generation(
parallel_setup,
distributed_backend,
vllm_major_version,
task,
runner,
test_options,
num_gpus_available,
method="generate",
......
......@@ -14,7 +14,7 @@ from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import TaskOption
from vllm.config import RunnerOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
......@@ -48,7 +48,7 @@ class SPTestSettings:
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
task: TaskOption
runner: RunnerOption
test_options: SPTestOptions
def __post_init__(self):
......@@ -64,7 +64,7 @@ class SPTestSettings:
tp_base: int = 2,
pp_base: int = 1,
multi_node_only: bool = False,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
):
parallel_setups = []
......@@ -81,7 +81,7 @@ class SPTestSettings:
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
task=task,
runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
......@@ -91,7 +91,7 @@ class SPTestSettings:
*,
tp_base: int = 2,
pp_base: int = 1,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
......@@ -109,7 +109,7 @@ class SPTestSettings:
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
task=task,
runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
......@@ -119,7 +119,7 @@ class SPTestSettings:
*,
tp_base: int = 2,
pp_base: int = 1,
task: TaskOption = "auto",
runner: RunnerOption = "auto",
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
......@@ -135,7 +135,7 @@ class SPTestSettings:
parallel_setups=parallel_setups,
distributed_backends=["mp", "ray"],
vllm_major_versions=["1", "1"],
task=task,
runner=runner,
test_options=SPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
......@@ -147,7 +147,7 @@ class SPTestSettings:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts)
self.runner, opts)
def _compare_sp(
......@@ -155,7 +155,7 @@ def _compare_sp(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available: int,
*,
......@@ -217,8 +217,8 @@ def _compare_sp(
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
......@@ -298,7 +298,7 @@ SP_TEST_MODELS = [
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
"runner", "test_options"),
[
params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
......@@ -311,7 +311,7 @@ def test_tp_sp_generation(
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
task: TaskOption,
runner: RunnerOption,
test_options: SPTestOptions,
num_gpus_available,
):
......@@ -319,7 +319,7 @@ def test_tp_sp_generation(
parallel_setup,
distributed_backend,
vllm_major_version,
task,
runner,
test_options,
num_gpus_available,
method="generate",
......
......@@ -19,7 +19,8 @@ MAIN_SCORE = 0.7422994752439667
@pytest.fixture(scope="module")
def server():
args = [
"--task", "embed", "--enforce-eager", "--disable-uvicorn-access-log"
"--runner", "pooling", "--enforce-eager",
"--disable-uvicorn-access-log"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
......@@ -21,7 +21,8 @@ MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@pytest.fixture(scope="module")
def server():
args = [
"--task", "score", "--enforce-eager", "--disable-uvicorn-access-log"
"--runner", "pooling", "--enforce-eager",
"--disable-uvicorn-access-log"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
......@@ -15,10 +15,6 @@ MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
def get_vocab_size(model_name):
config = ModelConfig(
model=model_name,
task="auto",
tokenizer=model_name,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="bfloat16",
)
......
......@@ -102,6 +102,7 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides,
)
......
......@@ -33,8 +33,8 @@ def v1(run_with_both_engines):
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
......
......@@ -42,8 +42,8 @@ def dtype(request):
@pytest.fixture(scope="module")
def server(model_info, dtype: str):
args = [
"--task",
"embed",
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
dtype,
......
......@@ -21,7 +21,7 @@ LONG_TIMEOUT_SECONDS: Final[int] = 60
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"--runner",
"generate",
"--max-model-len",
"2048",
......
......@@ -27,8 +27,8 @@ def server(request: pytest.FixtureRequest):
passed_params = [passed_params]
args = [
"--task",
"embed",
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
......
......@@ -20,8 +20,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"reward",
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
......
......@@ -26,8 +26,8 @@ def v1(run_with_both_engines):
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
"--runner",
"pooling",
# use half precision for speed and memory savings in CI environment
"--dtype",
DTYPE,
......
......@@ -29,8 +29,8 @@ input = """Immerse yourself in the enchanting chronicle of calculus, a
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
"--runner",
"pooling",
"--dtype",
"bfloat16",
"--enforce-eager",
......
......@@ -25,7 +25,7 @@ TEST_VIDEO_URLS = [
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"--runner",
"generate",
"--max-model-len",
"32768",
......
......@@ -48,7 +48,7 @@ EXPECTED_MM_BEAM_SEARCH_RES = [
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"--runner",
"generate",
"--max-model-len",
"2048",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment