Commit 86ae693f (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
......@@ -31,8 +31,8 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
"--runner",
"pooling",
"--max-model-len",
"2048",
"--max-num-seqs",
......
......@@ -47,12 +47,8 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
def phi3v_model_config():
return ModelConfig(PHI3V_MODEL_ID,
task="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
runner="generate",
trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
})
......@@ -61,12 +57,8 @@ def phi3v_model_config():
@pytest.fixture(scope="function")
def phi3v_model_config_mm_interleaved():
return ModelConfig(PHI3V_MODEL_ID,
task="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
runner="generate",
trust_remote_code=True,
dtype="auto",
seed=0,
interleave_mm_strings=True,
limit_mm_per_prompt={
"image": 2,
......@@ -86,11 +78,7 @@ def phi3v_tokenizer():
@pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved():
return ModelConfig(QWEN25OMNI_MODEL_ID,
task="generate",
tokenizer=QWEN25OMNI_MODEL_ID,
tokenizer_mode="auto",
dtype="auto",
seed=0,
runner="generate",
interleave_mm_strings=True,
limit_mm_per_prompt={
"image": 2,
......@@ -112,12 +100,7 @@ def qwen25omni_tokenizer():
@pytest.fixture(scope="module")
def mllama_model_config():
return ModelConfig(MLLAMA_MODEL_ID,
task="generate",
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
runner="generate",
limit_mm_per_prompt={
"image": 2,
})
......@@ -136,12 +119,7 @@ def mllama_tokenizer():
@pytest.fixture(scope="function")
def mistral_model_config():
return ModelConfig(MISTRAL_MODEL_ID,
task="generate",
tokenizer=MISTRAL_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
runner="generate",
limit_mm_per_prompt={
"image": 2,
})
......@@ -1105,12 +1083,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build a config for the model
model_config = ModelConfig(model,
task="generate",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
runner="generate",
limit_mm_per_prompt={
"image": 2,
})
......@@ -1170,6 +1143,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
)
......@@ -1225,6 +1199,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
)
......@@ -1284,6 +1259,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
)
......
......@@ -38,13 +38,8 @@ def test_worker_apply_lora(sql_lora_files):
vllm_config = VllmConfig(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="float16",
revision=None,
enforce_eager=True,
),
load_config=LoadConfig(
......
......@@ -69,10 +69,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
config = ModelConfig(
MODEL_NAME,
task="generate",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
runner="generate",
seed=0,
dtype="bfloat16",
)
......@@ -113,10 +110,7 @@ async def test_guided_logits_processor_with_reasoning(
config = ModelConfig(
REASONING_MODEL_NAME,
task="generate",
tokenizer=REASONING_MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
runner="generate",
seed=0,
dtype="bfloat16",
)
......
......@@ -57,7 +57,6 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
vllm_model.apply_model(check_model)
# assert output
assert output
......@@ -99,7 +98,6 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
vllm_model.apply_model(check_model)
# assert output
assert output
......
......@@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
......
......@@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=None,
**vllm_extra_kwargs) as vllm_model:
......@@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name,
task="score",
runner="pooling",
max_model_len=None,
max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_config.hf_config.num_labels == 1
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
......
......@@ -85,7 +85,7 @@ def test_models(
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model,
task="embed",
runner="pooling",
max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts)
......
......@@ -28,10 +28,7 @@ def test_find_array():
model_config = ModelConfig(
MODEL_NAME,
task="embed",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
runner="pooling",
dtype="bfloat16",
seed=0,
)
......@@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner):
with vllm_runner(
MODEL_NAME,
task="embed",
runner="pooling",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm
......@@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner):
async def test_gritlm_api_server_embedding():
queries, q_instruction, documents, d_instruction = get_test_data()
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as server:
client_embedding = server.get_async_client()
......@@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
with vllm_runner(
MODEL_NAME,
task="generate",
runner="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.llm
......@@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as server:
client_generate = server.get_async_client()
......
......@@ -4,6 +4,7 @@ from functools import partial
import pytest
import vllm.envs as envs
from vllm import PoolingParams
from ...utils import EmbedModelInfo, RerankModelInfo
......@@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
......@@ -92,7 +97,7 @@ def test_matryoshka(
hf_outputs = matryoshka_fy(hf_outputs, dimensions)
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
dtype=dtype,
max_model_len=None) as vllm_model:
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
......
......@@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed",
with vllm_runner(model_info.name, runner="pooling",
max_model_len=None) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
......@@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, task="embed",
with vllm_runner(model_info.name, runner="pooling",
max_model_len=256) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
......@@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
with pytest.raises(ValueError):
with vllm_runner(model_info.name, task="embed",
with vllm_runner(model_info.name,
runner="pooling",
max_model_len=1024):
pass
else:
with vllm_runner(model_info.name, task="embed",
with vllm_runner(model_info.name, runner="pooling",
max_model_len=1024) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
......@@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048
with pytest.raises(ValueError):
with vllm_runner(model_info.name, task="embed", max_model_len=4096):
with vllm_runner(model_info.name, runner="pooling",
max_model_len=4096):
pass
# set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
pass
......@@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
}
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
pass
......@@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
# illegal max_model_len
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=max_model_len + 1,
hf_overrides=hf_overrides):
pass
......@@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
# illegal max_model_len by hf_overrides
with pytest.raises(ValueError):
with vllm_runner(model_info.name,
task="embed",
runner="pooling",
max_model_len=None,
hf_overrides=hf_overrides):
pass
......@@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE,
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
......@@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE,
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
......@@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE,
with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
......@@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
]
with vllm_runner(emb_model_name,
task="embed",
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
......@@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
]
with vllm_runner(emb_model_name,
task="embed",
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
......@@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
]
with vllm_runner(emb_model_name,
task="embed",
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
......
......@@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner,
truncate_prompt_tokens = 10
with vllm_runner(model_name, task="embed",
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
......@@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner,
input_str=input_str):
truncate_prompt_tokens = -1
with vllm_runner(model_name, task="embed",
with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
......@@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner,
truncate_prompt_tokens = max_model_len + 1
with pytest.raises(ValueError), vllm_runner(
model_name, task="embed",
model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.llm.encode(
......
......@@ -222,7 +222,6 @@ VLM_TEST_SETTINGS = {
},
marks=[large_gpu_mark(min_gb=32)],
),
# Check "auto" with fallback to transformers
"internvl-transformers": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
......@@ -232,7 +231,7 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "auto",
"model_impl": "transformers",
},
auto_cls=AutoModelForImageTextToText,
marks=[pytest.mark.core_model],
......@@ -638,7 +637,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
max_model_len=4096,
max_num_seqs=2,
task="generate",
runner="generate",
# use sdpa mode for hf runner since phi3v didn't work with flash_attn
hf_model_kwargs={"_attn_implementation": "sdpa"},
use_tokenizer_eos=True,
......
......@@ -65,7 +65,7 @@ def run_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
......
......@@ -48,7 +48,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
]
with vllm_runner(model,
task="generate",
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
......
......@@ -99,7 +99,7 @@ def run_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
......
......@@ -267,7 +267,7 @@ def run_embedding_input_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
task="generate",
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
......
......@@ -6,7 +6,7 @@ from typing import Any, Callable, Optional
import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.config import RunnerOption
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner
......@@ -37,7 +37,7 @@ def run_test(
vllm_runner_kwargs: Optional[dict[str, Any]],
hf_model_kwargs: Optional[dict[str, Any]],
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
task: TaskOption = "auto",
runner: RunnerOption = "auto",
distributed_executor_backend: Optional[str] = None,
tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None,
......@@ -83,7 +83,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
task=task,
runner=runner,
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
......
......@@ -11,7 +11,7 @@ from pytest import MarkDecorator
from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.config import RunnerOption
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
......@@ -109,7 +109,7 @@ class VLMTestInfo(NamedTuple):
enforce_eager: bool = True
max_model_len: int = 1024
max_num_seqs: int = 256
task: TaskOption = "auto"
runner: RunnerOption = "auto"
tensor_parallel_size: int = 1
vllm_runner_kwargs: Optional[dict[str, Any]] = None
......@@ -173,7 +173,7 @@ class VLMTestInfo(NamedTuple):
"enforce_eager": self.enforce_eager,
"max_model_len": self.max_model_len,
"max_num_seqs": self.max_num_seqs,
"task": self.task,
"runner": self.runner,
"tensor_parallel_size": self.tensor_parallel_size,
"vllm_runner_kwargs": self.vllm_runner_kwargs,
"hf_output_post_proc": self.hf_output_post_proc,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment