"tests/vscode:/vscode.git/clone" did not exist on "febdc998d84bdaf7cfddacdc972d07989f94285b"
Unverified commit 86ae693f, authored by Cyrus Leung and committed by GitHub
Browse files

[Deprecation][2/N] Replace `--task` with `--runner` and `--convert` (#21470)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8f605ee3
...@@ -31,8 +31,8 @@ TEST_IMAGE_URLS = [ ...@@ -31,8 +31,8 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = [
"--task", "--runner",
"embed", "pooling",
"--max-model-len", "--max-model-len",
"2048", "2048",
"--max-num-seqs", "--max-num-seqs",
......
...@@ -47,12 +47,8 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" ...@@ -47,12 +47,8 @@ MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def phi3v_model_config(): def phi3v_model_config():
return ModelConfig(PHI3V_MODEL_ID, return ModelConfig(PHI3V_MODEL_ID,
task="generate", runner="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
}) })
...@@ -61,12 +57,8 @@ def phi3v_model_config(): ...@@ -61,12 +57,8 @@ def phi3v_model_config():
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def phi3v_model_config_mm_interleaved(): def phi3v_model_config_mm_interleaved():
return ModelConfig(PHI3V_MODEL_ID, return ModelConfig(PHI3V_MODEL_ID,
task="generate", runner="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="auto",
seed=0,
interleave_mm_strings=True, interleave_mm_strings=True,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -86,11 +78,7 @@ def phi3v_tokenizer(): ...@@ -86,11 +78,7 @@ def phi3v_tokenizer():
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def qwen25omni_model_config_mm_interleaved(): def qwen25omni_model_config_mm_interleaved():
return ModelConfig(QWEN25OMNI_MODEL_ID, return ModelConfig(QWEN25OMNI_MODEL_ID,
task="generate", runner="generate",
tokenizer=QWEN25OMNI_MODEL_ID,
tokenizer_mode="auto",
dtype="auto",
seed=0,
interleave_mm_strings=True, interleave_mm_strings=True,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -112,12 +100,7 @@ def qwen25omni_tokenizer(): ...@@ -112,12 +100,7 @@ def qwen25omni_tokenizer():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def mllama_model_config(): def mllama_model_config():
return ModelConfig(MLLAMA_MODEL_ID, return ModelConfig(MLLAMA_MODEL_ID,
task="generate", runner="generate",
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
}) })
...@@ -136,12 +119,7 @@ def mllama_tokenizer(): ...@@ -136,12 +119,7 @@ def mllama_tokenizer():
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def mistral_model_config(): def mistral_model_config():
return ModelConfig(MISTRAL_MODEL_ID, return ModelConfig(MISTRAL_MODEL_ID,
task="generate", runner="generate",
tokenizer=MISTRAL_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
}) })
...@@ -1105,12 +1083,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): ...@@ -1105,12 +1083,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
# Build a config for the model # Build a config for the model
model_config = ModelConfig(model, model_config = ModelConfig(model,
task="generate", runner="generate",
tokenizer=model,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="auto",
seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
}) })
...@@ -1170,6 +1143,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): ...@@ -1170,6 +1143,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
model, model,
tokenizer=model_info.tokenizer or model, tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
...@@ -1225,6 +1199,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -1225,6 +1199,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
model, model,
tokenizer=model_info.tokenizer or model, tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
...@@ -1284,6 +1259,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): ...@@ -1284,6 +1259,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
model, model,
tokenizer=model_info.tokenizer or model, tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode, tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) )
......
...@@ -38,13 +38,8 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -38,13 +38,8 @@ def test_worker_apply_lora(sql_lora_files):
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=ModelConfig( model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0, seed=0,
dtype="float16", dtype="float16",
revision=None,
enforce_eager=True, enforce_eager=True,
), ),
load_config=LoadConfig( load_config=LoadConfig(
......
...@@ -69,10 +69,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool, ...@@ -69,10 +69,7 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
config = ModelConfig( config = ModelConfig(
MODEL_NAME, MODEL_NAME,
task="generate", runner="generate",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0, seed=0,
dtype="bfloat16", dtype="bfloat16",
) )
...@@ -113,10 +110,7 @@ async def test_guided_logits_processor_with_reasoning( ...@@ -113,10 +110,7 @@ async def test_guided_logits_processor_with_reasoning(
config = ModelConfig( config = ModelConfig(
REASONING_MODEL_NAME, REASONING_MODEL_NAME,
task="generate", runner="generate",
tokenizer=REASONING_MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0, seed=0,
dtype="bfloat16", dtype="bfloat16",
) )
......
...@@ -57,7 +57,6 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -57,7 +57,6 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
# assert output
assert output assert output
...@@ -99,7 +98,6 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): ...@@ -99,7 +98,6 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
vllm_model.apply_model(check_model) vllm_model.apply_model(check_model)
# assert output
assert output assert output
......
...@@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner, ...@@ -52,7 +52,7 @@ def correctness_test_embed_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=None, max_model_len=None,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts) vllm_outputs = vllm_model.embed(example_prompts)
......
...@@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner, ...@@ -172,7 +172,7 @@ def mteb_test_embed_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=None, max_model_len=None,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
...@@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner, ...@@ -279,15 +279,12 @@ def mteb_test_rerank_models(hf_runner,
vllm_extra_kwargs["dtype"] = model_info.dtype vllm_extra_kwargs["dtype"] = model_info.dtype
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="score", runner="pooling",
max_model_len=None, max_model_len=None,
max_num_seqs=8, max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
assert model_config.hf_config.num_labels == 1 assert model_config.hf_config.num_labels == 1
vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model), vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
......
...@@ -85,7 +85,7 @@ def test_models( ...@@ -85,7 +85,7 @@ def test_models(
hf_outputs = hf_model.encode(example_prompts) hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model, with vllm_runner(model,
task="embed", runner="pooling",
max_model_len=max_model_len, max_model_len=max_model_len,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
vllm_outputs = vllm_model.embed(example_prompts) vllm_outputs = vllm_model.embed(example_prompts)
......
...@@ -28,10 +28,7 @@ def test_find_array(): ...@@ -28,10 +28,7 @@ def test_find_array():
model_config = ModelConfig( model_config = ModelConfig(
MODEL_NAME, MODEL_NAME,
task="embed", runner="pooling",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="bfloat16", dtype="bfloat16",
seed=0, seed=0,
) )
...@@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner): ...@@ -117,7 +114,7 @@ def test_gritlm_offline_embedding(vllm_runner):
with vllm_runner( with vllm_runner(
MODEL_NAME, MODEL_NAME,
task="embed", runner="pooling",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.llm llm = vllm_model.llm
...@@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner): ...@@ -140,7 +137,7 @@ def test_gritlm_offline_embedding(vllm_runner):
async def test_gritlm_api_server_embedding(): async def test_gritlm_api_server_embedding():
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)] args = ["--runner", "pooling", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as server: with RemoteOpenAIServer(MODEL_NAME, args) as server:
client_embedding = server.get_async_client() client_embedding = server.get_async_client()
...@@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): ...@@ -164,7 +161,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
with vllm_runner( with vllm_runner(
MODEL_NAME, MODEL_NAME,
task="generate", runner="generate",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.llm llm = vllm_model.llm
...@@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): ...@@ -179,7 +176,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
async def test_gritlm_api_server_generate(): async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)] args = ["--runner", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with RemoteOpenAIServer(MODEL_NAME, args) as server: with RemoteOpenAIServer(MODEL_NAME, args) as server:
client_generate = server.get_async_client() client_generate = server.get_async_client()
......
...@@ -4,6 +4,7 @@ from functools import partial ...@@ -4,6 +4,7 @@ from functools import partial
import pytest import pytest
import vllm.envs as envs
from vllm import PoolingParams from vllm import PoolingParams
from ...utils import EmbedModelInfo, RerankModelInfo from ...utils import EmbedModelInfo, RerankModelInfo
...@@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner, ...@@ -62,6 +63,10 @@ def test_embed_models_correctness(hf_runner, vllm_runner,
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(hf_runner, vllm_runner, def test_rerank_models_mteb(hf_runner, vllm_runner,
model_info: RerankModelInfo) -> None: model_info: RerankModelInfo) -> None:
if (model_info.architecture == "XLMRobertaForSequenceClassification"
and envs.VLLM_USE_V1):
pytest.skip("Not supported yet")
mteb_test_rerank_models(hf_runner, vllm_runner, model_info) mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
...@@ -92,7 +97,7 @@ def test_matryoshka( ...@@ -92,7 +97,7 @@ def test_matryoshka(
hf_outputs = matryoshka_fy(hf_outputs, dimensions) hf_outputs = matryoshka_fy(hf_outputs, dimensions)
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
dtype=dtype, dtype=dtype,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
assert vllm_model.llm.llm_engine.model_config.is_matryoshka assert vllm_model.llm.llm_engine.model_config.is_matryoshka
......
...@@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor) ...@@ -21,7 +21,7 @@ max_model_len = int(original_max_position_embeddings * factor)
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_default(model_info, vllm_runner): def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, runner="pooling",
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
...@@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner): ...@@ -36,7 +36,7 @@ def test_default(model_info, vllm_runner):
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_set_max_model_len_legal(model_info, vllm_runner): def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512 # set max_model_len <= 512
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, runner="pooling",
max_model_len=256) as vllm_model: max_model_len=256) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256 assert model_config.max_model_len == 256
...@@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -46,11 +46,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# For nomic-embed-text-v2-moe the length is set to 512 # For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json. # by sentence_bert_config.json.
with pytest.raises(ValueError): with pytest.raises(ValueError):
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name,
runner="pooling",
max_model_len=1024): max_model_len=1024):
pass pass
else: else:
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, runner="pooling",
max_model_len=1024) as vllm_model: max_model_len=1024) as vllm_model:
model_config = vllm_model.llm.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024 assert model_config.max_model_len == 1024
...@@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -60,14 +61,15 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
def test_set_max_model_len_illegal(model_info, vllm_runner): def test_set_max_model_len_illegal(model_info, vllm_runner):
# set max_model_len > 2048 # set max_model_len > 2048
with pytest.raises(ValueError): with pytest.raises(ValueError):
with vllm_runner(model_info.name, task="embed", max_model_len=4096): with vllm_runner(model_info.name, runner="pooling",
max_model_len=4096):
pass pass
# set max_model_len > 2048 by hf_overrides # set max_model_len > 2048 by hf_overrides
hf_overrides = {"max_model_len": 4096} hf_overrides = {"max_model_len": 4096}
with pytest.raises(ValueError): with pytest.raises(ValueError):
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=None, max_model_len=None,
hf_overrides=hf_overrides): hf_overrides=hf_overrides):
pass pass
...@@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner): ...@@ -87,7 +89,7 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
} }
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=None, max_model_len=None,
hf_overrides=hf_overrides): hf_overrides=hf_overrides):
pass pass
...@@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): ...@@ -107,7 +109,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
# illegal max_model_len # illegal max_model_len
with pytest.raises(ValueError): with pytest.raises(ValueError):
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=max_model_len + 1, max_model_len=max_model_len + 1,
hf_overrides=hf_overrides): hf_overrides=hf_overrides):
pass pass
...@@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner): ...@@ -125,7 +127,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
# illegal max_model_len by hf_overrides # illegal max_model_len by hf_overrides
with pytest.raises(ValueError): with pytest.raises(ValueError):
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", runner="pooling",
max_model_len=None, max_model_len=None,
hf_overrides=hf_overrides): hf_overrides=hf_overrides):
pass pass
...@@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): ...@@ -37,7 +37,9 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist() hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE, with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
...@@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): ...@@ -56,7 +58,9 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist() hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE, with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
...@@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): ...@@ -76,7 +80,9 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist() hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=DTYPE, with vllm_runner(model_name,
runner="pooling",
dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
...@@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): ...@@ -103,7 +109,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
] ]
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", runner="pooling",
dtype=DTYPE, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
...@@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): ...@@ -131,7 +137,7 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
] ]
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", runner="pooling",
dtype=DTYPE, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
...@@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): ...@@ -160,7 +166,7 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
] ]
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", runner="pooling",
dtype=DTYPE, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
......
...@@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner, ...@@ -26,7 +26,7 @@ def test_smaller_truncation_size(vllm_runner,
truncate_prompt_tokens = 10 truncate_prompt_tokens = 10
with vllm_runner(model_name, task="embed", with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode( vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens) input_str, truncate_prompt_tokens=truncate_prompt_tokens)
...@@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner, ...@@ -41,7 +41,7 @@ def test_max_truncation_size(vllm_runner,
input_str=input_str): input_str=input_str):
truncate_prompt_tokens = -1 truncate_prompt_tokens = -1
with vllm_runner(model_name, task="embed", with vllm_runner(model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.llm.encode( vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens) input_str, truncate_prompt_tokens=truncate_prompt_tokens)
...@@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner, ...@@ -58,7 +58,7 @@ def test_bigger_truncation_size(vllm_runner,
truncate_prompt_tokens = max_model_len + 1 truncate_prompt_tokens = max_model_len + 1
with pytest.raises(ValueError), vllm_runner( with pytest.raises(ValueError), vllm_runner(
model_name, task="embed", model_name, runner="pooling",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.llm.encode( llm_output = vllm_model.llm.encode(
......
...@@ -222,7 +222,6 @@ VLM_TEST_SETTINGS = { ...@@ -222,7 +222,6 @@ VLM_TEST_SETTINGS = {
}, },
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
# Check "auto" with fallback to transformers
"internvl-transformers": VLMTestInfo( "internvl-transformers": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"], models=["OpenGVLab/InternVL3-1B-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -232,7 +231,7 @@ VLM_TEST_SETTINGS = { ...@@ -232,7 +231,7 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True, use_tokenizer_eos=True,
image_size_factors=[(0.25, 0.5, 1.0)], image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={ vllm_runner_kwargs={
"model_impl": "auto", "model_impl": "transformers",
}, },
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
marks=[pytest.mark.core_model], marks=[pytest.mark.core_model],
...@@ -638,7 +637,7 @@ VLM_TEST_SETTINGS = { ...@@ -638,7 +637,7 @@ VLM_TEST_SETTINGS = {
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n", img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
task="generate", runner="generate",
# use sdpa mode for hf runner since phi3v didn't work with flash_attn # use sdpa mode for hf runner since phi3v didn't work with flash_attn
hf_model_kwargs={"_attn_implementation": "sdpa"}, hf_model_kwargs={"_attn_implementation": "sdpa"},
use_tokenizer_eos=True, use_tokenizer_eos=True,
......
...@@ -65,7 +65,7 @@ def run_test( ...@@ -65,7 +65,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner( with vllm_runner(
model, model,
task="generate", runner="generate",
max_model_len=max_model_len, max_model_len=max_model_len,
max_num_seqs=1, max_num_seqs=1,
dtype=dtype, dtype=dtype,
......
...@@ -48,7 +48,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: ...@@ -48,7 +48,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
] ]
with vllm_runner(model, with vllm_runner(model,
task="generate", runner="generate",
dtype=dtype, dtype=dtype,
limit_mm_per_prompt={"image": 2}, limit_mm_per_prompt={"image": 2},
max_model_len=32768, max_model_len=32768,
......
...@@ -99,7 +99,7 @@ def run_test( ...@@ -99,7 +99,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner( with vllm_runner(
model, model,
task="generate", runner="generate",
max_model_len=max_model_len, max_model_len=max_model_len,
max_num_seqs=2, max_num_seqs=2,
dtype=dtype, dtype=dtype,
......
...@@ -267,7 +267,7 @@ def run_embedding_input_test( ...@@ -267,7 +267,7 @@ def run_embedding_input_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
task="generate", runner="generate",
max_model_len=4000, max_model_len=4000,
max_num_seqs=3, max_num_seqs=3,
dtype=dtype, dtype=dtype,
......
...@@ -6,7 +6,7 @@ from typing import Any, Callable, Optional ...@@ -6,7 +6,7 @@ from typing import Any, Callable, Optional
import torch import torch
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import RunnerOption
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner from .....conftest import HfRunner, VllmRunner
...@@ -37,7 +37,7 @@ def run_test( ...@@ -37,7 +37,7 @@ def run_test(
vllm_runner_kwargs: Optional[dict[str, Any]], vllm_runner_kwargs: Optional[dict[str, Any]],
hf_model_kwargs: Optional[dict[str, Any]], hf_model_kwargs: Optional[dict[str, Any]],
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
task: TaskOption = "auto", runner: RunnerOption = "auto",
distributed_executor_backend: Optional[str] = None, distributed_executor_backend: Optional[str] = None,
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None, vllm_embeddings: Optional[torch.Tensor] = None,
...@@ -83,7 +83,7 @@ def run_test( ...@@ -83,7 +83,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
task=task, runner=runner,
**vllm_runner_kwargs_) as vllm_model: **vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
......
...@@ -11,7 +11,7 @@ from pytest import MarkDecorator ...@@ -11,7 +11,7 @@ from pytest import MarkDecorator
from transformers import AutoModelForCausalLM from transformers import AutoModelForCausalLM
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import RunnerOption
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
...@@ -109,7 +109,7 @@ class VLMTestInfo(NamedTuple): ...@@ -109,7 +109,7 @@ class VLMTestInfo(NamedTuple):
enforce_eager: bool = True enforce_eager: bool = True
max_model_len: int = 1024 max_model_len: int = 1024
max_num_seqs: int = 256 max_num_seqs: int = 256
task: TaskOption = "auto" runner: RunnerOption = "auto"
tensor_parallel_size: int = 1 tensor_parallel_size: int = 1
vllm_runner_kwargs: Optional[dict[str, Any]] = None vllm_runner_kwargs: Optional[dict[str, Any]] = None
...@@ -173,7 +173,7 @@ class VLMTestInfo(NamedTuple): ...@@ -173,7 +173,7 @@ class VLMTestInfo(NamedTuple):
"enforce_eager": self.enforce_eager, "enforce_eager": self.enforce_eager,
"max_model_len": self.max_model_len, "max_model_len": self.max_model_len,
"max_num_seqs": self.max_num_seqs, "max_num_seqs": self.max_num_seqs,
"task": self.task, "runner": self.runner,
"tensor_parallel_size": self.tensor_parallel_size, "tensor_parallel_size": self.tensor_parallel_size,
"vllm_runner_kwargs": self.vllm_runner_kwargs, "vllm_runner_kwargs": self.vllm_runner_kwargs,
"hf_output_post_proc": self.hf_output_post_proc, "hf_output_post_proc": self.hf_output_post_proc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment