Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_embedding.py`.
"""
import pytest import pytest
from vllm.config import PoolerConfig from vllm.config import PoolerConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -7,12 +7,10 @@ from array import array ...@@ -7,12 +7,10 @@ from array import array
import openai import openai
import pytest import pytest
import pytest_asyncio
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
import vllm from vllm import LLM, SamplingParams
import vllm.config from vllm.config import ModelConfig
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
...@@ -31,73 +29,45 @@ def _arr(arr): ...@@ -31,73 +29,45 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch: pytest.MonkeyPatch): def test_find_array():
# GritLM embedding implementation is only supported by XFormers backend. from vllm.model_executor.models.gritlm import GritLMPooler
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def server_generate():
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
model_config = ModelConfig(
MODEL_NAME,
task="embed",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="bfloat16",
seed=0,
)
pooler = GritLMPooler(model_config=model_config)
@pytest_asyncio.fixture arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
async def client_embedding(server_embedding: RemoteOpenAIServer):
async with server_embedding.get_async_client() as async_client:
yield async_client
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
@pytest_asyncio.fixture with pytest.raises(ValueError):
async def client_generate(server_generate: RemoteOpenAIServer): pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
async with server_generate.get_async_client() as async_client:
yield async_client
def run_llm_encode( def run_llm_encode(
llm: vllm.LLM, llm: LLM,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.embed([instruction + q for q in queries])
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings( async def run_client_embeddings(
client: vllm.LLM, client: openai.AsyncOpenAI,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
...@@ -132,7 +102,7 @@ def get_test_data(): ...@@ -132,7 +102,7 @@ def get_test_data():
return queries, q_instruction, documents, d_instruction return queries, q_instruction, documents, d_instruction
def validate_embed_output(q_rep: list[float], d_rep: list[float]): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
...@@ -143,17 +113,18 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): ...@@ -143,17 +113,18 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() def test_gritlm_offline_embedding(vllm_runner):
queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) with vllm_runner(
MODEL_NAME,
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
...@@ -166,47 +137,62 @@ def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): ...@@ -166,47 +137,62 @@ def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
q_instruction, q_instruction,
) )
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_embedding( async def test_gritlm_api_server_embedding():
client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings( args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
client_embedding,
documents, with RemoteOpenAIServer(MODEL_NAME, args) as server:
d_instruction, client_embedding = server.get_async_client()
)
q_rep = await run_client_embeddings( d_rep = await run_client_embeddings(
client_embedding, client_embedding,
queries, documents,
q_instruction, d_instruction,
) )
q_rep = await run_client_embeddings(
client_embedding,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
def test_gritlm_offline_gen(): def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN) with vllm_runner(
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256) MODEL_NAME,
outputs = llm.generate(input, sampling_params=sampling_params) task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
assert outputs[0].outputs[0].text == "The capital of France is Paris." assert outputs[0].outputs[0].text == "The capital of France is Paris."
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI): async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
outputs = await client_generate.completions.create( args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
model=MODEL_NAME,
prompt=input, with RemoteOpenAIServer(MODEL_NAME, args) as server:
max_tokens=256, client_generate = server.get_async_client()
temperature=0.0,
) outputs = await client_generate.completions.create(
model=MODEL_NAME,
prompt=input,
max_tokens=256,
temperature=0.0,
)
assert outputs.choices[0].text == "The capital of France is Paris." assert outputs.choices[0].text == "The capital of France is Paris."
# SPDX-License-Identifier: Apache-2.0
from typing import Any
import pytest
from ...utils import EmbedModelInfo, run_embedding_correctness_test
# GTE-family embedding models exercised by the tests in this module.
# Entries are grouped by architecture; an entry with enable_test=False is
# skipped by test_models_correctness.
MODELS = [
########## BertModel
EmbedModelInfo("thenlper/gte-large",
architecture="BertModel",
dtype="float32",
enable_test=True),
EmbedModelInfo("thenlper/gte-base",
architecture="BertModel",
dtype="float32",
enable_test=False),
EmbedModelInfo("thenlper/gte-small",
architecture="BertModel",
dtype="float32",
enable_test=False),
EmbedModelInfo("thenlper/gte-large-zh",
architecture="BertModel",
dtype="float32",
enable_test=False),
EmbedModelInfo("thenlper/gte-base-zh",
architecture="BertModel",
dtype="float32",
enable_test=False),
EmbedModelInfo("thenlper/gte-small-zh",
architecture="BertModel",
dtype="float32",
enable_test=False),
########### NewModel
# These entries get an explicit "architectures" hf_override in the tests.
EmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
enable_test=True),
EmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
enable_test=True),
EmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
enable_test=True),
########### Qwen2ForCausalLM
# The 1.5B variant is run with hf_overrides={"is_causal": True} in the tests.
EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
architecture="Qwen2ForCausalLM",
enable_test=True),
EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct",
architecture="Qwen2ForCausalLM",
enable_test=False),
########## ModernBertModel
EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
architecture="ModernBertModel",
enable_test=True),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    """Run the MTEB comparison for one GTE model (currently always skipped).

    The unconditional ``pytest.skip`` fires before anything else runs, so
    the rest of the body only records how the check is wired up for when
    the skip is lifted.
    """
    pytest.skip("Skipping mteb test.")

    from .mteb_utils import mteb_test_embed_models as run_mteb_check

    # Pick the HF config override for this model. The architecture rule is
    # tested first because it takes precedence when both rules would apply.
    overrides: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
        overrides = {"architectures": ["GteNewModel"]}
    elif model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
        overrides = {"is_causal": True}

    extra_kwargs: dict[str, Any] = (
        {"hf_overrides": overrides} if overrides else {})

    run_mteb_check(hf_runner, vllm_runner, model_info, extra_kwargs)
@pytest.mark.parametrize("model_info", MODELS)
def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
                            example_prompts) -> None:
    """Check vLLM embeddings for a GTE model against sentence-transformers.

    Models flagged with ``enable_test=False`` are skipped. The vLLM outputs
    are produced first, then handed to ``run_embedding_correctness_test``
    together with the HF runner.
    """
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    # ST will strip the input texts, see test_embedding.py
    prompts = [str(prompt).strip() for prompt in example_prompts]

    # Pick the HF config override for this model. The architecture rule is
    # tested first because it takes precedence when both rules would apply.
    overrides: dict[str, Any] = {}
    if model_info.architecture == "GteNewModel":
        overrides = {"architectures": ["GteNewModel"]}
    elif model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
        overrides = {"is_causal": True}

    extra_kwargs: dict[str, Any] = (
        {"hf_overrides": overrides} if overrides else {})

    with vllm_runner(model_info.name,
                     task="embed",
                     dtype=model_info.dtype,
                     max_model_len=None,
                     **extra_kwargs) as vllm_model:
        vllm_embeddings = vllm_model.encode(prompts)

    hf_context = hf_runner(
        model_info.name,
        dtype=model_info.dtype,
        is_sentence_transformer=True,
    )
    with hf_context as hf_model:
        run_embedding_correctness_test(hf_model, prompts, vllm_embeddings)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# ruff: noqa: E501
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_jina.py`.
"""
import math import math
import pytest import pytest
from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
from vllm import PoolingParams from vllm import PoolingParams
from ...utils import check_embeddings_close, matryoshka_fy
SCORING_MODELS = [ SCORING_MODELS = [
"jinaai/jina-reranker-v2-base-multilingual", # Roberta "jinaai/jina-reranker-v2-base-multilingual", # Roberta
] ]
...@@ -21,9 +17,9 @@ TEXTS_2 = [ ...@@ -21,9 +17,9 @@ TEXTS_2 = [
"Organic skincare for sensitive skin with aloe vera and chamomile.", "Organic skincare for sensitive skin with aloe vera and chamomile.",
"New makeup trends focus on bold colors and innovative techniques", "New makeup trends focus on bold colors and innovative techniques",
"Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille", "Bio-Hautpflege für empfindliche Haut mit Aloe Vera und Kamille",
"Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", "Neue Make-up-Trends setzen auf kräftige Farben und innovative Techniken", # noqa: E501
"Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", "Cuidado de la piel orgánico para piel sensible con aloe vera y manzanilla", # noqa: E501
"Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", "Las nuevas tendencias de maquillaje se centran en colores vivos y técnicas innovadoras", # noqa: E501
"针对敏感肌专门设计的天然有机护肤产品", "针对敏感肌专门设计的天然有机护肤产品",
"新的化妆趋势注重鲜艳的颜色和创新的技巧", "新的化妆趋势注重鲜艳的颜色和创新的技巧",
"敏感肌のために特別に設計された天然有機スキンケア製品", "敏感肌のために特別に設計された天然有機スキンケア製品",
......
# SPDX-License-Identifier: Apache-2.0
import pytest
from ...utils import EmbedModelInfo, run_embedding_correctness_test
# Nomic embedding models exercised by the tests in this module; an entry with
# enable_test=False is skipped by test_models_correctness.
MODELS = [
EmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
dtype="float32",
enable_test=True),
EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
dtype="float32",
enable_test=False),
EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
dtype="float32",
enable_test=True)
]
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    """Run the MTEB comparison for one Nomic model (currently always skipped).

    The unconditional ``pytest.skip`` fires before anything else runs, so
    the import and call below are not reached while the skip is in place.
    """
    pytest.skip("Skipping mteb test.")

    from .mteb_utils import mteb_test_embed_models as run_mteb_check

    run_mteb_check(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
                            example_prompts) -> None:
    """Check vLLM embeddings for a Nomic model against sentence-transformers.

    Models flagged with ``enable_test=False`` are skipped. The vLLM outputs
    are produced first, then handed to ``run_embedding_correctness_test``
    together with the HF runner.
    """
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    # ST will strip the input texts, see test_embedding.py
    prompts = [str(prompt).strip() for prompt in example_prompts]

    vllm_context = vllm_runner(model_info.name,
                               task="embed",
                               dtype=model_info.dtype,
                               max_model_len=None)
    with vllm_context as vllm_model:
        vllm_embeddings = vllm_model.encode(prompts)

    hf_context = hf_runner(
        model_info.name,
        dtype=model_info.dtype,
        is_sentence_transformer=True,
    )
    with hf_context as hf_model:
        run_embedding_correctness_test(hf_model, prompts, vllm_embeddings)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the scoring outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_scoring.py`.
"""
import math import math
import pytest import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
MODELS = [ CROSS_ENCODER_MODELS = [
"cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert
"BAAI/bge-reranker-v2-m3", # Roberta "BAAI/bge-reranker-v2-m3", # Roberta
] ]
...@@ -28,21 +24,21 @@ TEXTS_2 = [ ...@@ -28,21 +24,21 @@ TEXTS_2 = [
"The capital of Germany is Berlin.", "The capital of Germany is Berlin.",
] ]
DTYPE = "half"
@pytest.fixture(scope="module", params=MODELS) @pytest.fixture(scope="module", params=CROSS_ENCODER_MODELS)
def model_name(request): def model_name(request):
yield request.param yield request.param
@pytest.mark.parametrize("dtype", ["half"]) def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
text_pair = [TEXTS_1[0], TEXTS_2[0]] text_pair = [TEXTS_1[0], TEXTS_2[0]]
with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict([text_pair]).tolist() hf_outputs = hf_model.predict([text_pair]).tolist()
with vllm_runner(model_name, task="score", dtype=dtype, with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
...@@ -52,18 +48,16 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -52,18 +48,16 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"]) def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
text_pairs = [ text_pairs = [
[TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]], [TEXTS_1[0], TEXTS_2[1]],
] ]
with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist() hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=dtype, with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
...@@ -74,18 +68,16 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -74,18 +68,16 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"]) def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str):
text_pairs = [ text_pairs = [
[TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]], [TEXTS_1[1], TEXTS_2[1]],
] ]
with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: with hf_runner(model_name, dtype=DTYPE, is_cross_encoder=True) as hf_model:
hf_outputs = hf_model.predict(text_pairs).tolist() hf_outputs = hf_model.predict(text_pairs).tolist()
with vllm_runner(model_name, task="score", dtype=dtype, with vllm_runner(model_name, task="score", dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
...@@ -101,13 +93,10 @@ def emb_model_name(request): ...@@ -101,13 +93,10 @@ def emb_model_name(request):
yield request.param yield request.param
@pytest.mark.parametrize("dtype", ["half"]) def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):
text_pair = [TEXTS_1[0], TEXTS_2[0]] text_pair = [TEXTS_1[0], TEXTS_2[0]]
with hf_runner(emb_model_name, dtype=dtype, with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model: is_sentence_transformer=True) as hf_model:
hf_embeddings = hf_model.encode(text_pair) hf_embeddings = hf_model.encode(text_pair)
hf_outputs = [ hf_outputs = [
...@@ -116,7 +105,7 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -116,7 +105,7 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", task="embed",
dtype=dtype, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) vllm_outputs = vllm_model.score(text_pair[0], text_pair[1])
...@@ -126,16 +115,13 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -126,16 +115,13 @@ def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name,
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"]) def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):
text_pairs = [ text_pairs = [
[TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[0], TEXTS_2[1]], [TEXTS_1[0], TEXTS_2[1]],
] ]
with hf_runner(emb_model_name, dtype=dtype, with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model: is_sentence_transformer=True) as hf_model:
hf_embeddings = [ hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs hf_model.encode(text_pair) for text_pair in text_pairs
...@@ -147,7 +133,7 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -147,7 +133,7 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", task="embed",
dtype=dtype, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2)
...@@ -158,16 +144,13 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -158,16 +144,13 @@ def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
@pytest.mark.parametrize("dtype", ["half"]) def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
dtype: str):
text_pairs = [ text_pairs = [
[TEXTS_1[0], TEXTS_2[0]], [TEXTS_1[0], TEXTS_2[0]],
[TEXTS_1[1], TEXTS_2[1]], [TEXTS_1[1], TEXTS_2[1]],
] ]
with hf_runner(emb_model_name, dtype=dtype, with hf_runner(emb_model_name, dtype=DTYPE,
is_sentence_transformer=True) as hf_model: is_sentence_transformer=True) as hf_model:
hf_embeddings = [ hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs hf_model.encode(text_pair) for text_pair in text_pairs
...@@ -179,7 +162,7 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, ...@@ -179,7 +162,7 @@ def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name,
with vllm_runner(emb_model_name, with vllm_runner(emb_model_name,
task="embed", task="embed",
dtype=dtype, dtype=DTYPE,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the embedding outputs of HF and vLLM models.
Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
"""
import pytest import pytest
from tests.models.embedding.utils import EmbedModelInfo from ...utils import EmbedModelInfo, run_embedding_correctness_test
from ..utils import check_embeddings_close
EMBEDDING_PROMPTS = [
'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',
'Mexico City of Course!'
]
MODELS = [ MODELS = [
EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
...@@ -51,51 +41,38 @@ MODELS = [ ...@@ -51,51 +41,38 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) def test_models_mteb(
def test_models(
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts,
model_info: EmbedModelInfo, model_info: EmbedModelInfo,
dtype: str,
monkeypatch,
) -> None: ) -> None:
if not model_info.enable_test: pytest.skip("Skipping mteb test.")
# A model family has many models with the same architecture, from .mteb_utils import mteb_test_embed_models
# and we don't need to test each one. mteb_test_embed_models(hf_runner, vllm_runner, model_info)
pytest.skip("Skipping test.")
example_prompts = example_prompts + EMBEDDING_PROMPTS
vllm_extra_kwargs = { @pytest.mark.parametrize("model_info", MODELS)
"hf_overrides": { def test_models_correctness(
"is_matryoshka": model_info.is_matryoshka hf_runner,
} vllm_runner,
} model_info: EmbedModelInfo,
example_prompts,
) -> None:
if not model_info.enable_test:
pytest.skip("Skipping test.")
with hf_runner(model_info.name, dtype=dtype, # ST will strip the input texts, see test_embedding.py
is_sentence_transformer=True) as hf_model: example_prompts = [str(s).strip() for s in example_prompts]
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model_info.name, with vllm_runner(model_info.name,
task="embed", task="embed",
dtype=dtype, dtype=model_info.dtype,
max_model_len=None, max_model_len=None) as vllm_model:
**vllm_extra_kwargs) as vllm_model:
assert (vllm_model.model.llm_engine.model_config.is_matryoshka ==
model_info.is_matryoshka)
if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
check_embeddings_close( with hf_runner(
embeddings_0_lst=hf_outputs, model_info.name,
embeddings_1_lst=vllm_outputs, dtype=model_info.dtype,
name_0="hf", is_sentence_transformer=True,
name_1="vllm", ) as hf_model:
tol=1e-2, run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs)
)
# SPDX-License-Identifier: Apache-2.0
import pytest
# Embedding model loaded by every truncation test in this module.
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
# Context length passed to vllm_runner; the tests compare prompt token counts
# and truncation sizes against this value.
max_model_len = 128
# Prompt shared by the tests. test_max_truncation_size asserts it is cut to
# exactly max_model_len tokens, so it presumably tokenizes to more than that
# -- confirm if the text or tokenizer changes.
input_str = """Immerse yourself in the enchanting chronicle of calculus, a
mathematical domain that has radically transformed our comprehension of
change and motion. Despite its roots in ancient civilizations, the
formal birth of calculus predominantly occurred in the 17th century,
primarily under the influential guidance of Sir Isaac Newton and Gottfried
Wilhelm Leibniz. The earliest traces of calculus concepts are found in
ancient Greek mathematics,most notably in the works of Eudoxus and
Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a
technique for computing areas and volumes through the use of finite sums.
This methodology laid crucial foundational work for integral calculus.
In the 17th century, both Newton and Leibniz independently pioneered
calculus, each contributing unique perspectives that would shape this new
field."""
def test_smaller_truncation_size(vllm_runner,
                                 model_name=MODEL_NAME,
                                 input_str=input_str):
    """Truncating to fewer tokens than the context length is honoured.

    Encodes ``input_str`` with ``truncate_prompt_tokens=10`` and checks that
    the first output carries exactly that many prompt token ids.
    """
    token_limit = 10

    runner = vllm_runner(model_name, task="embed",
                         max_model_len=max_model_len)
    with runner as vllm_model:
        outputs = vllm_model.model.encode(
            input_str, truncate_prompt_tokens=token_limit)

    assert len(outputs[0].prompt_token_ids) == token_limit
def test_max_truncation_size(vllm_runner,
                             model_name=MODEL_NAME,
                             input_str=input_str):
    """A truncation size of -1 is expected to clamp to ``max_model_len``.

    Encodes ``input_str`` with ``truncate_prompt_tokens=-1`` and checks that
    the first output carries exactly ``max_model_len`` prompt token ids.
    """
    use_model_limit = -1

    runner = vllm_runner(model_name, task="embed",
                         max_model_len=max_model_len)
    with runner as vllm_model:
        outputs = vllm_model.model.encode(
            input_str, truncate_prompt_tokens=use_model_limit)

    assert len(outputs[0].prompt_token_ids) == max_model_len
def test_bigger_truncation_size(vllm_runner,
                                model_name=MODEL_NAME,
                                input_str=input_str):
    """A truncation size above ``max_model_len`` must be rejected.

    Encodes ``input_str`` with ``truncate_prompt_tokens=max_model_len + 1``
    and expects the encode call to raise ``ValueError``.

    The previous version compared the return value of ``encode`` with the
    expected error text *after* the call that raises, so that comparison
    never ran. The message is now checked through ``pytest.raises(match=...)``.
    """
    truncate_prompt_tokens = max_model_len + 1

    with vllm_runner(model_name, task="embed",
                     max_model_len=max_model_len) as vllm_model:
        # Only the encode call is allowed to raise. Keeping model start-up
        # outside pytest.raises prevents an unrelated ValueError during
        # construction from making this test pass by accident.
        # NOTE(review): the match assumes the error message names the
        # offending parameter, as the old (unreachable) comparison text
        # implied -- confirm against the validation code.
        with pytest.raises(ValueError, match="truncate_prompt_tokens"):
            vllm_model.model.encode(
                input_str, truncate_prompt_tokens=truncate_prompt_tokens)
...@@ -8,13 +8,14 @@ from collections import defaultdict ...@@ -8,13 +8,14 @@ from collections import defaultdict
from pathlib import PosixPath from pathlib import PosixPath
import pytest import pytest
from transformers import AutoModelForImageTextToText, AutoModelForVision2Seq from transformers import (AutoModel, AutoModelForImageTextToText,
AutoModelForTextToWaveform, AutoModelForVision2Seq)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import identity from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets, from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
_VideoAssets) ImageTestAssets, VideoTestAssets, VllmRunner)
from ....utils import (create_new_process_for_each_test, large_gpu_mark, from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks) multi_gpu_marks)
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
...@@ -140,7 +141,7 @@ VLM_TEST_SETTINGS = { ...@@ -140,7 +141,7 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
"qwen2_5_omni": VLMTestInfo( "qwen2_5_omni": VLMTestInfo(
models=["Qwen/Qwen2.5-Omni-7B"], models=["Qwen/Qwen2.5-Omni-3B"],
test_type=( test_type=(
VLMTestType.IMAGE, VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
...@@ -151,11 +152,23 @@ VLM_TEST_SETTINGS = { ...@@ -151,11 +152,23 @@ VLM_TEST_SETTINGS = {
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForTextToWaveform,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
"ultravox": VLMTestInfo(
models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
test_type=VLMTestType.AUDIO,
prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
audio_idx_to_prompt=lambda idx: "<|audio|>",
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModel,
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
models=["rhymes-ai/Aria"], models=["rhymes-ai/Aria"],
...@@ -267,6 +280,7 @@ VLM_TEST_SETTINGS = { ...@@ -267,6 +280,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
...@@ -390,7 +404,6 @@ VLM_TEST_SETTINGS = { ...@@ -390,7 +404,6 @@ VLM_TEST_SETTINGS = {
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
), ),
limit_mm_per_prompt={"video": 4}, limit_mm_per_prompt={"video": 4},
runner_mm_key="videos",
)], )],
), ),
"llava_next_video": VLMTestInfo( "llava_next_video": VLMTestInfo(
...@@ -423,6 +436,8 @@ VLM_TEST_SETTINGS = { ...@@ -423,6 +436,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=["openbmb/MiniCPM-o-2_6"],
...@@ -434,6 +449,8 @@ VLM_TEST_SETTINGS = { ...@@ -434,6 +449,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"], models=["openbmb/MiniCPM-V-2_6"],
...@@ -445,6 +462,21 @@ VLM_TEST_SETTINGS = { ...@@ -445,6 +462,21 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
),
"minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"],
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
img_idx_to_prompt=lambda _: "<image>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)],
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
...@@ -454,6 +486,43 @@ VLM_TEST_SETTINGS = { ...@@ -454,6 +486,43 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
patch_hf_runner=model_utils.molmo_patch_hf_runner, patch_hf_runner=model_utils.molmo_patch_hf_runner,
), ),
"ovis1_6-gemma2": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="half",
# use sdpa mode for hf runner since ovis2 didn't work with flash_attn
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
patch_hf_runner=model_utils.ovis_patch_hf_runner,
marks=[large_gpu_mark(min_gb=32)],
),
"ovis1_6": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Llama3.2-3B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful and honest multimodal assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="half",
# use sdpa mode for hf runner since ovis2 didn't work with flash_attn
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
patch_hf_runner=model_utils.ovis_patch_hf_runner,
),
"ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="half",
# use sdpa mode for hf runner since ovis2 didn't work with flash_attn
hf_model_kwargs={"llm_attn_implementation": "sdpa"},
patch_hf_runner=model_utils.ovis_patch_hf_runner,
),
"phi3v": VLMTestInfo( "phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"], models=["microsoft/Phi-3.5-vision-instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -663,6 +732,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2) ...@@ -663,6 +732,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
# - multi-image # - multi-image
# - image embeddings # - image embeddings
# - video # - video
# - audio
# - custom inputs # - custom inputs
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_type,test_case", "model_type,test_case",
...@@ -675,7 +745,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ...@@ -675,7 +745,7 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -700,7 +770,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ...@@ -700,7 +770,7 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -725,7 +795,7 @@ def test_image_embedding_models(model_type: str, ...@@ -725,7 +795,7 @@ def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -747,7 +817,7 @@ def test_image_embedding_models(model_type: str, ...@@ -747,7 +817,7 @@ def test_image_embedding_models(model_type: str,
)) ))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets, monkeypatch): video_assets: VideoTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -760,6 +830,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -760,6 +830,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ))
def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
                      audio_assets: AudioTestAssets, monkeypatch):
    """Run the audio test case for one parametrized model type.

    Models listed in ``REQUIRES_V0_MODELS`` get ``VLLM_USE_V1=0`` set for
    the duration of the test; the settings for ``model_type`` are then
    handed to ``runners.run_audio_test`` together with both runners and the
    audio assets.
    """
    needs_v0_engine = model_type in REQUIRES_V0_MODELS
    if needs_v0_engine:
        monkeypatch.setenv("VLLM_USE_V1", "0")

    run_kwargs = dict(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )
    runners.run_audio_test(**run_kwargs)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_type,test_case", "model_type,test_case",
get_parametrized_options( get_parametrized_options(
...@@ -798,7 +890,7 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -798,7 +890,7 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -824,7 +916,7 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ...@@ -824,7 +916,7 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -850,7 +942,8 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -850,7 +942,8 @@ def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, monkeypatch): image_assets: ImageTestAssets,
monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -873,7 +966,7 @@ def test_image_embedding_models_heavy(model_type: str, ...@@ -873,7 +966,7 @@ def test_image_embedding_models_heavy(model_type: str,
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
video_assets: _VideoAssets, monkeypatch): video_assets: VideoTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS: if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0") monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type] model_test_info = VLM_TEST_SETTINGS[model_type]
...@@ -886,6 +979,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ...@@ -886,6 +979,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
) )
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ))
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
                            hf_runner: type[HfRunner],
                            vllm_runner: type[VllmRunner],
                            audio_assets: AudioTestAssets, monkeypatch):
    """Run the audio test case for one model type (new-process variant).

    Identical in body to the non-heavy audio test; the parametrization is
    built with ``create_new_process_for_each_test=True``. Models listed in
    ``REQUIRES_V0_MODELS`` get ``VLLM_USE_V1=0`` before the run.
    """
    needs_v0_engine = model_type in REQUIRES_V0_MODELS
    if needs_v0_engine:
        monkeypatch.setenv("VLLM_USE_V1", "0")

    run_kwargs = dict(
        model_test_info=VLM_TEST_SETTINGS[model_type],
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )
    runners.run_audio_test(**run_kwargs)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_type,test_case", "model_type,test_case",
get_parametrized_options( get_parametrized_options(
......
...@@ -9,7 +9,7 @@ from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt ...@@ -9,7 +9,7 @@ from vllm.inputs.data import ExplicitEncoderDecoderPrompt, TextPrompt
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, ImageTestAssets, VllmRunner
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
MODELS = ["microsoft/Florence-2-base"] MODELS = ["microsoft/Florence-2-base"]
...@@ -118,7 +118,7 @@ def run_test( ...@@ -118,7 +118,7 @@ def run_test(
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], def test_models(hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, model: str, image_assets: ImageTestAssets, model: str,
size_factors: list[int], dtype: str, max_tokens: int, size_factors: list[int], dtype: str, max_tokens: int,
num_logprobs: int) -> None: num_logprobs: int) -> None:
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
......
...@@ -9,7 +9,8 @@ from transformers import AutoModelForSpeechSeq2Seq ...@@ -9,7 +9,8 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, PromptAudioInput, VllmRunner, _AudioAssets from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
VllmRunner)
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -116,9 +117,9 @@ def run_test( ...@@ -116,9 +117,9 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [2048]) @pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10]) @pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, model: str, audio_assets: _AudioAssets, def test_models(hf_runner, vllm_runner, model: str,
dtype: str, max_model_len: int, max_tokens: int, audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
num_logprobs: int) -> None: max_tokens: int, num_logprobs: int) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip") model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip") model_info.check_transformers_version(on_fail="skip")
......
...@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n") ...@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n") NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"]) @pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
...@@ -28,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: ...@@ -28,7 +29,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB") image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
image_stop = ImageAsset("stop_sign").pil_image.convert("RGB") image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
images = [image_cherry, image_stop] images = [image_cherry, image_stop]
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
inputs = [ inputs = [
( (
......
...@@ -14,10 +14,11 @@ from vllm.model_executor.models.mllama import MllamaForConditionalGeneration ...@@ -14,10 +14,11 @@ from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
_ImageAssets) PromptImageInput, VllmRunner)
from ....quantization.utils import is_quant_method_supported from ....quantization.utils import is_quant_method_supported
from ....utils import large_gpu_test from ....utils import (create_new_process_for_each_test, large_gpu_test,
multi_gpu_test)
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 3 _LIMIT_IMAGE_PER_PROMPT = 3
...@@ -89,7 +90,7 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str, ...@@ -89,7 +90,7 @@ def vllm_to_hf_output(vllm_output: tuple[list[int], str,
def _get_inputs( def _get_inputs(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
*, *,
size_factors: Optional[list[float]] = None, size_factors: Optional[list[float]] = None,
sizes: Optional[list[tuple[int, int]]] = None, sizes: Optional[list[tuple[int, int]]] = None,
...@@ -125,7 +126,7 @@ def _get_inputs( ...@@ -125,7 +126,7 @@ def _get_inputs(
def run_test( def run_test(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model: str, model: str,
*, *,
size_factors: list[float], size_factors: list[float],
...@@ -142,7 +143,7 @@ def run_test( ...@@ -142,7 +143,7 @@ def run_test(
def run_test( def run_test(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model: str, model: str,
*, *,
sizes: list[tuple[int, int]], sizes: list[tuple[int, int]],
...@@ -158,7 +159,7 @@ def run_test( ...@@ -158,7 +159,7 @@ def run_test(
def run_test( def run_test(
hf_runner: type[HfRunner], hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model: str, model: str,
*, *,
size_factors: Optional[list[float]] = None, size_factors: Optional[list[float]] = None,
...@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ...@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
) )
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    image_assets,
    distributed_executor_backend,
    model,
    dtype,
    max_tokens,
    num_logprobs,
) -> None:
    """Run the shared ``run_test`` with tensor parallelism across two GPUs.

    Parametrized over the ``ray`` and ``mp`` executor backends; images are
    exercised at size factors 0.25, 0.5 and 1.0.
    """
    # Settings fixed for the distributed variant of the test.
    distributed_options = {
        "size_factors": [0.25, 0.5, 1.0],
        "tensor_parallel_size": 2,
        "distributed_executor_backend": distributed_executor_backend,
    }
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **distributed_options,
    )
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
...@@ -401,7 +433,7 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ...@@ -401,7 +433,7 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
def test_bnb_regression( def test_bnb_regression(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
...@@ -441,7 +473,7 @@ def test_bnb_regression( ...@@ -441,7 +473,7 @@ def test_bnb_regression(
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
def test_explicit_implicit_prompt( def test_explicit_implicit_prompt(
image_assets: _ImageAssets, image_assets: ImageTestAssets,
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import json import json
from dataclasses import asdict from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Optional from typing import TYPE_CHECKING, Any, Optional
......
...@@ -50,7 +50,7 @@ IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -50,7 +50,7 @@ IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
}) })
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"sample_demo_1": "baby_reading":
qwen2_vl_chat_template( qwen2_vl_chat_template(
VIDEO_PLACEHOLDER, VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ", "Describe this video with a short sentence ",
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Any, Optional from typing import Any
import numpy as np import numpy as np
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from transformers import AutoModel, AutoTokenizer from transformers import AutoTokenizer
from vllm.multimodal.audio import resample_audio_librosa from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner
from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, VllmRunner, _AudioAssets
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
# Question text for each audio asset, keyed by the asset's name.
# NOTE(review): the exact return shape of `AUDIO_ASSETS.prompts` is not
# visible here -- confirm against its definition in conftest.
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
"mary_had_lamb":
"Transcribe this into English.",
"winning_call":
"What is happening in this audio clip?",
})
# Question text used when a single prompt refers to several audio clips.
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
AudioTuple = tuple[np.ndarray, int] AudioTuple = tuple[np.ndarray, int]
VLLM_PLACEHOLDER = "<|audio|>" VLLM_PLACEHOLDER = "<|audio|>"
...@@ -31,12 +36,6 @@ CHUNKED_PREFILL_KWARGS = { ...@@ -31,12 +36,6 @@ CHUNKED_PREFILL_KWARGS = {
} }
@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call"))
def audio(request):
    """Provide an ``AudioAsset`` for each parametrized asset name."""
    # Kept as a function-scope import, as in the original fixture.
    from vllm.assets.audio import AudioAsset

    asset_name = request.param
    return AudioAsset(asset_name)
def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
"""Convert kwargs to CLI args.""" """Convert kwargs to CLI args."""
args = [] args = []
...@@ -53,7 +52,7 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]: ...@@ -53,7 +52,7 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
pytest.param({}, marks=pytest.mark.cpu_model), pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS), pytest.param(CHUNKED_PREFILL_KWARGS),
]) ])
def server(request, audio_assets: _AudioAssets): def server(request, audio_assets: AudioTestAssets):
args = [ args = [
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager", "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
...@@ -85,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder): ...@@ -85,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder):
add_generation_prompt=True) add_generation_prompt=True)
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Convert a vLLM output triple into the form compared against HF.

    Args:
        vllm_output: ``(token ids, decoded text, logprobs)`` from vLLM.
        model: Model name used to load the tokenizer.

    Returns:
        A copy of the token ids, the text (with the decoded EOS token
        appended when the last token id is EOS), and the logprobs unchanged.
    """
    token_ids, text, logprobs = vllm_output

    tok = AutoTokenizer.from_pretrained(model)
    eos_id = tok.eos_token_id

    ids_copy = token_ids[:]
    ends_with_eos = ids_copy[-1] == eos_id
    if ends_with_eos:
        # Append the EOS text when the sequence ends on EOS, so the string
        # lines up with the HF output it is compared against.
        text = text + tok.decode(eos_id)

    return ids_copy, text, logprobs
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    prompts_and_audios: list[tuple[str, str, AudioTuple]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    **kwargs,
):
    """Check that HF and vLLM produce close logprobs for each audio prompt.

    Args:
        hf_runner: Context-manager factory for the HF model.
        vllm_runner: Context-manager factory for the vLLM model.
        prompts_and_audios: ``(vllm prompt, hf prompt, audio)`` triples,
            where ``audio`` is indexed as ``(samples, sample rate)``.
        model: Model name passed to both runners.
        dtype: Dtype passed to both runners.
        max_tokens: Maximum number of tokens to generate.
        num_logprobs: Number of logprobs requested per token.
        **kwargs: Extra keyword arguments forwarded to ``vllm_runner``.
    """
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # NOTE: order matters -- vLLM runs first, HF second.
    # vLLM needs a fresh process without CUDA initialization; running HF
    # first would initialize CUDA and hurt the multiprocessing backend with
    # the fork start method (the default).
    vllm_outputs_per_audio = []
    with vllm_runner(model, dtype=dtype, enforce_eager=True,
                     **kwargs) as vllm_model:
        for vllm_prompt, _, audio in prompts_and_audios:
            generated = vllm_model.generate_greedy_logprobs(
                [vllm_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[audio])
            vllm_outputs_per_audio.append(generated)

    hf_outputs_per_audio = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for _, hf_prompt, audio in prompts_and_audios:
            # The HF side is fed audio resampled to 16 kHz.
            resampled = resample_audio_librosa(audio[0],
                                               orig_sr=audio[1],
                                               target_sr=16000)
            generated = hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
                audios=[(resampled, 16000)])
            hf_outputs_per_audio.append(generated)

    paired_outputs = zip(hf_outputs_per_audio, vllm_outputs_per_audio)
    for hf_outputs, vllm_outputs in paired_outputs:
        converted_vllm_outputs = []
        for vllm_output in vllm_outputs:
            converted_vllm_outputs.append(
                vllm_to_hf_output(vllm_output, model))
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=converted_vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
def run_multi_audio_test( def run_multi_audio_test(
vllm_runner: type[VllmRunner], vllm_runner: type[VllmRunner],
prompts_and_audios: list[tuple[str, list[AudioTuple]]], prompts_and_audios: list[tuple[str, list[AudioTuple]]],
...@@ -191,31 +117,6 @@ def run_multi_audio_test( ...@@ -191,31 +117,6 @@ def run_multi_audio_test(
assert all(tokens for tokens, *_ in vllm_outputs) assert all(tokens for tokens, *_ in vllm_outputs)
@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
    pytest.param({}, marks=pytest.mark.cpu_model),
    pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
                num_logprobs: int, vllm_kwargs: dict) -> None:
    """Compare HF and vLLM on a single-audio prompt for ``MODEL_NAME``.

    The same question is rendered once with the vLLM placeholder and once
    with the HF placeholder; ``vllm_kwargs`` is forwarded to the vLLM
    runner (empty, or the chunked-prefill settings).
    """
    question = "Describe the audio above."
    vllm_prompt = _get_prompt(1, question, VLLM_PLACEHOLDER)
    hf_prompt = _get_prompt(1, question, HF_PLACEHOLDER)

    single_case = (vllm_prompt, hf_prompt, audio.audio_and_sample_rate)
    run_test(
        hf_runner,
        vllm_runner,
        [single_case],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        **vllm_kwargs,
    )
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
...@@ -224,13 +125,12 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, ...@@ -224,13 +125,12 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
pytest.param({}, marks=pytest.mark.cpu_model), pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS), pytest.param(CHUNKED_PREFILL_KWARGS),
]) ])
def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets, def test_models_with_multiple_audios(vllm_runner,
dtype: str, max_tokens: int, audio_assets: AudioTestAssets, dtype: str,
num_logprobs: int, max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None: vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(len(audio_assets), vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
"Describe each of the audios above.",
VLLM_PLACEHOLDER) VLLM_PLACEHOLDER)
run_multi_audio_test( run_multi_audio_test(
vllm_runner, vllm_runner,
...@@ -245,7 +145,7 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets, ...@@ -245,7 +145,7 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets: _AudioAssets,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_online_serving(client, audio_assets: _AudioAssets): async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled.""" """Exercises online serving with/without chunked prefill enabled."""
messages = [{ messages = [{
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional from typing import Optional
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ....conftest import VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [ PROMPTS = [
...@@ -92,6 +89,7 @@ EXPECTED = { ...@@ -92,6 +89,7 @@ EXPECTED = {
def run_test( def run_test(
vllm_runner: type[VllmRunner],
model: str, model: str,
*, *,
tensor_parallel_size: int, tensor_parallel_size: int,
...@@ -100,38 +98,52 @@ def run_test( ...@@ -100,38 +98,52 @@ def run_test(
prompt_list = PROMPTS * 10 prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10 expected_list = EXPECTED[model] * 10
llm = LLM( with vllm_runner(
model=model, model,
tensor_parallel_size=tensor_parallel_size, max_model_len=448,
distributed_executor_backend=distributed_executor_backend, tensor_parallel_size=tensor_parallel_size,
) distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
top_p=1.0, top_p=1.0,
max_tokens=200, max_tokens=200,
) )
outputs = llm.generate(prompt_list, sampling_params) outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list): for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text) print(output.outputs[0].text)
assert output.outputs[0].text == expected assert output.outputs[0].text == expected
@pytest.mark.core_model
@pytest.mark.parametrize(
    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
    """Check Whisper output text for ``model`` on a single device.

    Delegates to ``run_test`` with ``tensor_parallel_size=1``. The
    ``vllm_runner`` fixture is passed through because ``run_test`` takes
    the runner as its first argument.
    """
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=1,
    )
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@create_new_process_for_each_test()
def test_models_distributed(
    vllm_runner,
    model,
    distributed_executor_backend,
) -> None:
    """Check Whisper output text for ``model`` with tensor parallelism of 2.

    Delegates to ``run_test`` with ``tensor_parallel_size=2`` and the
    parametrized ``distributed_executor_backend`` ("ray" or "mp"). The
    ``vllm_runner`` fixture is passed through because ``run_test`` takes
    the runner as its first argument.
    """
    run_test(
        vllm_runner,
        model,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment