"vscode:/vscode.git/clone" did not exist on "f8a1e39fae05ca610be8d5a78be9d40f5274e5fc"
Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import GenerateModelInfo
from .ppl_utils import wikitext_ppl_test
# Models exercised by the perplexity test in this module.
MODELS = [
    GenerateModelInfo("openai-community/gpt2-large"),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run ``wikitext_ppl_test`` once for every entry of ``MODELS``.

    The ``hf_runner`` and ``vllm_runner`` fixtures are forwarded unchanged,
    together with the parametrized ``model_info``.
    """
    wikitext_ppl_test(hf_runner, vllm_runner, model_info)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.utils import GenerateModelInfo
from .ppl_utils import wikitext_ppl_test
# Models exercised by the perplexity test in this module.
MODELS = [
    GenerateModelInfo("Qwen/Qwen3-0.6B"),
    GenerateModelInfo("Qwen/Qwen3-0.6B-FP8"),
    # Disabled: loading a GPTQ quantized model through transformers
    # requires the extra packages `optimum` and `gptqmodel`.
    # GenerateModelInfo("Qwen/Qwen3-0.6B-GPTQ-Int8"),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run ``wikitext_ppl_test`` once for every entry of ``MODELS``.

    The ``hf_runner`` and ``vllm_runner`` fixtures are forwarded unchanged,
    together with the parametrized ``model_info``.
    """
    wikitext_ppl_test(hf_runner, vllm_runner, model_info)
......@@ -35,10 +35,7 @@ def correctness_test_embed_models(hf_runner,
example_prompts,
vllm_extra_kwargs=None,
hf_model_callback=None):
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")
pytest.skip("Debug only, ci prefers to use mteb test.")
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
......@@ -62,7 +59,7 @@ def correctness_test_embed_models(hf_runner,
with hf_runner(
model_info.name,
dtype="float32",
dtype=model_info.hf_dtype,
is_sentence_transformer=True,
) as hf_model:
......
......@@ -7,7 +7,7 @@ import pytest
from vllm.config import PoolerConfig
from vllm.platforms import current_platform
from ...utils import check_embeddings_close, check_transformers_version
from ...utils import check_embeddings_close
@pytest.mark.parametrize(
......@@ -27,12 +27,17 @@ from ...utils import check_embeddings_close, check_transformers_version
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base",
marks=[pytest.mark.cpu_model]),
# [Encoder-only]
pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model]),
pytest.param(
"BAAI/bge-base-en-v1.5",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
pytest.param(
"sentence-transformers/stsb-roberta-base-v2",
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
],
)
def test_models(
......@@ -42,8 +47,6 @@ def test_models(
model,
monkeypatch,
) -> None:
if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model, max_transformers_version="4.53.2")
if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
......
......@@ -9,8 +9,10 @@ import mteb
import numpy as np
import pytest
import requests
import torch
from tests.models.utils import EmbedModelInfo, RerankModelInfo
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
check_embeddings_close)
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
......@@ -18,7 +20,7 @@ from tests.models.utils import EmbedModelInfo, RerankModelInfo
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 0.02
MTEB_EMBED_TOL = 1e-4
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
......@@ -163,15 +165,20 @@ def mteb_test_embed_models(hf_runner,
model_info: EmbedModelInfo,
vllm_extra_kwargs=None,
hf_model_callback=None,
atol=MTEB_RERANK_TOL):
atol=MTEB_EMBED_TOL):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")
# Test embed_dims, isnan and whether to use normalize
example_prompts = ["The chef prepared a delicious meal." * 1000]
# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
......@@ -183,8 +190,12 @@ def mteb_test_embed_models(hf_runner,
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert model_info.architecture in model_config.architectures
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
......@@ -192,22 +203,46 @@ def mteb_test_embed_models(hf_runner,
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name,
is_sentence_transformer=True,
dtype="float32") as hf_model:
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Test embed_dims, isnan and whether to use normalize
vllm_outputs = vllm_model.embed(example_prompts,
truncate_prompt_tokens=-1)
assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
with hf_runner(model_info.name,
is_sentence_transformer=True,
dtype=model_info.hf_dtype) as hf_model:
# e.g. setting default parameters for the encode method of hf_runner
if hf_model_callback is not None:
hf_model_callback(hf_model)
st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
st_dtype = next(hf_model.model.parameters()).dtype
# Test embed_dims and whether to use normalize
hf_outputs = hf_model.encode(example_prompts)
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
tol=1e-2,
)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder, tasks, languages):
......@@ -243,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
return main_score
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
def mteb_test_rerank_models_hf(hf_runner,
model_name,
hf_dtype="float32",
hf_model_callback=None):
with hf_runner(model_name, is_cross_encoder=True,
dtype="float32") as hf_model:
dtype=hf_dtype) as hf_model:
original_predict = hf_model.predict
......@@ -279,14 +317,16 @@ def mteb_test_rerank_models(hf_runner,
hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder,
atol=MTEB_RERANK_TOL):
# A model family has many models with the same architecture,
# and we don't need to test each one.
if not model_info.enable_test:
# A model family has many models with the same architecture,
# and we don't need to test each one.
pytest.skip("Skipping test.")
# Allow vllm to test using the given dtype, such as float32
vllm_extra_kwargs = vllm_extra_kwargs or {}
vllm_extra_kwargs["dtype"] = model_info.dtype
# Allow vllm to test using hf_overrides
if model_info.hf_overrides is not None:
vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides
......@@ -299,9 +339,15 @@ def mteb_test_rerank_models(hf_runner,
model_config = vllm_model.llm.llm_engine.model_config
# Confirm whether vllm is using the correct architecture
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
# Score API is only enabled for num_labels == 1
assert model_config.hf_config.num_labels == 1
# Confirm whether vllm uses the correct default_pooling_type, which
# relates to whether chunked prefill and prefix caching are enabled
assert (model_config._model_info.default_pooling_type ==
model_info.default_pooling_type)
......@@ -310,12 +356,20 @@ def mteb_test_rerank_models(hf_runner,
languages=MTEB_RERANK_LANGS)
vllm_dtype = model_config.dtype
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, hf_model_callback)
# Accelerate mteb test by setting
# SentenceTransformers mteb score to a constant
if model_info.mteb_score is None:
st_main_score, st_dtype = mteb_test_rerank_models_hf(
hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback)
else:
st_main_score = model_info.mteb_score
st_dtype = "Constant"
print("Model:", model_info.name)
print("VLLM:", vllm_dtype, vllm_main_score)
print("SentenceTransformers:", st_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score)
assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
# We are not concerned that the vllm mteb results are better
# than SentenceTransformers, so we only perform one-sided testing.
assert st_main_score - vllm_main_score < atol
......@@ -2,16 +2,19 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
EmbedModelInfo, LASTPoolingEmbedModelInfo,
RerankModelInfo)
from .embed_utils import correctness_test_embed_models
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
enable_test=True),
CLSPoolingEmbedModelInfo("BAAI/bge-base-zh",
architecture="BertModel",
......@@ -52,10 +55,12 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
enable_test=True),
########## Qwen2Model
LASTPoolingEmbedModelInfo("BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
enable_test=True),
]
......@@ -65,6 +70,7 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
enable_test=True),
CLSPoolingRerankModelInfo(
"BAAI/bge-reranker-large",
......
......@@ -7,13 +7,14 @@ import pytest
import torch
from tests.conftest import HfRunner
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
from .mteb_utils import VllmMtebEncoder, mteb_test_rerank_models
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebEncoder, mteb_test_rerank_models)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [
LASTPoolingRerankModelInfo("BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures":
["GemmaForSequenceClassification"],
......@@ -104,7 +105,6 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.prompt = PROMPT
self.query_template = "A: {query}\n"
self.document_template = "B: {doc}\n{prompt}"
......@@ -119,7 +119,7 @@ class GemmaMtebEncoder(VllmMtebEncoder):
_sentences = []
for query, corpus, prompt in sentences:
query = self.query_template.format(query=query)
corpus = self.document_template.format(doc=corpus, prompt=prompt)
corpus = self.document_template.format(doc=corpus, prompt=PROMPT)
_sentences.append((query, corpus, prompt))
return super().predict(_sentences, *args, **kwargs)
......
......@@ -2,14 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import (CLSPoolingRerankModelInfo, LASTPoolingRerankModelInfo,
RerankModelInfo)
from tests.models.utils import (CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo, RerankModelInfo)
from .mteb_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo("cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification"),
LASTPoolingRerankModelInfo("tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification")
]
......
......@@ -3,15 +3,18 @@
import pytest
from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
EmbedModelInfo, LASTPoolingEmbedModelInfo,
RerankModelInfo, check_transformers_version)
from .embed_utils import correctness_test_embed_models
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo, RerankModelInfo)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
enable_test=True),
CLSPoolingEmbedModelInfo("thenlper/gte-base",
......@@ -30,28 +33,37 @@ MODELS = [
architecture="BertModel",
enable_test=False),
########### NewModel
# These three architectures are almost the same, but not exactly the same.
# For example,
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
enable_test=False),
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=True),
enable_test=False),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
enable_test=True),
########## ModernBertModel
CLSPoolingEmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
enable_test=True),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo("Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=True),
......@@ -65,10 +77,12 @@ RERANK_MODELS = [
CLSPoolingRerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
enable_test=True),
CLSPoolingRerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
enable_test=True),
......@@ -78,10 +92,6 @@ RERANK_MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
......@@ -89,10 +99,6 @@ def test_embed_models_mteb(hf_runner, vllm_runner,
def test_embed_models_correctness(hf_runner, vllm_runner,
model_info: EmbedModelInfo,
example_prompts) -> None:
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
check_transformers_version(model_info.name,
max_transformers_version="4.53.2")
correctness_test_embed_models(hf_runner, vllm_runner, model_info,
example_prompts)
......
......@@ -2,14 +2,17 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .embed_utils import correctness_test_embed_models
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo("intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/e5-base",
architecture="BertModel",
......@@ -23,6 +26,7 @@ MODELS = [
########## XLMRobertaModel
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
enable_test=True),
CLSPoolingEmbedModelInfo("intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
......@@ -36,7 +40,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
......
......@@ -4,16 +4,18 @@ from functools import partial
import pytest
from tests.models.language.pooling.embed_utils import (
check_embeddings_close, correctness_test_embed_models, matryoshka_fy)
from tests.models.utils import (CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo, EmbedModelInfo,
RerankModelInfo)
from vllm import PoolingParams
from ...utils import (CLSPoolingEmbedModelInfo, CLSPoolingRerankModelInfo,
EmbedModelInfo, RerankModelInfo)
from .embed_utils import (check_embeddings_close,
correctness_test_embed_models, matryoshka_fy)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo("jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True)
]
......@@ -21,6 +23,7 @@ EMBEDDING_MODELS = [
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification")
]
......
......@@ -6,8 +6,8 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
mxbai_rerank_hf_overrides = {
......@@ -20,6 +20,7 @@ RERANK_MODELS = [
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
enable_test=True),
LASTPoolingRerankModelInfo("mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
......
......@@ -3,13 +3,16 @@
import pytest
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .embed_utils import correctness_test_embed_models
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
......@@ -19,6 +22,7 @@ MODELS = [
enable_test=False),
CLSPoolingEmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True)
]
......
......@@ -6,9 +6,9 @@ import pytest
import torch
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.utils import multi_gpu_test
from ...utils import LASTPoolingRerankModelInfo, RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
......@@ -20,6 +20,7 @@ qwen3_reranker_hf_overrides = {
RERANK_MODELS = [
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=True),
LASTPoolingRerankModelInfo("Qwen/Qwen3-Reranker-4B",
......
......@@ -3,14 +3,17 @@
import pytest
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .embed_utils import correctness_test_embed_models
from tests.models.language.pooling.embed_utils import (
correctness_test_embed_models)
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
......@@ -23,6 +26,7 @@ MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
......@@ -31,14 +35,17 @@ MODELS = [
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
enable_test=True),
CLSPoolingEmbedModelInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
enable_test=True),
]
......@@ -46,7 +53,7 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None:
mteb_test_embed_models(hf_runner, vllm_runner, model_info, atol=0.02)
mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", MODELS)
......
......@@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from ...utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import (CLSPoolingEmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo)
from .mteb_utils import mteb_test_embed_models
# ST models with projector (Dense) layers
......@@ -10,8 +12,13 @@ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo(
"TencentBAC/Conan-embedding-v1",
architecture="BertModel",
mteb_score=0.688611955,
enable_test=True,
),
LASTPoolingEmbedModelInfo("google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
enable_test=True)
]
......
......@@ -29,10 +29,10 @@ MISTRAL_SMALL_3_1_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODELS = [PIXTRAL_ID, MISTRAL_SMALL_3_1_ID]
IMG_URLS = [
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
"https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
# Bare file names; the trailing comment on each records the remote URL of
# that same image.
"237-400x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/237-400x300.jpg",
"231-200x300.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/231-200x300.jpg",
"27-500x500.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/27-500x500.jpg",
"17-150x600.jpg", # "https://huggingface.co/datasets/Isotr0py/mistral-test-images/resolve/main/17-150x600.jpg",
]
PROMPT = "Describe each image in one short sentence."
......@@ -105,12 +105,6 @@ def _create_engine_inputs_hf(urls: list[str]) -> TextPrompt:
return engine_inputs
MSGS = [
_create_msg_format(IMG_URLS[:1]),
_create_msg_format(IMG_URLS[:2]),
_create_msg_format(IMG_URLS),
]
SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
LIMIT_MM_PER_PROMPT = dict(image=4)
......@@ -156,12 +150,8 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(
vllm_runner,
max_model_len: int,
model: str,
dtype: str,
) -> None:
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
local_asset_server) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
......@@ -174,7 +164,14 @@ def test_chat(
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
for msg in MSGS:
urls_all = [local_asset_server.url_for(u) for u in IMG_URLS]
msgs = [
_create_msg_format(urls_all[:1]),
_create_msg_format(urls_all[:2]),
_create_msg_format(urls_all),
]
for msg in msgs:
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
......@@ -190,17 +187,19 @@ def test_chat(
name_1="output")
@pytest.mark.parametrize("prompt,expected_ranges",
[(_create_engine_inputs_hf(IMG_URLS[:1]),
[PlaceholderRange(offset=11, length=494)]),
(_create_engine_inputs_hf(IMG_URLS[1:4]), [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])])
def test_multi_modal_placeholders(vllm_runner, prompt: TextPrompt,
@pytest.mark.parametrize(
"image_urls,expected_ranges",
[(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
(IMG_URLS[1:4], [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])])
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
expected_ranges: list[PlaceholderRange],
monkeypatch) -> None:
local_asset_server, monkeypatch) -> None:
local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
prompt = _create_engine_inputs_hf(local_image_urls)
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
......
......@@ -154,7 +154,7 @@ def batch_make_image_embeddings(
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
# ensure we don't lost any images or embeddings
# ensure we don't lose any images or embeddings
assert embed_counter == image_embeds.size(0)
assert image_counter == image_grid_thw.size(0)
assert len(image_batches) == len(result)
......@@ -238,7 +238,7 @@ def batch_make_video_embeddings(
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
# ensure we don't lost any videos or embeddings
# ensure we don't lose any videos or embeddings
assert embed_counter == video_embeds.size(0)
assert video_counter == video_grid_thw.size(0)
assert len(video_batches) == len(result)
......
......@@ -122,8 +122,7 @@ def run_test(
@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@create_new_process_for_each_test()
def test_models(vllm_runner, model) -> None:
run_test(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment