Commit 7e63ef82 authored by zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -4,90 +4,93 @@ import pytest ...@@ -4,90 +4,93 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en", "BAAI/bge-base-en",
architecture="BertModel", architecture="BertModel",
mteb_score=0.779336792, mteb_score=0.779336792,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
), EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
CLSPoolingEmbedModelInfo( EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
"BAAI/bge-small-en", architecture="BertModel", enable_test=False EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False "BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-m3", "BAAI/bge-m3",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.787343078, mteb_score=0.787343078,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen2Model ########## Qwen2Model
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"BAAI/bge-code-v1", "BAAI/bge-code-v1",
architecture="Qwen2Model", architecture="Qwen2Model",
mteb_score=0.75724465, mteb_score=0.75724465,
dtype="float32", seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
] ]
RERANK_MODELS = [ RERANK_MODELS = [
########## XLMRobertaForSequenceClassification ########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-base", "BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398, mteb_score=0.32398,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-large", "BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-m3", "BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
enable_test=False, enable_test=False,
...@@ -108,7 +111,5 @@ def test_embed_models_correctness( ...@@ -108,7 +111,5 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb( def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
hf_runner, vllm_runner, model_info: RerankModelInfo mteb_test_rerank_models(vllm_runner, model_info)
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
...@@ -9,40 +9,62 @@ import torch ...@@ -9,40 +9,62 @@ import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.utils import RerankModelInfo
VllmMtebCrossEncoder,
from .mteb_score_utils import (
MtebCrossEncoderMixin,
mteb_test_rerank_models, mteb_test_rerank_models,
) )
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"BAAI/bge-reranker-v2-gemma", "BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification", architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={ hf_overrides={
"architectures": ["GemmaForSequenceClassification"], "architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"], "classifier_from_token": ["Yes"],
"method": "no_post_processing", "method": "no_post_processing",
}, },
mteb_score=0.33757,
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="bge-reranker-v2-gemma.jinja",
), ),
] ]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501 PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner): class GemmaRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__( def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None: ) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes") self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
@torch.no_grad() @torch.no_grad
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
def get_inputs(pairs, tokenizer, prompt=None): def get_inputs(pairs, tokenizer, prompt=None):
if prompt is None: if prompt is None:
prompt = PROMPT prompt = PROMPT
...@@ -87,8 +109,8 @@ class GemmaRerankerHfRunner(HfRunner): ...@@ -87,8 +109,8 @@ class GemmaRerankerHfRunner(HfRunner):
) )
scores = [] scores = []
for query, doc, *_ in prompts: for query, document in zip(queries, corpus):
pairs = [(query, doc)] pairs = [(query, document)]
inputs = get_inputs(pairs, self.tokenizer) inputs = get_inputs(pairs, self.tokenizer)
inputs = inputs.to(self.model.device) inputs = inputs.to(self.model.device)
_n_tokens = inputs["input_ids"].shape[1] _n_tokens = inputs["input_ids"].shape[1]
...@@ -105,41 +127,10 @@ class GemmaRerankerHfRunner(HfRunner): ...@@ -105,41 +127,10 @@ class GemmaRerankerHfRunner(HfRunner):
return torch.Tensor(scores) return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebCrossEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
self.document_template = "B: {doc}\n{prompt}"
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [
self.query_template.format(query=text)
for batch in inputs1
for text in batch["text"]
]
corpus = [
self.document_template.format(doc=text, prompt=PROMPT)
for batch in inputs2
for text in batch["text"]
]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models( mteb_test_rerank_models(
GemmaRerankerHfRunner,
vllm_runner, vllm_runner,
model_info, model_info,
vllm_mteb_encoder=GemmaMtebEncoder, hf_runner=GemmaRerankerHfRunner,
) )
...@@ -3,29 +3,34 @@ ...@@ -3,29 +3,34 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2", "cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification", architecture="BertForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
mteb_score=0.32898,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls", "tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="qwen3_reranker.jinja",
mteb_score=0.33459,
), ),
] ]
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb( def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
hf_runner, vllm_runner, model_info: RerankModelInfo mteb_test_rerank_models(vllm_runner, model_info)
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
...@@ -5,36 +5,32 @@ import pytest ...@@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"thenlper/gte-large", "thenlper/gte-large",
mteb_score=0.76807651, mteb_score=0.76807651,
architecture="BertModel", architecture="BertModel",
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
"thenlper/gte-base", architecture="BertModel", enable_test=False EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False "thenlper/gte-large-zh", architecture="BertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False EmbedModelInfo(
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False "thenlper/gte-small-zh", architecture="BertModel", enable_test=False
), ),
########### NewModel ########### NewModel
...@@ -43,68 +39,90 @@ MODELS = [ ...@@ -43,68 +39,90 @@ MODELS = [
# - whether to use token_type_embeddings # - whether to use token_type_embeddings
# - whether to use context expansion # - whether to use context expansion
# So only test one (the most widely used) model # So only test one (the most widely used) model
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base", "Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel", architecture="GteNewModel",
mteb_score=0.775074696, mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5", "Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5", "Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel", architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]}, hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False, enable_test=False,
), ),
########### Qwen2ForCausalLM ########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct", "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872, mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM", architecture="Qwen2ForCausalLM",
seq_pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## ModernBertModel ########## ModernBertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base", "Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353, mteb_score=0.748193353,
architecture="ModernBertModel", architecture="ModernBertModel",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
########## Qwen3ForCausalLM ########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B", "Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695, mteb_score=0.771163695,
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32", seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B", "Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM", architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False, enable_test=False,
), ),
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
# classifier_pooling: mean # classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base", "Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386, mteb_score=0.33386,
architecture="ModernBertForSequenceClassification", architecture="ModernBertForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingRerankModelInfo( RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base", "Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062, mteb_score=0.33062,
architecture="GteNewForSequenceClassification", architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]}, hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]
...@@ -123,7 +141,5 @@ def test_embed_models_correctness( ...@@ -123,7 +141,5 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb( def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
hf_runner, vllm_runner, model_info: RerankModelInfo mteb_test_rerank_models(vllm_runner, model_info)
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
...@@ -3,40 +3,44 @@ ...@@ -3,40 +3,44 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
########## BertModel ########## BertModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/e5-small", "intfloat/e5-small",
architecture="BertModel", architecture="BertModel",
mteb_score=0.742285423, mteb_score=0.742285423,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
"intfloat/e5-base", architecture="BertModel", enable_test=False EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
), EmbedModelInfo(
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False "intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
), ),
########## XLMRobertaModel ########## XLMRobertaModel
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-base", "intfloat/multilingual-e5-base",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.779325955, mteb_score=0.779325955,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large", "intfloat/multilingual-e5-large",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct", "intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
enable_test=False, enable_test=False,
......
...@@ -10,30 +10,36 @@ from tests.models.language.pooling.embed_utils import ( ...@@ -10,30 +10,36 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy, matryoshka_fy,
) )
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo, EmbedModelInfo,
RerankModelInfo, RerankModelInfo,
) )
from vllm import PoolingParams from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [ EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"jinaai/jina-embeddings-v3", "jinaai/jina-embeddings-v3",
mteb_score=0.824413164, mteb_score=0.824413164,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
is_matryoshka=True, is_matryoshka=True,
dtype="float32", seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]
RERANK_MODELS = [ RERANK_MODELS = [
CLSPoolingRerankModelInfo( RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual", "jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643, mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification", architecture="XLMRobertaForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
) )
] ]
...@@ -65,10 +71,8 @@ def test_embed_models_correctness( ...@@ -65,10 +71,8 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb( def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
hf_runner, vllm_runner, model_info: RerankModelInfo mteb_test_rerank_models(vllm_runner, model_info)
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
......
...@@ -2,13 +2,16 @@ ...@@ -2,13 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any from typing import Any
import mteb
import numpy as np
import pytest import pytest
import torch import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
mxbai_rerank_hf_overrides = { mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"], "architectures": ["Qwen2ForSequenceClassification"],
...@@ -17,50 +20,73 @@ mxbai_rerank_hf_overrides = { ...@@ -17,50 +20,73 @@ mxbai_rerank_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2", "mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273, seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="mxbai_rerank_v2.jinja",
mteb_score=0.33651,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2", "mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification", architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides, hf_overrides=mxbai_rerank_hf_overrides,
chat_template_name="mxbai_rerank_v2.jinja",
enable_test=False, enable_test=False,
), ),
] ]
class MxbaiRerankerHfRunner(HfRunner): class MxbaiRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__( def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None: ) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1") self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0") self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: @torch.no_grad
def process_inputs(pairs): def predict(
inputs = self.tokenizer( self,
pairs, inputs1: DataLoader[mteb.types.BatchedInput],
padding=False, inputs2: DataLoader[mteb.types.BatchedInput],
truncation="longest_first", *args,
return_attention_mask=False, **kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
tokenizer = self.tokenizer
prompts = []
for query, document in zip(queries, corpus):
conversation = [
{"role": "query", "content": query},
{"role": "document", "content": document},
]
prompt = tokenizer.apply_chat_template(
conversation=conversation,
tools=None,
chat_template=self.chat_template,
tokenize=False,
) )
for i, ele in enumerate(inputs["input_ids"]): prompts.append(prompt)
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
def compute_logits(inputs): def compute_logits(inputs):
logits = self.model(**inputs).logits[:, -1, :] logits = self.model(**inputs).logits[:, -1, :]
yes_logits = logits[:, self.yes_loc] yes_logits = logits[:, self.yes_loc]
...@@ -70,9 +96,9 @@ class MxbaiRerankerHfRunner(HfRunner): ...@@ -70,9 +96,9 @@ class MxbaiRerankerHfRunner(HfRunner):
return scores return scores
scores = [] scores = []
for query, doc, *_ in prompts: for prompt in prompts:
pairs = [(query, doc)] inputs = tokenizer([prompt], return_tensors="pt")
inputs = process_inputs(pairs) inputs = self.wrap_device(inputs)
score = compute_logits(inputs) score = compute_logits(inputs)
scores.append(score[0].item()) scores.append(score[0].item())
return torch.Tensor(scores) return torch.Tensor(scores)
...@@ -80,4 +106,4 @@ class MxbaiRerankerHfRunner(HfRunner): ...@@ -80,4 +106,4 @@ class MxbaiRerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info) mteb_test_rerank_models(vllm_runner, model_info, hf_runner=MxbaiRerankerHfRunner)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
)
# Embedding checkpoints that the parametrized MTEB embedding test iterates over.
EMBEDDING_MODELS = [
    EmbedModelInfo(
        "nvidia/llama-nemotron-embed-1b-v2",
        architecture="LlamaBidirectionalModel",
        # Pooling / attention settings declared for this checkpoint.
        seq_pooling_type="MEAN",
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
        # NOTE(review): presumably the reference MTEB score the helper
        # compares against -- confirm in mteb_embed_utils.
        mteb_score=0.689164662128673,
    ),
]
# Reranker checkpoints that the parametrized MTEB rerank test iterates over.
RERANK_MODELS = [
    RerankModelInfo(
        "nvidia/llama-nemotron-rerank-1b-v2",
        architecture="LlamaBidirectionalForSequenceClassification",
        # Pooling / attention settings declared for this checkpoint.
        seq_pooling_type="MEAN",
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
        # Name of the chat template file used to format query/document pairs
        # (NOTE(review): lookup location is not visible here -- confirm).
        chat_template_name="nemotron-rerank.jinja",
        # NOTE(review): presumably the reference MTEB score the helper
        # compares against -- confirm in mteb_score_utils.
        mteb_score=0.33994,
    ),
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(
    hf_runner,
    vllm_runner,
    model_info: EmbedModelInfo,
) -> None:
    """Run the shared MTEB embedding check for each entry in EMBEDDING_MODELS."""
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
    vllm_runner,
    model_info: RerankModelInfo,
) -> None:
    """Run the shared MTEB rerank check for each entry in RERANK_MODELS."""
    mteb_test_rerank_models(vllm_runner, model_info)
...@@ -4,30 +4,38 @@ ...@@ -4,30 +4,38 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1", "nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.737568559, mteb_score=0.737568559,
enable_test=True, enable_test=True,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5", "nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel", architecture="NomicBertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False "nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe", "nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.715488912, mteb_score=0.715488912,
enable_test=True, enable_test=True,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
), ),
] ]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from typing import Any from typing import Any
import mteb
import numpy as np
import pytest import pytest
import torch import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
qwen3_reranker_hf_overrides = { qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"], "architectures": ["Qwen3ForSequenceClassification"],
...@@ -18,50 +22,74 @@ qwen3_reranker_hf_overrides = { ...@@ -18,50 +22,74 @@ qwen3_reranker_hf_overrides = {
} }
RERANK_MODELS = [ RERANK_MODELS = [
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B", "Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,
chat_template_name="qwen3_reranker.jinja",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
mteb_score=0.33459,
enable_test=True, enable_test=True,
), ),
LASTPoolingRerankModelInfo( RerankModelInfo(
"Qwen/Qwen3-Reranker-4B", "Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification", architecture="Qwen3ForSequenceClassification",
chat_template_name="qwen3_reranker.jinja",
hf_overrides=qwen3_reranker_hf_overrides, hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False, enable_test=False,
), ),
] ]
class Qwen3RerankerHfRunner(HfRunner): class Qwen3RerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__( def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None: ) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no") self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes") self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
self.max_length = 40960
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs): @torch.no_grad
inputs = self.tokenizer( def predict(
pairs, self,
padding=False, inputs1: DataLoader[mteb.types.BatchedInput],
truncation="longest_first", inputs2: DataLoader[mteb.types.BatchedInput],
return_attention_mask=False, *args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
tokenizer = self.tokenizer
prompts = []
for query, document in zip(queries, corpus):
conversation = [
{"role": "query", "content": query},
{"role": "document", "content": document},
]
prompt = tokenizer.apply_chat_template(
conversation=conversation,
tools=None,
chat_template=self.chat_template,
tokenize=False,
) )
for i, ele in enumerate(inputs["input_ids"]): prompts.append(prompt)
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
def compute_logits(inputs): def compute_logits(inputs):
batch_scores = self.model(**inputs).logits[:, -1, :] batch_scores = self.model(**inputs).logits[:, -1, :]
true_vector = batch_scores[:, self.token_true_id] true_vector = batch_scores[:, self.token_true_id]
...@@ -72,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner): ...@@ -72,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return scores return scores
scores = [] scores = []
for query, doc, *_ in prompts: for prompt in prompts:
pairs = [(query, doc)] inputs = tokenizer([prompt], return_tensors="pt")
inputs = process_inputs(pairs) inputs = self.wrap_device(inputs)
score = compute_logits(inputs) score = compute_logits(inputs)
scores.append(score[0].item()) scores.append(score[0].item())
return torch.Tensor(scores) return torch.Tensor(scores)
...@@ -82,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner): ...@@ -82,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info) mteb_test_rerank_models(vllm_runner, model_info, hf_runner=Qwen3RerankerHfRunner)
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
...@@ -95,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None ...@@ -95,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None
} }
mteb_test_rerank_models( mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs vllm_runner,
model_info,
vllm_extra_kwargs=vllm_extra_kwargs,
hf_runner=Qwen3RerankerHfRunner,
) )
...@@ -4,62 +4,82 @@ ...@@ -4,62 +4,82 @@
import pytest import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
MODELS = [ MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs", "Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
mteb_score=0.714927797, mteb_score=0.714927797,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s", "Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m", "Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long", "Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False, is_matryoshka=False,
architecture="NomicBertModel", architecture="NomicBertModel",
mteb_score=0.681146831, mteb_score=0.681146831,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l", "Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False, is_matryoshka=False,
architecture="BertModel", architecture="BertModel",
enable_test=False, enable_test=False,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5", "Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True, is_matryoshka=True,
architecture="BertModel", architecture="BertModel",
mteb_score=0.649088363, mteb_score=0.649088363,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0", "Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="XLMRobertaModel", architecture="XLMRobertaModel",
mteb_score=0.712258299, mteb_score=0.712258299,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0", "Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True, is_matryoshka=True,
architecture="GteModel", architecture="GteModel",
mteb_score=0.706622444, mteb_score=0.706622444,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
] ]
......
...@@ -3,27 +3,32 @@ ...@@ -3,27 +3,32 @@
import pytest import pytest
from tests.models.utils import ( from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo, EmbedModelInfo,
LASTPoolingEmbedModelInfo,
) )
from .mteb_utils import mteb_test_embed_models from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers # ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [ ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo( EmbedModelInfo(
"TencentBAC/Conan-embedding-v1", "TencentBAC/Conan-embedding-v1",
architecture="BertModel", architecture="BertModel",
mteb_score=0.688611955, mteb_score=0.688611955,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
), ),
LASTPoolingEmbedModelInfo( EmbedModelInfo(
"google/embeddinggemma-300m", "google/embeddinggemma-300m",
architecture="Gemma3TextModel", architecture="Gemma3TextModel",
mteb_score=0.7473819294684156, mteb_score=0.7473819294684156,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True, enable_test=True,
dtype="float32",
), ),
] ]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests.""" """Pytest configuration for vLLM multimodal tests."""
import warnings import warnings
...@@ -9,20 +9,17 @@ import torch ...@@ -9,20 +9,17 @@ import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
def pytest_configure(config): def pytest_collection_modifyitems(config, items):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF """Configure ROCm-specific settings based on collected tests."""
Transformers accuracy issues.
"""
if not current_platform.is_rocm(): if not current_platform.is_rocm():
return return
skip_patterns = ["test_granite_speech.py"] skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns): if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False) torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False) torch.backends.cuda.enable_mem_efficient_sdp(False)
......
...@@ -123,10 +123,6 @@ VLM_TEST_SETTINGS = { ...@@ -123,10 +123,6 @@ VLM_TEST_SETTINGS = {
), ),
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
],
), ),
"qwen2_5_vl": VLMTestInfo( "qwen2_5_vl": VLMTestInfo(
...@@ -176,6 +172,13 @@ VLM_TEST_SETTINGS = { ...@@ -176,6 +172,13 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
vllm_runner_kwargs={
"attention_config": {
"backend": "ROCM_AITER_FA",
},
}
if current_platform.is_rocm()
else None,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[ marks=[
pytest.mark.core_model, pytest.mark.core_model,
...@@ -256,8 +259,19 @@ VLM_TEST_SETTINGS = { ...@@ -256,8 +259,19 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25, 0.2, 0.15)], image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={ vllm_runner_kwargs={
"model_impl": "transformers", "model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
}
if current_platform.is_rocm()
else {}
),
}, },
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
...@@ -498,6 +512,7 @@ VLM_TEST_SETTINGS = { ...@@ -498,6 +512,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192, max_model_len=8192,
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
), ),
"intern_vl-hf": VLMTestInfo( "intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"], models=["OpenGVLab/InternVL3-1B-hf"],
...@@ -513,6 +528,34 @@ VLM_TEST_SETTINGS = { ...@@ -513,6 +528,34 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True, use_tokenizer_eos=True,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
), ),
"isaac": VLMTestInfo(
models=[
"PerceptronAI/Isaac-0.1",
"PerceptronAI/Isaac-0.2-2B-Preview",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: (
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
),
img_idx_to_prompt=lambda idx: "<image>",
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.",
}
),
multi_image_prompt=(
"Picture 1: <vlm_image>\n"
"Picture 2: <vlm_image>\n"
"Describe these two images with one paragraph respectively."
),
enforce_eager=False,
max_model_len=4096,
max_num_seqs=2,
hf_model_kwargs={"device_map": "auto"},
patch_hf_runner=model_utils.isaac_patch_hf_runner,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"kimi_vl": VLMTestInfo( "kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"], models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -648,7 +691,17 @@ VLM_TEST_SETTINGS = { ...@@ -648,7 +691,17 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minimax_vl_01_hf_output, hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner, patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)], marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
......
...@@ -37,10 +37,12 @@ audio_lora_path = MODEL_NAME ...@@ -37,10 +37,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME] models = [MODEL_NAME]
@pytest.fixture(autouse=True) @pytest.fixture
def set_attention_backend_for_rocm(monkeypatch): def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm(): if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") return {"backend": "ROCM_AITER_FA"}
return None
def run_test( def run_test(
...@@ -55,6 +57,7 @@ def run_test( ...@@ -55,6 +57,7 @@ def run_test(
num_logprobs: int, num_logprobs: int,
tensor_parallel_size: int, tensor_parallel_size: int,
distributed_executor_backend: str | None = None, distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
): ):
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
...@@ -82,6 +85,7 @@ def run_test( ...@@ -82,6 +85,7 @@ def run_test(
enable_lora=True, enable_lora=True,
max_lora_rank=64, max_lora_rank=64,
enforce_eager=True, enforce_eager=True,
attention_config=attention_config,
) as vllm_model: ) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path) lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [ vllm_outputs_per_case = [
...@@ -133,6 +137,7 @@ def test_models( ...@@ -133,6 +137,7 @@ def test_models(
vllm_runner, vllm_runner,
model: str, model: str,
audio_assets: AudioTestAssets, audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str, dtype: str,
max_model_len: int, max_model_len: int,
max_tokens: int, max_tokens: int,
...@@ -159,4 +164,5 @@ def test_models( ...@@ -159,4 +164,5 @@ def test_models(
max_tokens=max_tokens, max_tokens=max_tokens,
num_logprobs=num_logprobs, num_logprobs=num_logprobs,
tensor_parallel_size=1, tensor_parallel_size=1,
) attention_config=granite_speech_attention_config,
\ No newline at end of file )
...@@ -8,7 +8,7 @@ from PIL.Image import Image ...@@ -8,7 +8,7 @@ from PIL.Image import Image
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64 from vllm.multimodal.utils import encode_image_url
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview" MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
...@@ -31,10 +31,7 @@ def test_keye_vl( ...@@ -31,10 +31,7 @@ def test_keye_vl(
question: str, question: str,
): ):
images = [asset.pil_image for asset in image_assets] images = [asset.pil_image for asset in image_assets]
image_urls = [encode_image_url(image) for image in images]
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
]
engine_args = EngineArgs( engine_args = EngineArgs(
model=MODEL_NAME, model=MODEL_NAME,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModel
from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset
from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test
# Shared test image: the "paper-11" asset, requested with ext="png" and
# converted to RGB mode.
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
# Prompt string paired with every image in this test module.
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare greedy logprob outputs of vLLM and HF for every input case.

    Each element of ``inputs`` is a ``(prompts, images)`` pair. The vLLM
    runner is exercised first, then the HF runner, and finally the two
    result lists are compared case by case with ``check_logprobs_close``.

    Args:
        hf_runner: Context-manager factory for the HF reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        inputs: Sequence of ``(prompts, images)`` cases to run.
        model: Model identifier handed to both runners.
        dtype: Dtype string forwarded to both runners.
        max_tokens: Generation length forwarded to both runners.
        num_logprobs: Number of logprobs requested from both runners.
    """
    vllm_results = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        for case_prompts, case_images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )

    hf_results = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for case_prompts, case_images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    use_cache=False,  # HF Nemotron Parse crashes here without this
                )
            )

    # Pair up the per-case outputs; "hf" is side 0 and "vllm" is side 1.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Run the HF-vs-vLLM comparison on one case of ten identical requests."""
    # A single case made of ten copies of the same prompt and image.
    repeated_prompts = [PROMPT] * 10
    repeated_images = [IMAGE] * 10
    cases = [(repeated_prompts, repeated_images)]

    run_test(
        hf_runner,
        vllm_runner,
        inputs=cases,
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
...@@ -269,7 +269,7 @@ def run_embedding_input_test( ...@@ -269,7 +269,7 @@ def run_embedding_input_test(
"""Inference result should be the same between """Inference result should be the same between
original image/video input and image/video embeddings input. original image/video input and image/video embeddings input.
""" """
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(model) processor = AutoProcessor.from_pretrained(model)
......
...@@ -14,10 +14,10 @@ import pytest ...@@ -14,10 +14,10 @@ import pytest
from transformers import AutoProcessor from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.video import sample_frames_from_video from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ....utils import create_new_process_for_each_test from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides from ...utils import dummy_hf_overrides
...@@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config): ...@@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions.""" """Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR # Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign image = images[0] # Already filtered to stop_sign
image_url = encode_image_url(image)
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}] placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [ messages = [
...@@ -204,9 +203,7 @@ def build_processor_prompt(images, config): ...@@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
config["model_name"], trust_remote_code=True config["model_name"], trust_remote_code=True
) )
image_urls = [ image_urls = [encode_image_url(img) for img in images]
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = [{"type": "image", "image": url} for url in image_urls] placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [ messages = [
{ {
...@@ -225,9 +222,7 @@ def build_processor_prompt(images, config): ...@@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
def build_ovis_prompt(images, config): def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format.""" """Build Ovis2.5 specific prompt with custom format."""
image_urls = [ image_urls = [encode_image_url(img) for img in images]
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
placeholders = "\n".join( placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1) f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
......
...@@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets): ...@@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
assert len(chat_completion.choices) == 1 assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.message.content == "In the first audio clip, you hear a brief"
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment