Commit 71d0ae1c (unverified), authored by Roman Solomatin, committed by GitHub
Browse files

[Misc] Update embedding/cross encoder tests to use `mteb` v2 (#27329)


Signed-off-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: wang.yuqi <yuqi.wang@daocloud.io>
parent 3d4e7d34
...@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test ...@@ -36,7 +36,7 @@ opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
# TODO: Use lm-eval[api]==0.4.10 once released # TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.1 transformers==4.57.1
tokenizers==0.22.0 tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
......
...@@ -201,8 +201,6 @@ email-validator==2.2.0 ...@@ -201,8 +201,6 @@ email-validator==2.2.0
# via pydantic # via pydantic
encodec==0.1.1 encodec==0.1.1
# via vocos # via vocos
eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3 evaluate==0.4.3
# via lm-eval # via lm-eval
fastapi==0.116.1 fastapi==0.116.1
...@@ -490,7 +488,7 @@ msgpack==1.1.0 ...@@ -490,7 +488,7 @@ msgpack==1.1.0
# via # via
# librosa # librosa
# ray # ray
mteb==1.38.11 mteb==2.1.2
# via -r requirements/test.in # via -r requirements/test.in
multidict==6.1.0 multidict==6.1.0
# via # via
......
...@@ -2,12 +2,14 @@ ...@@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile import tempfile
from collections.abc import Sequence
import mteb import mteb
import numpy as np import numpy as np
import requests import requests
import torch import torch
from mteb.models import ModelMeta
from mteb.types import Array
from torch.utils.data import DataLoader
import tests.ci_envs as ci_envs import tests.ci_envs as ci_envs
from tests.models.utils import ( from tests.models.utils import (
...@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4 ...@@ -27,24 +29,47 @@ MTEB_EMBED_TOL = 1e-4
# See #19344 # See #19344
MTEB_RERANK_TASKS = ["NFCorpus"] MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"] MTEB_RERANK_LANGS = ["eng"]
MTEB_RERANK_TOL = 2e-3 MTEB_RERANK_TOL = 2e-3
_empty_model_meta = ModelMeta(
loader=None,
name="vllm/model",
revision="1",
release_date=None,
languages=None,
framework=[],
similarity_fn_name=None,
n_parameters=None,
memory_usage_mb=None,
max_tokens=None,
embed_dim=None,
license=None,
open_weights=None,
public_training_code=None,
public_training_data=None,
use_instructions=None,
training_datasets=None,
modalities=["text"], # 'image' can be added to evaluate multimodal models
)
class VllmMtebEncoder(mteb.EncoderProtocol):
mteb_model_meta = _empty_model_meta
class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model): def __init__(self, vllm_model):
super().__init__()
self.llm = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def encode( def encode(
self, self,
sentences: Sequence[str], inputs: DataLoader[mteb.types.BatchedInput],
*args, *args,
**kwargs, **kwargs,
) -> np.ndarray: ) -> np.ndarray:
# Hoping to discover potential scheduling # Hoping to discover potential scheduling
# issues by randomizing the order. # issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences)) r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r] sentences = [sentences[i] for i in r]
outputs = self.llm.embed(sentences, use_tqdm=False) outputs = self.llm.embed(sentences, use_tqdm=False)
...@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -52,36 +77,70 @@ class VllmMtebEncoder(mteb.Encoder):
embeds = embeds[np.argsort(r)] embeds = embeds[np.argsort(r)]
return embeds return embeds
def similarity(
self,
embeddings1: np.ndarray,
embeddings2: np.ndarray,
) -> np.ndarray:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.dot(embeddings1, embeddings2.T) / (norm1 * norm2.T)
return sim
def similarity_pairwise(
self,
embeddings1: Array,
embeddings2: Array,
) -> Array:
# Cosine similarity
norm1 = np.linalg.norm(embeddings1, axis=1, keepdims=True)
norm2 = np.linalg.norm(embeddings2, axis=1, keepdims=True)
sim = np.sum(embeddings1 * embeddings2, axis=1) / (
norm1.flatten() * norm2.flatten()
)
return sim
class VllmMtebCrossEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, vllm_model):
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def predict( def predict(
self, self,
sentences: list[tuple[str, str, str | None]], # query, corpus, prompt inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args, *args,
**kwargs, **kwargs,
) -> np.ndarray: ) -> np.ndarray:
r = self.rng.permutation(len(sentences)) queries = [text for batch in inputs1 for text in batch["text"]]
sentences = [sentences[i] for i in r] corpus = [text for batch in inputs2 for text in batch["text"]]
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.llm.score( outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
) )
scores = np.array(outputs) scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores return scores
class OpenAIClientMtebEncoder(mteb.Encoder): class OpenAIClientMtebEncoder(VllmMtebEncoder):
def __init__(self, model_name: str, client): def __init__(self, model_name: str, client):
super().__init__()
self.model_name = model_name self.model_name = model_name
self.client = client self.client = client
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray: def encode(
self,
inputs: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
# Hoping to discover potential scheduling # Hoping to discover potential scheduling
# issues by randomizing the order. # issues by randomizing the order.
sentences = [text for batch in inputs for text in batch["text"]]
r = self.rng.permutation(len(sentences)) r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r] sentences = [sentences[i] for i in r]
...@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder): ...@@ -94,28 +153,29 @@ class OpenAIClientMtebEncoder(mteb.Encoder):
return embeds return embeds
class ScoreClientMtebEncoder(mteb.Encoder): class ScoreClientMtebEncoder(mteb.CrossEncoderProtocol):
mteb_model_meta = _empty_model_meta
def __init__(self, model_name: str, url): def __init__(self, model_name: str, url):
super().__init__()
self.model_name = model_name self.model_name = model_name
self.url = url self.url = url
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def predict( def predict(
self, self,
sentences: list[tuple[str, str, str | None]], # query, corpus, prompt inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args, *args,
**kwargs, **kwargs,
) -> np.ndarray: ) -> np.ndarray:
r = self.rng.permutation(len(sentences)) queries = [text for batch in inputs1 for text in batch["text"]]
sentences = [sentences[i] for i in r] full_corpus = [text for batch in inputs2 for text in batch["text"]]
outputs = [] outputs = []
for query, corpus, prompt in sentences: for query, corpus in zip(queries, full_corpus):
outputs.append(self.get_score(query, corpus)) outputs.append(self.get_score(query, corpus))
scores = np.array(outputs) scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores return scores
def get_score(self, query, corpus): def get_score(self, query, corpus):
...@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder): ...@@ -145,16 +205,13 @@ class RerankClientMtebEncoder(ScoreClientMtebEncoder):
return response["results"][0]["relevance_score"] return response["results"][0]["relevance_score"]
def run_mteb_embed_task(encoder, tasks): def run_mteb_embed_task(encoder: mteb.EncoderProtocol, tasks):
tasks = mteb.get_tasks(tasks=tasks) tasks = mteb.get_tasks(tasks=tasks)
evaluation = mteb.MTEB(tasks=tasks) results = mteb.evaluate(
results = evaluation.run(
encoder, encoder,
verbosity=0, tasks,
output_folder=None, cache=None,
encode_kwargs={ show_progress_bar=False,
"show_progress_bar": False,
},
) )
main_score = results[0].scores["test"][0]["main_score"] main_score = results[0].scores["test"][0]["main_score"]
...@@ -244,33 +301,39 @@ def mteb_test_embed_models( ...@@ -244,33 +301,39 @@ def mteb_test_embed_models(
assert st_main_score - vllm_main_score < atol assert st_main_score - vllm_main_score < atol
def run_mteb_rerank(cross_encoder, tasks, languages): def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages):
with tempfile.TemporaryDirectory() as results_folder: with tempfile.TemporaryDirectory() as prediction_folder:
bm25s = mteb.get_model("bm25s") bm25s = mteb.get_model("bm25s")
tasks = mteb.get_tasks(tasks=tasks, languages=languages)
subset = "default"
eval_splits = ["test"] eval_splits = ["test"]
evaluation = mteb.MTEB(tasks=tasks) mteb_tasks: list[mteb.abstasks.AbsTaskRetrieval] = mteb.get_tasks(
evaluation.run( tasks=tasks, languages=languages, eval_splits=eval_splits
)
mteb.evaluate(
bm25s, bm25s,
verbosity=0, mteb_tasks,
eval_splits=eval_splits, prediction_folder=prediction_folder,
save_predictions=True, show_progress_bar=False,
output_folder=f"{results_folder}/stage1", # don't save results for test runs
encode_kwargs={"show_progress_bar": False}, cache=None,
overwrite_strategy="always",
) )
results = evaluation.run( second_stage_tasks = []
cross_encoder, for task in mteb_tasks:
verbosity=0, second_stage_tasks.append(
eval_splits=eval_splits, task.convert_to_reranking(
prediction_folder,
top_k=10, top_k=10,
save_predictions=True, )
output_folder=f"{results_folder}/stage2", )
previous_results=f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
encode_kwargs={"show_progress_bar": False}, results = mteb.evaluate(
cross_encoder,
second_stage_tasks,
show_progress_bar=False,
cache=None,
) )
main_score = results[0].scores["test"][0]["main_score"] main_score = results[0].scores["test"][0]["main_score"]
return main_score return main_score
...@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf( ...@@ -280,20 +343,6 @@ def mteb_test_rerank_models_hf(
hf_runner, model_name, hf_dtype="float32", hf_model_callback=None hf_runner, model_name, hf_dtype="float32", hf_model_callback=None
): ):
with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model:
original_predict = hf_model.predict
def _predict(
sentences: list[tuple[str, str, str | None]], # query, corpus, prompt
*args,
**kwargs,
):
# vllm and st both remove the prompt, fair comparison.
prompts = [(s[0], s[1]) for s in sentences]
return original_predict(prompts, *args, **kwargs, batch_size=8)
hf_model.predict = _predict
hf_model.original_predict = original_predict
if hf_model_callback is not None: if hf_model_callback is not None:
hf_model_callback(hf_model) hf_model_callback(hf_model)
...@@ -310,7 +359,7 @@ def mteb_test_rerank_models( ...@@ -310,7 +359,7 @@ def mteb_test_rerank_models(
model_info: RerankModelInfo, model_info: RerankModelInfo,
vllm_extra_kwargs=None, vllm_extra_kwargs=None,
hf_model_callback=None, hf_model_callback=None,
vllm_mteb_encoder=VllmMtebEncoder, vllm_mteb_encoder=VllmMtebCrossEncoder,
atol=MTEB_RERANK_TOL, atol=MTEB_RERANK_TOL,
): ):
vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs)
......
...@@ -2,13 +2,15 @@ ...@@ -2,13 +2,15 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any from typing import Any
import mteb
import numpy as np import numpy as np
import pytest import pytest
import torch import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import ( from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebEncoder, VllmMtebCrossEncoder,
mteb_test_rerank_models, mteb_test_rerank_models,
) )
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
...@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner): ...@@ -103,7 +105,7 @@ class GemmaRerankerHfRunner(HfRunner):
return torch.Tensor(scores) return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebEncoder): class GemmaMtebEncoder(VllmMtebCrossEncoder):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n" self.query_template = "A: {query}\n"
...@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder): ...@@ -111,17 +113,26 @@ class GemmaMtebEncoder(VllmMtebEncoder):
def predict( def predict(
self, self,
sentences: list[tuple[str, str, str | None]], # query, corpus, prompt inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args, *args,
**kwargs, **kwargs,
) -> np.ndarray: ) -> np.ndarray:
_sentences = [] queries = [
for query, corpus, prompt in sentences: self.query_template.format(query=text)
query = self.query_template.format(query=query) for batch in inputs1
corpus = self.document_template.format(doc=corpus, prompt=PROMPT) for text in batch["text"]
_sentences.append((query, corpus, prompt)) ]
corpus = [
return super().predict(_sentences, *args, **kwargs) self.document_template.format(doc=text, prompt=PROMPT)
for batch in inputs2
for text in batch["text"]
]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
@pytest.mark.parametrize("model_info", RERANK_MODELS) @pytest.mark.parametrize("model_info", RERANK_MODELS)
......
...@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner): ...@@ -70,8 +70,9 @@ class MxbaiRerankerHfRunner(HfRunner):
return scores return scores
scores = [] scores = []
for prompt in prompts: for query, doc, *_ in prompts:
inputs = process_inputs([prompt]) pairs = [(query, doc)]
inputs = process_inputs(pairs)
score = compute_logits(inputs) score = compute_logits(inputs)
scores.append(score[0].item()) scores.append(score[0].item())
return torch.Tensor(scores) return torch.Tensor(scores)
......
...@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner): ...@@ -72,8 +72,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return scores return scores
scores = [] scores = []
for prompt in prompts: for query, doc, *_ in prompts:
inputs = process_inputs([prompt]) pairs = [(query, doc)]
inputs = process_inputs(pairs)
score = compute_logits(inputs) score = compute_logits(inputs)
scores.append(score[0].item()) scores.append(score[0].item())
return torch.Tensor(scores) return torch.Tensor(scores)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment