mteb_utils.py 4.05 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
from collections.abc import Sequence

import mteb
import numpy as np
import pytest

from tests.models.utils import EmbedModelInfo
10
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

# Most models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
#   results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4


class VllmMtebEncoder(mteb.Encoder):

    def __init__(self, vllm_model):
        super().__init__()
        self.model = vllm_model
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
        outputs = self.model.encode(sentences, use_tqdm=False)
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds


class OpenAIClientMtebEncoder(mteb.Encoder):

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        embeddings = self.client.embeddings.create(model=self.model_name,
                                                   input=sentences)
        outputs = [d.embedding for d in embeddings.data]
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds


def run_mteb_embed_task(encoder, tasks):
    tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.MTEB(tasks=tasks)
    results = evaluation.run(encoder, verbosity=0, output_folder=None)

    main_score = results[0].scores["test"][0]["main_score"]
    return main_score


def run_mteb_embed_task_st(model_name, tasks):
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(model_name)
    return run_mteb_embed_task(model, tasks)


81
82
83
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
84
85
                           vllm_extra_kwargs=None,
                           hf_model_callback=None):
86
87
88
89
90
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

91
    vllm_extra_kwargs = vllm_extra_kwargs or {}
92
    vllm_extra_kwargs["dtype"] = model_info.dtype
93

94
95
96
    with vllm_runner(model_info.name,
                     task="embed",
                     max_model_len=None,
97
                     **vllm_extra_kwargs) as vllm_model:
98
99
100
101
102
103
104
105
106

        if model_info.architecture:
            assert (model_info.architecture
                    in vllm_model.model.llm_engine.model_config.architectures)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

107
    with set_default_torch_dtype(vllm_dtype) and hf_runner(
108
            model_info.name, is_sentence_transformer=True,
109
            dtype=vllm_dtype) as hf_model:
110
111
112
113

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

114
115
        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)

116
117
    print("VLLM:", vllm_main_score)
    print("SentenceTransformers:", st_main_score)
118
119
    print("Difference:", st_main_score - vllm_main_score)

120
    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)