mteb_utils.py 10.2 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4

import tempfile
5
from collections.abc import Sequence
6
from typing import Optional
7
8
9
10

import mteb
import numpy as np
import pytest
11
import requests
12

13
from tests.models.utils import EmbedModelInfo, RerankModelInfo
14

15
# Most embedding models on the STS12 task (See #17175):
16
17
18
19
20
21
22
# - Model implementation and minor changes in tensor dtype
#   results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4

23
24
25
26
27
# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 1e-3

28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

class VllmMtebEncoder(mteb.Encoder):

    def __init__(self, vllm_model):
        super().__init__()
        self.model = vllm_model
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
46
        outputs = self.model.embed(sentences, use_tqdm=False)
47
48
49
50
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        queries = [s[0] for s in sentences]
        corpus = [s[1] for s in sentences]

        outputs = self.model.score(queries,
                                   corpus,
                                   truncate_prompt_tokens=-1,
                                   use_tqdm=False)
        scores = np.array(outputs)
        scores = scores[np.argsort(r)]
        return scores

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

class OpenAIClientMtebEncoder(mteb.Encoder):

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        embeddings = self.client.embeddings.create(model=self.model_name,
                                                   input=sentences)
        outputs = [d.embedding for d in embeddings.data]
        embeds = np.array(outputs)
        embeds = embeds[np.argsort(r)]
        return embeds


95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class ScoreClientMtebEncoder(mteb.Encoder):

    def __init__(self, model_name: str, url):
        super().__init__()
        self.model_name = model_name
        self.url = url
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        outputs = []
        for query, corpus, prompt in sentences:
            outputs.append(self.get_score(query, corpus))

        scores = np.array(outputs)
        scores = scores[np.argsort(r)]
        return scores

    def get_score(self, query, corpus):
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "text_1": query,
                                     "text_2": corpus,
                                     "truncate_prompt_tokens": -1,
                                 }).json()
        return response['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):

    def get_score(self, query, corpus):
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "query": query,
                                     "documents": [corpus],
                                     "truncate_prompt_tokens": -1,
                                 }).json()
        return response['results'][0]["relevance_score"]


145
146
147
def run_mteb_embed_task(encoder, tasks):
    tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.MTEB(tasks=tasks)
148
149
150
151
152
153
154
155
    results = evaluation.run(
        encoder,
        verbosity=0,
        output_folder=None,
        encode_kwargs={
            "show_progress_bar": False,
        },
    )
156
157
158
159
160

    main_score = results[0].scores["test"][0]["main_score"]
    return main_score


161
162
163
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
164
165
                           vllm_extra_kwargs=None,
                           hf_model_callback=None):
166
167
168
169
170
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

171
    vllm_extra_kwargs = vllm_extra_kwargs or {}
172
    vllm_extra_kwargs["dtype"] = model_info.dtype
173

174
175
176
    with vllm_runner(model_info.name,
                     task="embed",
                     max_model_len=None,
177
                     **vllm_extra_kwargs) as vllm_model:
178
179
180
181
182
183
184
185
186

        if model_info.architecture:
            assert (model_info.architecture
                    in vllm_model.model.llm_engine.model_config.architectures)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

187
188
189
    with hf_runner(model_info.name,
                   is_sentence_transformer=True,
                   dtype="float32") as hf_model:
190
191
192
193

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

194
        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
195
        st_dtype = next(hf_model.model.parameters()).dtype
196

197
198
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
199
200
    print("Difference:", st_main_score - vllm_main_score)

201
    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236


def run_mteb_rerank(cross_encoder, tasks, languages):
    with tempfile.TemporaryDirectory() as results_folder:
        bm25s = mteb.get_model("bm25s")
        tasks = mteb.get_tasks(tasks=tasks, languages=languages)

        subset = "default"
        eval_splits = ["test"]

        evaluation = mteb.MTEB(tasks=tasks)
        evaluation.run(
            bm25s,
            verbosity=0,
            eval_splits=eval_splits,
            save_predictions=True,
            output_folder=f"{results_folder}/stage1",
            encode_kwargs={"show_progress_bar": False},
        )

        results = evaluation.run(
            cross_encoder,
            verbosity=0,
            eval_splits=eval_splits,
            top_k=10,
            save_predictions=True,
            output_folder=f"{results_folder}/stage2",
            previous_results=
            f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json",
            encode_kwargs={"show_progress_bar": False},
        )
        main_score = results[0].scores["test"][0]["main_score"]
    return main_score


237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
    with hf_runner(model_name, is_cross_encoder=True,
                   dtype="float32") as hf_model:

        original_predict = hf_model.predict

        def _predict(
            sentences: list[tuple[str, str,
                                  Optional[str]]],  # query, corpus, prompt
            *args,
            **kwargs,
        ):
            # vllm and st both remove the prompt, fair comparison.
            prompts = [(s[0], s[1]) for s in sentences]
            return original_predict(prompts, *args, **kwargs, batch_size=8)

        hf_model.predict = _predict
        hf_model.original_predict = original_predict

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_rerank(hf_model,
                                        tasks=MTEB_RERANK_TASKS,
                                        languages=MTEB_RERANK_LANGS)
        st_dtype = next(hf_model.model.model.parameters()).dtype
    return st_main_score, st_dtype


266
267
268
269
def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
270
271
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder):
272
273
274
275
276
277
278
279
280
281
282
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    vllm_extra_kwargs = vllm_extra_kwargs or {}
    vllm_extra_kwargs["dtype"] = model_info.dtype

    with vllm_runner(model_info.name,
                     task="score",
                     max_model_len=None,
283
                     max_num_seqs=8,
284
285
                     **vllm_extra_kwargs) as vllm_model:

286
287
        model_config = vllm_model.model.llm_engine.model_config

288
        if model_info.architecture:
289
290
            assert (model_info.architecture in model_config.architectures)
        assert model_config.hf_config.num_labels == 1
291

292
        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
293
294
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
295
        vllm_dtype = model_config.dtype
296

297
298
    st_main_score, st_dtype = mteb_test_rerank_models_hf(
        hf_runner, model_info.name, hf_model_callback)
299
300
301
302
303
304

    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)