mteb_utils.py 11.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4

import tempfile
5
from collections.abc import Sequence
6
from typing import Optional
7
8
9
10

import mteb
import numpy as np
import pytest
11
import requests
12

13
14
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
                                check_embeddings_close)
15

16
# Most embedding models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
#   result in differences of less than 1e-4
# - A different model results in differences of more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4

# See #19344
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 2e-3
28

29
30
31
32
33

class VllmMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that delegates to an in-process vLLM model.

    Both ``encode`` and ``predict`` shuffle their inputs with a fixed-seed
    generator before calling the model and restore the caller's order on
    the way out, so results line up with the inputs as given.
    """

    def __init__(self, vllm_model):
        """Store the vLLM model wrapper and create the seeded shuffler."""
        super().__init__()
        self.llm = vllm_model
        # Fixed seed keeps the shuffle order reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed ``sentences`` and return the embeddings in input order.

        Extra positional and keyword arguments are accepted and ignored.
        """
        # Hoping to discover potential scheduling
        # issues by randomizing the order.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]
        outputs = self.llm.embed(sentences, use_tqdm=False)
        embeds = np.array(outputs)
        # argsort of a permutation is its inverse: undo the shuffle.
        embeds = embeds[np.argsort(r)]
        return embeds

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score (query, corpus) pairs and return the scores in input order.

        Only the first two elements of each tuple are used; the prompt
        element is ignored, as are extra positional and keyword arguments.
        """
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        queries = [s[0] for s in sentences]
        corpus = [s[1] for s in sentences]

        outputs = self.llm.score(queries,
                                 corpus,
                                 truncate_prompt_tokens=-1,
                                 use_tqdm=False)
        scores = np.array(outputs)
        # argsort of a permutation is its inverse: undo the shuffle.
        scores = scores[np.argsort(r)]
        return scores

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that obtains embeddings through ``client``.

    ``client`` is expected to expose ``embeddings.create(model=, input=)``
    returning an object whose ``data`` items carry an ``embedding``.
    """

    def __init__(self, model_name: str, client):
        """Remember the model name and client; create the seeded shuffler."""
        super().__init__()
        self.model_name = model_name
        self.client = client
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` and return the embeddings in input order."""
        # Randomize the request order in the hope of surfacing
        # potential scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = np.array([item.embedding for item in response.data])
        # argsort of a permutation is its inverse: undo the shuffle.
        return vectors[np.argsort(order)]


96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class ScoreClientMtebEncoder(mteb.Encoder):
    """MTEB cross-encoder adapter that scores pairs via HTTP POSTs to ``url``.

    Subclasses may override ``get_score`` to talk to an endpoint with a
    different request/response shape.
    """

    def __init__(self, model_name: str, url):
        """Remember the model name and URL; create the seeded shuffler."""
        super().__init__()
        self.model_name = model_name
        self.url = url
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, corpus, prompt) triple, one request per triple.

        The triples are shuffled before the requests are issued and the
        scores are returned in the original input order. The prompt element
        is not sent to the server.
        """
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        outputs = [
            self.get_score(query, corpus)
            for query, corpus, _prompt in sentences
        ]

        scores = np.array(outputs)
        # argsort of a permutation is its inverse: undo the shuffle.
        scores = scores[np.argsort(r)]
        return scores

    def get_score(self, query, corpus):
        """POST one pair to ``self.url`` and return ``data[0]["score"]``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "text_1": query,
                                     "text_2": corpus,
                                     "truncate_prompt_tokens": -1,
                                 })
        # Fail on the HTTP error itself rather than on a confusing
        # KeyError while indexing an error payload.
        response.raise_for_status()
        return response.json()['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of the score client that sends rerank-style requests."""

    def get_score(self, query, corpus):
        """POST one query/document pair and return its relevance score.

        The document is sent as a single-element ``documents`` list and the
        score is read from ``results[0]["relevance_score"]``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "query": query,
                                     "documents": [corpus],
                                     "truncate_prompt_tokens": -1,
                                 })
        # Fail on the HTTP error itself rather than on a confusing
        # KeyError while indexing an error payload.
        response.raise_for_status()
        return response.json()['results'][0]["relevance_score"]


146
147
148
def run_mteb_embed_task(encoder, tasks):
    """Run the given MTEB tasks with ``encoder`` and return the main score.

    Args:
        encoder: Model object handed to ``mteb.MTEB.run``.
        tasks: Task names handed to ``mteb.get_tasks``.

    Returns:
        The ``main_score`` of the first "test" entry of the first result.
    """
    tasks = mteb.get_tasks(tasks=tasks)
    evaluation = mteb.MTEB(tasks=tasks)
    results = evaluation.run(
        encoder,
        verbosity=0,
        output_folder=None,
        encode_kwargs={
            "show_progress_bar": False,
        },
    )

    main_score = results[0].scores["test"][0]["main_score"]
    return main_score


162
163
164
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None,
                           atol=MTEB_EMBED_TOL):
    """Check that vLLM's MTEB embedding score is not worse than the reference.

    The reference score is ``model_info.mteb_score`` when it is set;
    otherwise it is computed by running the same tasks through the
    SentenceTransformers model from ``hf_runner``, in which case the raw
    embeddings of a sample prompt are also compared.

    Args:
        hf_runner: Context-manager factory for the reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model name, dtype, overrides and expectations.
        vllm_extra_kwargs: Extra keyword arguments for ``vllm_runner``.
            The mapping is copied, never modified.
        hf_model_callback: Optional callable applied to the reference model
            before it is evaluated.
        atol: Maximum allowed ``reference - vllm`` score difference.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    example_prompts = ["The chef prepared a delicious meal."]

    # Copy so the caller's dict is not mutated by the assignments below.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        if model_info.architecture:
            assert model_info.architecture in model_config.architectures
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = model_config.dtype
        vllm_outputs = vllm_model.embed(example_prompts)

    if model_info.mteb_score is None:
        with hf_runner(model_info.name,
                       is_sentence_transformer=True,
                       dtype="float32") as hf_model:

            if hf_model_callback is not None:
                hf_model_callback(hf_model)

            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
            st_dtype = next(hf_model.model.parameters()).dtype

            # Test embed_dims and whether to use normalize
            hf_outputs = hf_model.encode(example_prompts)
            check_embeddings_close(
                embeddings_0_lst=hf_outputs,
                embeddings_1_lst=vllm_outputs,
                name_0="hf",
                name_1="vllm",
                tol=1e-2,
            )
    else:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265


def run_mteb_rerank(cross_encoder, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return the main score.

    Stage 1 runs the ``bm25s`` model and saves its predictions; stage 2
    reranks the top 10 of those predictions with ``cross_encoder``. All
    intermediate files live in a temporary directory.

    Args:
        cross_encoder: Model object handed to the stage-2 ``run`` call.
        tasks: Task names handed to ``mteb.get_tasks``.
        languages: Languages handed to ``mteb.get_tasks``.

    Returns:
        The ``main_score`` of the first "test" entry of the first stage-2
        result.
    """
    with tempfile.TemporaryDirectory() as results_folder:
        bm25s = mteb.get_model("bm25s")
        tasks = mteb.get_tasks(tasks=tasks, languages=languages)

        subset = "default"
        eval_splits = ["test"]

        # ``previous_results`` is a single file, so it is derived from the
        # first resolved task instead of a hard-coded task name.
        stage1_task_name = tasks[0].metadata.name

        evaluation = mteb.MTEB(tasks=tasks)
        evaluation.run(
            bm25s,
            verbosity=0,
            eval_splits=eval_splits,
            save_predictions=True,
            output_folder=f"{results_folder}/stage1",
            encode_kwargs={"show_progress_bar": False},
        )

        results = evaluation.run(
            cross_encoder,
            verbosity=0,
            eval_splits=eval_splits,
            top_k=10,
            save_predictions=True,
            output_folder=f"{results_folder}/stage2",
            previous_results=(f"{results_folder}/stage1/"
                              f"{stage1_task_name}_{subset}_predictions.json"),
            encode_kwargs={"show_progress_bar": False},
        )
        main_score = results[0].scores["test"][0]["main_score"]
    return main_score


266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
    """Compute the reference rerank score with the cross-encoder model.

    The model's ``predict`` is wrapped so that it receives plain
    (query, corpus) pairs with a batch size of 8; the unwrapped method
    stays reachable as ``hf_model.original_predict``.

    Args:
        hf_runner: Context-manager factory for the reference model.
        model_name: Name of the model to load.
        hf_model_callback: Optional callable applied to the model after
            ``predict`` has been wrapped and before evaluation.

    Returns:
        A ``(main_score, dtype)`` tuple, where ``dtype`` is the dtype of the
        model's first parameter.
    """
    with hf_runner(model_name, is_cross_encoder=True,
                   dtype="float32") as hf_model:

        original_predict = hf_model.predict

        def _predict(
            sentences: list[tuple[str, str,
                                  Optional[str]]],  # query, corpus, prompt
            *args,
            **kwargs,
        ):
            # vllm and st both remove the prompt, fair comparison.
            prompts = [(s[0], s[1]) for s in sentences]
            # Always use batch_size=8. Overriding the entry (instead of
            # passing a second keyword) avoids a "multiple values" TypeError
            # when the caller supplies its own batch_size.
            kwargs["batch_size"] = 8
            return original_predict(prompts, *args, **kwargs)

        hf_model.predict = _predict
        hf_model.original_predict = original_predict

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_rerank(hf_model,
                                        tasks=MTEB_RERANK_TASKS,
                                        languages=MTEB_RERANK_LANGS)
        st_dtype = next(hf_model.model.model.parameters()).dtype
    return st_main_score, st_dtype


295
296
297
298
def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder,
                            atol=MTEB_RERANK_TOL):
    """Check that vLLM's MTEB rerank score is not worse than the reference.

    The reference score is ``model_info.mteb_score`` when it is set;
    otherwise it is computed with ``mteb_test_rerank_models_hf``.

    Args:
        hf_runner: Context-manager factory for the reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model name, dtype, overrides and expectations.
        vllm_extra_kwargs: Extra keyword arguments for ``vllm_runner``.
            The mapping is copied, never modified.
        hf_model_callback: Optional callable forwarded to the reference
            evaluation.
        vllm_mteb_encoder: Callable that wraps the vLLM model into the
            object handed to the MTEB evaluation.
        atol: Maximum allowed ``reference - vllm`` score difference.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Copy so the caller's dict is not mutated by the assignments below.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     max_num_seqs=8,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)
        assert model_config.hf_config.num_labels == 1
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype

    if model_info.mteb_score is None:
        st_main_score, st_dtype = mteb_test_rerank_models_hf(
            hf_runner, model_info.name, hf_model_callback)
    else:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol