mteb_utils.py 10.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4

import tempfile
5
from collections.abc import Sequence
6
from typing import Optional
7
8
9
10

import mteb
import numpy as np
import pytest
11
import requests
12

13
from tests.models.utils import EmbedModelInfo, RerankModelInfo
14

15
# Observations for most embedding models on the STS12 task (see #17175):
# - Model implementation details and minor changes in tensor dtype
#   result in score differences of less than 1e-4.
# - A different model results in score differences of more than 1e-3.
# 1e-4 is a good tolerance threshold.
# NOTE(review): MTEB_EMBED_TOL below is 0.02, not the 1e-4 recommended
# above -- confirm which value is intended and update accordingly.
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 0.02

# Rerank evaluation settings (see #19344).
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 2e-3
27

28
29
30
31
32

class VllmMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter backed by an in-process vLLM model.

    Inputs are shuffled with a fixed-seed generator before being handed
    to the model, and the outputs are put back into the caller's order,
    so row ``i`` of the result corresponds to ``sentences[i]``.
    """

    def __init__(self, vllm_model):
        super().__init__()
        self.llm = vllm_model
        # Fixed seed keeps the shuffling reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed ``sentences`` through ``self.llm.embed``.

        Extra positional and keyword arguments are accepted but ignored.
        """
        # Randomizing the submission order is meant to surface potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]
        raw = self.llm.embed(shuffled, use_tqdm=False)
        # argsort of a permutation is its inverse: it restores input order.
        restore = np.argsort(order)
        return np.array(raw)[restore]

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score (query, corpus) pairs through ``self.llm.score``.

        Only the first two elements of each tuple are used; the prompt
        element and any extra arguments are ignored.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        query_texts = [item[0] for item in shuffled]
        corpus_texts = [item[1] for item in shuffled]

        raw = self.llm.score(query_texts,
                             corpus_texts,
                             truncate_prompt_tokens=-1,
                             use_tqdm=False)
        # Undo the shuffle so scores line up with the original pairs.
        restore = np.argsort(order)
        return np.array(raw)[restore]

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that obtains embeddings from a client object.

    The client is expected to expose ``embeddings.create(model=, input=)``
    returning an object whose ``data`` items carry an ``embedding``.
    """

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        # Fixed seed keeps the shuffling reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` with one ``embeddings.create`` request.

        Extra positional and keyword arguments are accepted but ignored.
        """
        # Randomizing the submission order is meant to surface potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = [item.embedding for item in response.data]
        # argsort of a permutation is its inverse: it restores input order.
        restore = np.argsort(order)
        return np.array(vectors)[restore]


95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class ScoreClientMtebEncoder(mteb.Encoder):
    """MTEB cross-encoder adapter that scores pairs over HTTP.

    Each (query, corpus) pair is sent as its own POST request to ``url``;
    subclasses may override ``get_score`` to target a different payload
    and response layout.
    """

    def __init__(self, model_name: str, url):
        super().__init__()
        self.model_name = model_name
        self.url = url
        # Fixed seed keeps the shuffling reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Return one score per (query, corpus, prompt) tuple, in order.

        The prompt element is unpacked but not sent; extra arguments are
        ignored.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        # One request per pair, issued in the shuffled order.
        raw = [
            self.get_score(query, corpus)
            for query, corpus, _prompt in shuffled
        ]

        # Undo the shuffle so scores line up with the original tuples.
        restore = np.argsort(order)
        return np.array(raw)[restore]

    def get_score(self, query, corpus):
        """POST a single text pair and return the first entry's score."""
        payload = {
            "model": self.model_name,
            "text_1": query,
            "text_2": corpus,
            "truncate_prompt_tokens": -1,
        }
        reply = requests.post(self.url, json=payload).json()
        return reply['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of the score client that uses a query/documents payload."""

    def get_score(self, query, corpus):
        """POST one query with a single document; return its relevance."""
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": [corpus],
            "truncate_prompt_tokens": -1,
        }
        reply = requests.post(self.url, json=payload).json()
        return reply['results'][0]["relevance_score"]


145
146
147
def run_mteb_embed_task(encoder, tasks):
    """Run the named MTEB tasks with ``encoder`` and return the main score.

    The value returned is the ``main_score`` of the first ``"test"``
    entry of the first task result. Results are not written to disk
    (``output_folder=None``) and progress bars are disabled.
    """
    selected_tasks = mteb.get_tasks(tasks=tasks)
    runner = mteb.MTEB(tasks=selected_tasks)
    quiet_encode_kwargs = {"show_progress_bar": False}

    results = runner.run(encoder,
                         verbosity=0,
                         output_folder=None,
                         encode_kwargs=quiet_encode_kwargs)

    first_result = results[0]
    return first_result.scores["test"][0]["main_score"]


161
162
163
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None,
                           atol=MTEB_EMBED_TOL):
    """Check that vLLM and SentenceTransformers agree on MTEB embed tasks.

    Args:
        hf_runner: Context-manager factory for the reference model; it is
            invoked with ``is_sentence_transformer=True`` and float32.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Describes the model under test (name, dtype,
            architecture, pooling type, optional ``hf_overrides``).
        vllm_extra_kwargs: Optional extra keyword arguments forwarded to
            ``vllm_runner``. The mapping is copied, never mutated.
        hf_model_callback: Optional callable applied to the reference
            model before it is evaluated.
        atol: Absolute tolerance for the score comparison. Defaults to
            the embedding tolerance ``MTEB_EMBED_TOL`` (previously this
            mistakenly defaulted to the rerank tolerance).
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Copy so the overrides below never leak into the caller's dict.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Sanity-check that vLLM resolved the expected model class and
        # default pooling type before spending time on the evaluation.
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = model_config.dtype

    with hf_runner(model_info.name,
                   is_sentence_transformer=True,
                   dtype="float32") as hf_model:

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
        st_dtype = next(hf_model.model.parameters()).dtype

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245


def run_mteb_rerank(cross_encoder, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return the main score.

    Stage 1 evaluates the ``bm25s`` model and saves its predictions into
    a temporary directory. Stage 2 evaluates ``cross_encoder`` with
    ``top_k=10``, feeding it the stage-1 predictions file through
    ``previous_results``.

    Args:
        cross_encoder: Model object passed to the stage-2 evaluation.
        tasks: Task names handed to ``mteb.get_tasks``.
        languages: Languages handed to ``mteb.get_tasks``.

    Returns:
        The ``main_score`` of the first ``"test"`` entry of the first
        stage-2 task result.
    """
    with tempfile.TemporaryDirectory() as results_folder:
        bm25s = mteb.get_model("bm25s")
        tasks = mteb.get_tasks(tasks=tasks, languages=languages)

        subset = "default"
        eval_splits = ["test"]

        # Derive the predictions file name from the selected task rather
        # than hard-coding "NFCorpus", so other task lists work as well.
        # Only a single predictions file can be passed on, so the first
        # task's file is used.
        task_name = tasks[0].metadata.name
        stage1_folder = f"{results_folder}/stage1"
        stage1_predictions = (
            f"{stage1_folder}/{task_name}_{subset}_predictions.json")

        evaluation = mteb.MTEB(tasks=tasks)
        evaluation.run(
            bm25s,
            verbosity=0,
            eval_splits=eval_splits,
            save_predictions=True,
            output_folder=stage1_folder,
            encode_kwargs={"show_progress_bar": False},
        )

        results = evaluation.run(
            cross_encoder,
            verbosity=0,
            eval_splits=eval_splits,
            top_k=10,
            save_predictions=True,
            output_folder=f"{results_folder}/stage2",
            previous_results=stage1_predictions,
            encode_kwargs={"show_progress_bar": False},
        )
        main_score = results[0].scores["test"][0]["main_score"]
    return main_score


246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
    """Evaluate the reference cross-encoder on the MTEB rerank tasks.

    The model's ``predict`` is wrapped so that the prompt element of each
    input tuple is dropped and ``batch_size`` is pinned to 8; the
    unwrapped method stays reachable as ``hf_model.original_predict``.

    Args:
        hf_runner: Context-manager factory for the reference model; it is
            invoked with ``is_cross_encoder=True`` and float32.
        model_name: Name of the model to load.
        hf_model_callback: Optional callable applied to the wrapped model
            before it is evaluated.

    Returns:
        A ``(main_score, dtype)`` tuple, where ``dtype`` is the dtype of
        the model's first parameter.
    """
    with hf_runner(model_name, is_cross_encoder=True,
                   dtype="float32") as hf_model:

        original_predict = hf_model.predict

        def _predict(
            sentences: list[tuple[str, str,
                                  Optional[str]]],  # query, corpus, prompt
            *args,
            **kwargs,
        ):
            # vllm and st both remove the prompt, fair comparison.
            prompts = [(s[0], s[1]) for s in sentences]
            # Pin the batch size through kwargs: passing it as a second
            # explicit keyword would raise TypeError whenever the caller
            # already supplied ``batch_size``.
            kwargs["batch_size"] = 8
            return original_predict(prompts, *args, **kwargs)

        hf_model.predict = _predict
        hf_model.original_predict = original_predict

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_rerank(hf_model,
                                        tasks=MTEB_RERANK_TASKS,
                                        languages=MTEB_RERANK_LANGS)
        st_dtype = next(hf_model.model.model.parameters()).dtype
    return st_main_score, st_dtype


275
276
277
278
def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder,
                            atol=MTEB_RERANK_TOL):
    """Check that vLLM and SentenceTransformers agree on MTEB rerank tasks.

    Args:
        hf_runner: Context-manager factory for the reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Describes the model under test (name, dtype,
            architecture, pooling type, optional ``hf_overrides``).
        vllm_extra_kwargs: Optional extra keyword arguments forwarded to
            ``vllm_runner``. The mapping is copied, never mutated.
        hf_model_callback: Optional callable applied to the reference
            model before it is evaluated.
        vllm_mteb_encoder: Callable that wraps the vLLM model into the
            encoder object handed to the MTEB evaluation.
        atol: Absolute tolerance for the score comparison.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Copy so the overrides below never leak into the caller's dict.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     max_num_seqs=8,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Sanity-check the resolved model class, the single-label head
        # and the default pooling type before running the evaluation.
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)
        assert model_config.hf_config.num_labels == 1
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype

    st_main_score, st_dtype = mteb_test_rerank_models_hf(
        hf_runner, model_info.name, hf_model_callback)

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)