"vscode:/vscode.git/clone" did not exist on "989f4f430cd74a14d539d8b59b9d239301f1bdcd"
mteb_utils.py 10.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4

import tempfile
5
from collections.abc import Sequence
6
from typing import Optional
7
8
9
10

import mteb
import numpy as np
import pytest
11
import requests
12

13
from tests.models.utils import EmbedModelInfo, RerankModelInfo
14

15
# Most embedding models on the STS12 task (See #17175):
16
17
18
19
20
# - Model implementation and minor changes in tensor dtype
#   result in differences of less than 1e-4
# - A different model results in differences of more than 1e-3
# 1e-4 is a good tolerance threshold
# MTEB task(s) evaluated when scoring embedding models.
MTEB_EMBED_TASKS = ["STS12"]
21
# Absolute tolerance intended (per its name) for embedding score comparisons.
# NOTE(review): 0.02 is far looser than the 1e-4 threshold recommended in the
# comment above -- confirm that this relaxation is intentional.
MTEB_EMBED_TOL = 0.02
22

23
24
25
# See #19344
# MTEB task(s) and language filter evaluated when scoring rerank models.
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
26
# Default absolute tolerance (`atol`) for the rerank main-score comparison.
MTEB_RERANK_TOL = 2e-3
27

28
29
30
31
32

class VllmMtebEncoder(mteb.Encoder):
    """Adapter that lets MTEB drive an in-process vLLM model wrapper.

    Each call shuffles its inputs with a fixed-seed generator before
    handing them to vLLM and restores the caller's order on the results.
    """

    def __init__(self, vllm_model):
        super().__init__()
        self.llm = vllm_model
        # Seeded so the shuffling is reproducible from run to run.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed ``sentences`` and return the result in input order.

        Extra positional and keyword arguments are accepted but ignored.
        """
        # Randomizing the order is meant to help uncover potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        vectors = np.array(self.llm.embed(shuffled, use_tqdm=False))

        # argsort of a permutation is its inverse, so this undoes the shuffle.
        return vectors[np.argsort(order)]

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score (query, corpus) pairs and return scores in input order.

        Only the first two elements of each tuple are used; the prompt
        element and any extra arguments are ignored.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        query_texts = [item[0] for item in shuffled]
        corpus_texts = [item[1] for item in shuffled]

        raw_scores = self.llm.score(query_texts,
                                    corpus_texts,
                                    truncate_prompt_tokens=-1,
                                    use_tqdm=False)

        # Map the scores back to the order the caller supplied.
        return np.array(raw_scores)[np.argsort(order)]

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94

class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder that obtains embeddings through an OpenAI-style client.

    The client is expected to expose ``embeddings.create(model=, input=)``
    returning an object whose ``data`` items carry an ``embedding``.
    """

    def __init__(self, model_name: str, client):
        super().__init__()
        self.client = client
        self.model_name = model_name
        # Seeded so the shuffling is reproducible from run to run.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` in one request; return them in input order.

        Extra positional and keyword arguments are accepted but ignored.
        """
        # Randomizing the order is meant to help uncover potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = np.array([item.embedding for item in response.data])

        # argsort of a permutation is its inverse, so this undoes the shuffle.
        return vectors[np.argsort(order)]


95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
class ScoreClientMtebEncoder(mteb.Encoder):
    """MTEB cross-encoder adapter that scores pairs through an HTTP endpoint.

    Each (query, corpus) pair is sent in its own POST request to ``url``.
    Subclasses can override :meth:`get_score` to target a different
    request/response schema.
    """

    def __init__(self, model_name: str, url):
        super().__init__()
        self.model_name = model_name
        self.url = url
        # Seeded so the shuffling is reproducible from run to run.
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score every pair and return the scores in input order.

        Only the query and corpus elements of each tuple are used; the
        prompt element and any extra arguments are ignored.
        """
        r = self.rng.permutation(len(sentences))
        shuffled = [sentences[i] for i in r]

        # Index instead of unpacking: the prompt is never used, and this
        # matches how the other encoders in this file read the tuples.
        outputs = [self.get_score(s[0], s[1]) for s in shuffled]

        scores = np.array(outputs)
        # argsort of a permutation is its inverse, so this undoes the shuffle.
        return scores[np.argsort(r)]

    def get_score(self, query, corpus):
        """POST one pair to the endpoint and return its score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "text_1": query,
                                     "text_2": corpus,
                                     "truncate_prompt_tokens": -1,
                                 })
        # Surface HTTP failures directly instead of as an opaque KeyError
        # (or JSON decoding error) when reading the error body below.
        response.raise_for_status()
        return response.json()['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of the score client that uses a rerank-style request schema."""

    def get_score(self, query, corpus):
        """POST one query with a single document; return its relevance score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "query": query,
                                     "documents": [corpus],
                                     "truncate_prompt_tokens": -1,
                                 })
        # Surface HTTP failures directly instead of as an opaque KeyError
        # (or JSON decoding error) when reading the error body below.
        response.raise_for_status()
        return response.json()['results'][0]["relevance_score"]


145
146
147
def run_mteb_embed_task(encoder, tasks):
    """Run the given MTEB tasks with ``encoder`` and return a main score.

    The value returned is the ``main_score`` of the first "test" split
    entry of the first task result. No results are written to disk
    (``output_folder=None``).
    """
    evaluation = mteb.MTEB(tasks=mteb.get_tasks(tasks=tasks))

    run_options = {
        "verbosity": 0,
        "output_folder": None,
        "encode_kwargs": {
            "show_progress_bar": False,
        },
    }
    results = evaluation.run(encoder, **run_options)

    first_result = results[0]
    return first_result.scores["test"][0]["main_score"]


161
162
163
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None,
                           atol=MTEB_EMBED_TOL):
    """Check that vLLM and SentenceTransformers agree on the MTEB embed task.

    The model is scored on ``MTEB_EMBED_TASKS`` first through vLLM and then
    through ``hf_runner`` (float32, SentenceTransformers mode). The two main
    scores must match within ``atol``.

    Args:
        hf_runner: Context-manager factory for the reference HF model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model under test; the test is skipped when
            ``model_info.enable_test`` is false.
        vllm_extra_kwargs: Optional extra keyword arguments for
            ``vllm_runner``. The mapping is copied, never modified, and its
            ``dtype`` entry is always overridden by ``model_info.dtype``.
        hf_model_callback: Optional callable invoked with the HF model
            before it is evaluated.
        atol: Absolute tolerance for the score comparison. Defaults to the
            embedding tolerance ``MTEB_EMBED_TOL``.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Work on a copy so the caller's dict is not mutated by the dtype
    # override below.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Sanity-check that vLLM resolved the expected architecture and
        # default pooling type before spending time on the evaluation.
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = model_config.dtype

    with hf_runner(model_info.name,
                   is_sentence_transformer=True,
                   dtype="float32") as hf_model:

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
        st_dtype = next(hf_model.model.parameters()).dtype

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242


def run_mteb_rerank(cross_encoder, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return the main score.

    Stage 1 runs the ``bm25s`` model and saves its predictions. Stage 2
    runs ``cross_encoder`` with ``top_k=10`` on top of those predictions.
    All intermediate files live in a temporary directory that is removed
    before returning.

    NOTE(review): the stage-1 predictions path hard-codes "NFCorpus", so
    this presumably only works when ``tasks`` selects that task -- confirm
    before passing other tasks.
    """
    with tempfile.TemporaryDirectory() as results_folder:
        first_stage_model = mteb.get_model("bm25s")
        selected_tasks = mteb.get_tasks(tasks=tasks, languages=languages)

        subset = "default"
        eval_splits = ["test"]
        quiet_encode = {"show_progress_bar": False}

        evaluation = mteb.MTEB(tasks=selected_tasks)

        # Stage 1: retrieval whose saved predictions seed the reranker.
        evaluation.run(
            first_stage_model,
            verbosity=0,
            eval_splits=eval_splits,
            save_predictions=True,
            output_folder=f"{results_folder}/stage1",
            encode_kwargs=quiet_encode,
        )

        # Stage 2: rerank the top candidates from stage 1.
        stage1_predictions = (
            f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json")
        results = evaluation.run(
            cross_encoder,
            verbosity=0,
            eval_splits=eval_splits,
            top_k=10,
            save_predictions=True,
            output_folder=f"{results_folder}/stage2",
            previous_results=stage1_predictions,
            encode_kwargs={"show_progress_bar": False},
        )
        return results[0].scores["test"][0]["main_score"]


243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
    """Score the reference HF cross-encoder on the MTEB rerank task.

    The model's ``predict`` is wrapped so that only (query, corpus) is
    forwarded with ``batch_size=8``; the unwrapped method stays reachable
    as ``hf_model.original_predict``.

    Returns:
        A ``(main_score, dtype)`` tuple, where ``dtype`` is that of the
        first parameter of the underlying model.
    """
    with hf_runner(model_name, is_cross_encoder=True,
                   dtype="float32") as hf_model:

        unwrapped_predict = hf_model.predict

        def _predict(
            sentences: list[tuple[str, str,
                                  Optional[str]]],  # query, corpus, prompt
            *args,
            **kwargs,
        ):
            # vllm and st both remove the prompt, fair comparison.
            pairs = [(item[0], item[1]) for item in sentences]
            return unwrapped_predict(pairs, *args, **kwargs, batch_size=8)

        hf_model.predict = _predict
        hf_model.original_predict = unwrapped_predict

        # Let the caller adjust the model only after the wrapper is in place.
        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        score = run_mteb_rerank(hf_model,
                                tasks=MTEB_RERANK_TASKS,
                                languages=MTEB_RERANK_LANGS)
        first_param = next(hf_model.model.model.parameters())
        return score, first_param.dtype


272
273
274
275
def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder,
                            atol=MTEB_RERANK_TOL):
    """Check that vLLM and the HF cross-encoder agree on the MTEB rerank task.

    The model is scored on ``MTEB_RERANK_TASKS`` first through vLLM and then
    through the HF reference path. The two main scores must match within
    ``atol``.

    Args:
        hf_runner: Context-manager factory for the reference HF model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model under test; the test is skipped when
            ``model_info.enable_test`` is false.
        vllm_extra_kwargs: Optional extra keyword arguments for
            ``vllm_runner``. The mapping is copied, never modified, and its
            ``dtype`` entry is always overridden by ``model_info.dtype``.
        hf_model_callback: Optional callable forwarded to the HF reference
            evaluation.
        vllm_mteb_encoder: Callable that wraps the vLLM model into an MTEB
            encoder.
        atol: Absolute tolerance for the score comparison.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Work on a copy so the caller's dict is not mutated by the dtype
    # override below.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     max_num_seqs=8,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Sanity-check the resolved architecture, the single-label head and
        # the default pooling type before running the evaluation.
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)
        assert model_config.hf_config.num_labels == 1
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype

    st_main_score, st_dtype = mteb_test_rerank_models_hf(
        hf_runner, model_info.name, hf_model_callback)

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)