mteb_utils.py 13.1 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4

import tempfile
5
from collections.abc import Sequence
6
from typing import Optional
7
8
9
10

import mteb
import numpy as np
import pytest
11
import requests
12
import torch
13

14
15
from tests.models.utils import (EmbedModelInfo, RerankModelInfo,
                                check_embeddings_close)
16

17
# Observed for most embedding models on the STS12 task (see #17175):
# - Model implementation changes and minor tensor dtype changes shift the
#   score by less than 1e-4.
# - Using a different model shifts the score by more than 1e-3.
# 1e-4 is therefore a good tolerance threshold.
MTEB_EMBED_TASKS = ["STS12"]
MTEB_EMBED_TOL = 1e-4

# Rerank evaluation settings; see #19344.
MTEB_RERANK_TASKS = ["NFCorpus"]
MTEB_RERANK_LANGS = ["en"]
MTEB_RERANK_TOL = 2e-3
29

30
31
32
33
34

class VllmMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter backed by an in-process vLLM model.

    Every call shuffles its inputs with a fixed-seed generator before
    handing them to the model, then restores the caller's order on the
    results before returning them.
    """

    def __init__(self, vllm_model):
        super().__init__()
        self.llm = vllm_model
        # Fixed seed keeps the shuffles reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed ``sentences`` and return the results in input order.

        Extra positional and keyword arguments are accepted but unused.
        """
        # Randomize the submission order, hoping to surface potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        raw_outputs = self.llm.embed(shuffled, use_tqdm=False)

        # argsort of a permutation is its inverse, so indexing with it
        # puts the rows back in the caller's order.
        return np.array(raw_outputs)[np.argsort(order)]

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, corpus, prompt) item; results in input order.

        Only the first two elements of each item are sent to the model; the
        prompt element is not used. Extra positional and keyword arguments
        are accepted but unused.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        queries = [item[0] for item in shuffled]
        corpus = [item[1] for item in shuffled]

        raw_scores = self.llm.score(queries,
                                    corpus,
                                    truncate_prompt_tokens=-1,
                                    use_tqdm=False)

        # Undo the shuffle so scores line up with the original items.
        return np.array(raw_scores)[np.argsort(order)]

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that embeds through an OpenAI-style client.

    ``client`` must expose ``client.embeddings.create(model=..., input=...)``
    whose result has a ``data`` sequence of items with an ``embedding``
    attribute.
    """

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        # Fixed seed keeps the shuffles reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` in one request; results in input order.

        Extra positional and keyword arguments are accepted but unused.
        """
        # Randomize the submission order, hoping to surface potential
        # scheduling issues.
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = [item.embedding for item in response.data]

        # argsort of a permutation is its inverse; it restores input order.
        return np.array(vectors)[np.argsort(order)]


97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
class ScoreClientMtebEncoder(mteb.Encoder):
    """MTEB cross-encoder adapter that scores pairs over HTTP.

    Each (query, corpus) pair is sent as its own POST request to ``url``.
    Subclasses can override ``get_score`` to target a different endpoint
    or payload format.
    """

    def __init__(self, model_name: str, url):
        super().__init__()
        self.model_name = model_name
        self.url = url
        # Fixed seed keeps the shuffles reproducible between runs.
        self.rng = np.random.default_rng(seed=42)

    def predict(
        self,
        sentences: list[tuple[str, str,
                              Optional[str]]],  # query, corpus, prompt
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Score each (query, corpus, prompt) item; results in input order.

        The prompt element of each item is not sent to the server. Extra
        positional and keyword arguments are accepted but unused.
        """
        # Shuffle the request order, then restore the caller's order below.
        r = self.rng.permutation(len(sentences))
        sentences = [sentences[i] for i in r]

        outputs = [
            self.get_score(query, corpus) for query, corpus, _ in sentences
        ]

        scores = np.array(outputs)
        # argsort of a permutation is its inverse; it undoes the shuffle.
        return scores[np.argsort(r)]

    def get_score(self, query, corpus):
        """POST one pair to ``self.url`` and return its score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "text_1": query,
                                     "text_2": corpus,
                                     "truncate_prompt_tokens": -1,
                                 })
        # Fail with the HTTP status instead of an opaque KeyError when the
        # server returns an error payload.
        response.raise_for_status()
        return response.json()['data'][0]["score"]


class RerankClientMtebEncoder(ScoreClientMtebEncoder):
    """Variant of ``ScoreClientMtebEncoder`` that uses a rerank payload.

    Sends ``query`` plus a single-element ``documents`` list and reads the
    ``relevance_score`` of the first result.
    """

    def get_score(self, query, corpus):
        """POST one query/document pair and return its relevance score.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.post(self.url,
                                 json={
                                     "model": self.model_name,
                                     "query": query,
                                     "documents": [corpus],
                                     "truncate_prompt_tokens": -1,
                                 })
        # Fail with the HTTP status instead of an opaque KeyError when the
        # server returns an error payload.
        response.raise_for_status()
        return response.json()['results'][0]["relevance_score"]


147
148
149
def run_mteb_embed_task(encoder, tasks):
    """Evaluate ``encoder`` on the named MTEB tasks and return the main score.

    The value returned is the ``main_score`` of the first "test" entry of
    the first result.
    """
    evaluation = mteb.MTEB(tasks=mteb.get_tasks(tasks=tasks))
    results = evaluation.run(
        encoder,
        verbosity=0,
        output_folder=None,
        encode_kwargs={"show_progress_bar": False},
    )

    first_result = results[0]
    return first_result.scores["test"][0]["main_score"]


163
164
165
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None,
                           atol=MTEB_EMBED_TOL):
    """Compare vLLM's MTEB embedding score against SentenceTransformers.

    Args:
        hf_runner: Context-manager factory for the SentenceTransformers
            model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model under test. ``dtype`` and, when set,
            ``hf_overrides`` are forwarded to vLLM. A non-None
            ``mteb_score`` is used as the reference score instead of
            running the SentenceTransformers model.
        vllm_extra_kwargs: Extra keyword arguments for ``vllm_runner``.
            The dict is copied, so the caller's object is never modified.
        hf_model_callback: Optional callable invoked with the HF model
            before evaluation.
        atol: Maximum amount by which the vLLM score may fall below the
            reference score.

    Raises:
        AssertionError: if an architecture, pooling type, NaN, embedding
            closeness or score check fails.
    """
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    # Test embed_dims, isnan and whether to use normalize
    example_prompts = ["The chef prepared a delicious meal." * 1000]

    # Copy before adding keys so a dict supplied by the caller (possibly
    # shared between parametrized tests) is never mutated.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs["dtype"] = model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Confirm whether vllm uses the correct default_pooling_type, which
        # relates to whether chunked prefill and prefix caching are enabled
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = model_config.dtype

        # Test embed_dims, isnan and whether to use normalize
        vllm_outputs = vllm_model.embed(example_prompts,
                                        truncate_prompt_tokens=-1)
        assert not torch.any(torch.isnan(torch.tensor(vllm_outputs)))

    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
        with hf_runner(model_info.name,
                       is_sentence_transformer=True,
                       dtype="float32") as hf_model:

            # e.g. setting default parameters for the encode method of hf_runner
            if hf_model_callback is not None:
                hf_model_callback(hf_model)

            st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)
            st_dtype = next(hf_model.model.parameters()).dtype

            # Test embed_dims and whether to use normalize
            hf_outputs = hf_model.encode(example_prompts)
            check_embeddings_close(
                embeddings_0_lst=hf_outputs,
                embeddings_1_lst=vllm_outputs,
                name_0="hf",
                name_1="vllm",
                tol=1e-2,
            )
    else:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280


def run_mteb_rerank(cross_encoder, tasks, languages):
    """Run a two-stage MTEB rerank evaluation and return the main score.

    Stage 1 runs the ``bm25s`` model and saves its predictions; stage 2
    runs ``cross_encoder`` with ``top_k=10`` on top of those predictions.
    All files go to a temporary directory that is removed on exit. The
    value returned is the ``main_score`` of the first "test" entry of the
    first stage-2 result.
    """
    with tempfile.TemporaryDirectory() as results_folder:
        first_stage_model = mteb.get_model("bm25s")
        selected_tasks = mteb.get_tasks(tasks=tasks, languages=languages)

        eval_splits = ["test"]
        subset = "default"

        evaluation = mteb.MTEB(tasks=selected_tasks)

        # Stage 1: first-stage predictions from bm25s.
        evaluation.run(
            first_stage_model,
            verbosity=0,
            eval_splits=eval_splits,
            save_predictions=True,
            output_folder=f"{results_folder}/stage1",
            encode_kwargs={"show_progress_bar": False},
        )

        # NOTE(review): the prediction filename hard-codes "NFCorpus", so
        # this presumably only works when `tasks` is NFCorpus -- confirm
        # before passing other tasks.
        stage1_predictions = (
            f"{results_folder}/stage1/NFCorpus_{subset}_predictions.json")

        # Stage 2: rerank the stage-1 predictions with the cross-encoder.
        stage2_results = evaluation.run(
            cross_encoder,
            verbosity=0,
            eval_splits=eval_splits,
            top_k=10,
            save_predictions=True,
            output_folder=f"{results_folder}/stage2",
            previous_results=stage1_predictions,
            encode_kwargs={"show_progress_bar": False},
        )
        return stage2_results[0].scores["test"][0]["main_score"]


281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def mteb_test_rerank_models_hf(hf_runner, model_name, hf_model_callback=None):
    """Compute the reference rerank score with the HF cross-encoder.

    The model's ``predict`` is wrapped so that only (query, corpus) pairs
    are forwarded, with ``batch_size=8``; the unwrapped method stays
    reachable as ``hf_model.original_predict``.

    Returns:
        Tuple of (main score, dtype of the first model parameter).
    """
    with hf_runner(model_name, is_cross_encoder=True,
                   dtype="float32") as hf_model:

        wrapped_predict = hf_model.predict

        def _predict(
            sentences: list[tuple[str, str,
                                  Optional[str]]],  # query, corpus, prompt
            *args,
            **kwargs,
        ):
            # vllm and st both remove the prompt, fair comparison.
            pairs = [(item[0], item[1]) for item in sentences]
            return wrapped_predict(pairs, *args, **kwargs, batch_size=8)

        hf_model.predict = _predict
        hf_model.original_predict = wrapped_predict

        # Give the caller a chance to adjust the already-patched model.
        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        score = run_mteb_rerank(hf_model,
                                tasks=MTEB_RERANK_TASKS,
                                languages=MTEB_RERANK_LANGS)
        first_param = next(hf_model.model.model.parameters())
        return score, first_param.dtype


310
311
312
313
def mteb_test_rerank_models(hf_runner,
                            vllm_runner,
                            model_info: RerankModelInfo,
                            vllm_extra_kwargs=None,
                            hf_model_callback=None,
                            vllm_mteb_encoder=VllmMtebEncoder,
                            atol=MTEB_RERANK_TOL):
    """Compare vLLM's MTEB rerank score against the HF cross-encoder.

    Args:
        hf_runner: Context-manager factory for the HF cross-encoder.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model under test. ``dtype`` and, when set,
            ``hf_overrides`` are forwarded to vLLM. A non-None
            ``mteb_score`` is used as the reference score instead of
            running the HF model.
        vllm_extra_kwargs: Extra keyword arguments for ``vllm_runner``.
            The dict is copied, so the caller's object is never modified.
        hf_model_callback: Optional callable invoked with the HF model
            before evaluation.
        vllm_mteb_encoder: Callable that wraps the vLLM model into the
            encoder handed to ``run_mteb_rerank``.
        atol: Maximum amount by which the vLLM score may fall below the
            reference score.

    Raises:
        AssertionError: if an architecture, ``num_labels``, pooling type
            or score check fails.
    """
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    # Copy before adding keys so a dict supplied by the caller (possibly
    # shared between parametrized tests) is never mutated.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs["dtype"] = model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     runner="pooling",
                     max_model_len=None,
                     max_num_seqs=8,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:

        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)

        # Score API is only enabled for num_labels == 1
        assert model_config.hf_config.num_labels == 1

        # Confirm whether vllm uses the correct default_pooling_type, which
        # relates to whether chunked prefill and prefix caching are enabled
        assert (model_config._model_info.default_pooling_type ==
                model_info.default_pooling_type)

        vllm_main_score = run_mteb_rerank(vllm_mteb_encoder(vllm_model),
                                          tasks=MTEB_RERANK_TASKS,
                                          languages=MTEB_RERANK_LANGS)
        vllm_dtype = model_config.dtype

    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
        st_main_score, st_dtype = mteb_test_rerank_models_hf(
            hf_runner, model_info.name, hf_model_callback)
    else:
        st_main_score = model_info.mteb_score
        st_dtype = "Constant"

    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformers:", st_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    # We are not concerned that the vllm mteb results are better
    # than SentenceTransformers, so we only perform one-sided testing.
    assert st_main_score - vllm_main_score < atol