"vllm/vscode:/vscode.git/clone" did not exist on "1f8b7c536be40975573eeebf36204286cfb4e4e9"
mteb_utils.py 3.98 KB
Newer Older
1
2
3
4
5
6
7
8
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Sequence

import mteb
import numpy as np
import pytest

from tests.models.utils import EmbedModelInfo
9
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79

# Observed for most models on the STS12 task (see #17175):
# - Differences in model implementation and minor changes in tensor dtype
#   result in score differences of less than 1e-4.
# - Using a different model results in score differences of more than 1e-3.
# Therefore 1e-4 is a good tolerance threshold.

# MTEB task names that the embedding tests in this module evaluate.
MTEB_EMBED_TASKS = ["STS12"]
# Absolute tolerance used when comparing two MTEB main scores.
MTEB_EMBED_TOL = 1e-4


class VllmMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that forwards sentences to a vLLM model.

    The wrapped object only needs to expose
    ``encode(sentences, use_tqdm=...)``.
    """

    def __init__(self, vllm_model):
        super().__init__()
        self.model = vllm_model
        # Fixed seed so the shuffling applied in ``encode`` is reproducible.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args,
               **kwargs) -> np.ndarray:
        """Encode ``sentences`` and return the results in input order.

        The sentences are submitted to the model in a random order, in the
        hope of surfacing potential scheduling issues; the outputs are put
        back into the caller's order before being returned. Extra positional
        and keyword arguments are accepted but not used.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        raw_outputs = self.model.encode(shuffled, use_tqdm=False)

        # argsort of a permutation is its inverse, so indexing with it
        # restores the original sentence order.
        restore = np.argsort(order)
        return np.array(raw_outputs)[restore]


class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that requests embeddings through a client.

    The client only needs to expose ``embeddings.create(model=..., input=...)``
    returning an object whose ``data`` items carry an ``embedding`` attribute.
    """

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        # Fixed seed so the shuffling applied in ``encode`` is reproducible.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` via the client and return them in input order.

        The sentences are sent in a random order, in the hope of surfacing
        potential scheduling issues; the embeddings are put back into the
        caller's order before being returned. Extra positional and keyword
        arguments are accepted but not used.
        """
        order = self.rng.permutation(len(sentences))
        shuffled = [sentences[idx] for idx in order]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = np.array([item.embedding for item in response.data])

        # argsort of a permutation is its inverse, so indexing with it
        # restores the original sentence order.
        return vectors[np.argsort(order)]


def run_mteb_embed_task(encoder, tasks):
    """Run the given MTEB tasks with ``encoder`` and return one main score.

    Only the first returned result is inspected: the value returned is the
    ``"main_score"`` of the first entry of its ``"test"`` scores.
    """
    selected_tasks = mteb.get_tasks(tasks=tasks)
    runner = mteb.MTEB(tasks=selected_tasks)
    task_results = runner.run(encoder, verbosity=0, output_folder=None)

    first_result = task_results[0]
    return first_result.scores["test"][0]["main_score"]


def run_mteb_embed_task_st(model_name, tasks):
    """Run the MTEB tasks using a SentenceTransformer built from a name.

    Returns whatever ``run_mteb_embed_task`` returns for that model.
    """
    # Function-scope import: sentence_transformers is only loaded when this
    # helper is actually called.
    from sentence_transformers import SentenceTransformer

    return run_mteb_embed_task(SentenceTransformer(model_name), tasks)


80
81
82
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None,
                           hf_model_callback=None):
    """Check that vLLM and SentenceTransformers agree on the MTEB main score.

    The model is first evaluated through ``vllm_runner`` and then through
    ``hf_runner`` (as a sentence-transformer, using the dtype the vLLM model
    config reports); the two main scores must match within
    ``MTEB_EMBED_TOL``.

    Args:
        hf_runner: Callable returning a context manager that yields the
            reference model; called with the model name,
            ``is_sentence_transformer=True`` and ``dtype``.
        vllm_runner: Callable returning a context manager that yields the
            vLLM model; called with the model name, ``task="embed"``,
            ``max_model_len=None`` and the extra kwargs.
        model_info: Describes the model under test. The attributes read here
            are ``enable_test``, ``name``, ``dtype`` and ``architecture``.
        vllm_extra_kwargs: Optional extra keyword arguments for
            ``vllm_runner``. The mapping is copied, so the caller's object is
            never modified; its ``"dtype"`` entry is always overridden with
            ``model_info.dtype``.
        hf_model_callback: Optional callable invoked with the reference model
            before it is evaluated.

    Raises:
        AssertionError: If the expected architecture is not among those in
            the vLLM model config, or the scores differ by more than
            ``MTEB_EMBED_TOL``.
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    # Copy before adding "dtype" so a dict supplied by the caller (and
    # possibly shared between tests) is not mutated.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})
    vllm_extra_kwargs["dtype"] = model_info.dtype

    with vllm_runner(model_info.name,
                     task="embed",
                     max_model_len=None,
                     **vllm_extra_kwargs) as vllm_model:

        if model_info.architecture:
            assert (model_info.architecture
                    in vllm_model.model.llm_engine.model_config.architectures)

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype

    # Both context managers must be entered, hence the comma. Writing
    # ``A and B`` here would evaluate to ``B`` alone (a context-manager
    # object is truthy), so the default dtype would never be applied.
    with set_default_torch_dtype(vllm_dtype), hf_runner(
            model_info.name, is_sentence_transformer=True,
            dtype=vllm_dtype) as hf_model:

        if hf_model_callback is not None:
            hf_model_callback(hf_model)

        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)

    print("VLLM:", vllm_main_score)
    print("SentenceTransformers:", st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL)