test_bge_m3.py 5.74 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio

import httpx
import openai
import pytest
import pytest_asyncio
import torch

from ....utils import RemoteOpenAIServer
from .embed_utils import run_client_embeddings

# Hugging Face model id; handed to the test server and sent as "model" in requests.
MODEL_NAME = "BAAI/bge-m3"
# Forwarded to the server as the value of --max-model-len.
MAX_MODEL_LEN = 512


# Example from https://huggingface.co/BAAI/bge-m3
Jiayi Yan's avatar
Jiayi Yan committed
17
# First sentence group; the tests use it as the left-hand (query-side) operand.
sentences_1 = ["What is BGE M3?", "Definition of BM25"]
18
19
20
21
22
23
24
# Second sentence group; the tests use it as the right-hand (passage-side) operand.
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, "
    "lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set "
    "of documents based on the query terms appearing in each document",
]

25
# Expected dense similarities: entry [i][j] pairs sentences_1[i] with sentences_2[j].
similarity_reference = [[0.6259, 0.3474], [0.3309, 0.6734]]
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Expected lexical matching scores:
#   [0] sentences_1[0] vs sentences_2[0], [1] sentences_1[0] vs sentences_1[1].
lexical_score_reference = [0.19554901123046875, 0.0]
# Expected ColBERT scores:
#   [0] sentences_1[0] vs sentences_2[0], [1] sentences_1[0] vs sentences_2[1].
colbert_score_reference = [0.7797, 0.4620]


@pytest.fixture(scope="module")
def server():
    """Start a ``RemoteOpenAIServer`` for ``MODEL_NAME``, shared module-wide.

    The server is launched with the configured max model length and an
    ``--hf-overrides`` JSON that selects the ``BgeM3EmbeddingModel``
    architecture. It is yielded from inside the context manager, so the
    context is exited once the module's tests are done with the fixture.
    """
    hf_overrides = '{"architectures": ["BgeM3EmbeddingModel"]}'
    cli_args = ["--max-model-len", str(MAX_MODEL_LEN)]
    cli_args += ["--hf-overrides", hf_overrides]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client is created per test and released when the ``async with``
    block exits after the test finishes.
    """
    async with server.get_async_client() as api_client:
        yield api_client


@pytest.mark.asyncio
async def test_bge_m3_api_server_embedding(client: openai.AsyncOpenAI):
    """Check dense embeddings from the server against reference similarities."""
    raw_1 = await run_client_embeddings(client, MODEL_NAME, sentences_1)
    raw_2 = await run_client_embeddings(client, MODEL_NAME, sentences_2)

    dense_1, dense_2 = torch.tensor(raw_1), torch.tensor(raw_2)

    # Entry [i][j] is the dot product of sentences_1[i] and sentences_2[j].
    actual = dense_1 @ dense_2.T
    # reference values from BAAI/bge-m3 documentation
    expected = torch.tensor(similarity_reference)

    assert torch.allclose(actual, expected, rtol=0.01)


async def tokenize(client: openai.AsyncOpenAI, sentences: list[str]) -> list[list[int]]:
    """Tokenize each sentence through the server's ``../tokenize`` route.

    Args:
        client: Async OpenAI client used to issue the raw POST requests.
        sentences: Texts to tokenize; one request is sent per sentence.

    Returns:
        The ``"tokens"`` field of each response, in the same order as
        ``sentences``. An empty ``sentences`` list yields an empty list.
    """
    # ``client.post`` only produces a coroutine; awaiting the collected
    # coroutines one by one would run the requests serially and leave the
    # remaining ones un-awaited if an earlier request raised.
    # ``asyncio.gather`` runs them concurrently and preserves input order.
    responses = await asyncio.gather(
        *(
            client.post(
                "../tokenize",
                body={"model": MODEL_NAME, "prompt": sentence},
                cast_to=httpx.Response,
            )
            for sentence in sentences
        )
    )
    return [response.json()["tokens"] for response in responses]


async def sparse_embeddings(
    client: openai.AsyncOpenAI, sentences: list[str]
) -> list[dict[int, float]]:
    """Build per-sentence sparse weights mapping token id to weight.

    Token ids come from ``tokenize``; the per-token values come from one
    ``../pooling`` request with task ``token_classify`` covering all
    sentences.

    Args:
        client: Async OpenAI client used to issue the raw POST requests.
        sentences: Texts to embed.

    Returns:
        One dict per sentence. For a token id that occurs several times the
        largest value is kept.
    """
    all_tokens = await tokenize(client, sentences)
    result = await client.post(
        "../pooling",
        body={"model": MODEL_NAME, "input": sentences, "task": "token_classify"},
        cast_to=httpx.Response,
    )
    all_embeddings = [item["data"] for item in result.json()["data"]]

    ret: list[dict[int, float]] = []
    for sent_tokens, sent_emb in zip(all_tokens, all_embeddings):
        # Drop a leading token id 0 before pairing ids with values
        # (presumably the BOS token, absent from the pooled output -- TODO
        # confirm). The emptiness check avoids an IndexError when a sentence
        # produced no tokens at all.
        if sent_tokens and sent_tokens[0] == 0:
            sent_tokens = sent_tokens[1:]

        token_embs: dict[int, float] = {}
        # ``zip`` stops at the shorter sequence, so surplus ids or values are
        # ignored.
        for token, val in zip(sent_tokens, sent_emb):
            # The 0.0 default also means negative values are stored as 0.0.
            token_embs[token] = max(val, token_embs.get(token, 0.0))
        ret.append(token_embs)
    return ret


# Based on https://github.com/FlagOpen/FlagEmbedding/blob/6fd176266f2382878bcc69cd656cff425d52f49b/FlagEmbedding/inference/embedder/encoder_only/m3.py#L129
def compute_lexical_matching_score(
    lw1: dict[int, float], lw2: dict[int, float]
) -> float:
    """Return the sum of weight products over token ids present in both maps.

    Args:
        lw1: Token id -> weight for the first text.
        lw2: Token id -> weight for the second text.

    Returns:
        ``sum(lw1[t] * lw2[t])`` over shared token ids, accumulated in
        ``lw1``'s iteration order; ``0.0`` when no token id is shared.
    """
    shared_tokens = (token for token in lw1 if token in lw2)

    total = 0.0
    for token in shared_tokens:
        total += lw1[token] * lw2[token]
    return total


@pytest.mark.asyncio
async def test_bge_m3_api_server_sparse_embedding(client: openai.AsyncOpenAI):
    """Check lexical matching scores from sparse embeddings against references."""
    weights_1 = await sparse_embeddings(client, sentences_1)
    weights_2 = await sparse_embeddings(client, sentences_2)

    # sentences_1[0] against sentences_2[0]
    score_cross = compute_lexical_matching_score(weights_1[0], weights_2[0])
    assert score_cross == pytest.approx(lexical_score_reference[0], rel=0.01)

    # sentences_1[0] against sentences_1[1]
    score_within = compute_lexical_matching_score(weights_1[0], weights_1[1])
    assert score_within == pytest.approx(lexical_score_reference[1], rel=0.01)


139
140
141
142
143
144
145
146
147
148
@pytest.mark.asyncio
async def test_bge_m3_api_server_sparse_embedding_corner_case(
    client: openai.AsyncOpenAI,
):
    """Check the sparse embedding returned for the single short input "Hi"."""
    results = await sparse_embeddings(client, ["Hi"])
    assert len(results) == 1

    weights = results[0]
    # Token id expected for this input (presumably the "Hi" token -- TODO confirm).
    expected_token = 2673
    assert expected_token in weights
    assert weights[expected_token] == pytest.approx(0.26710861921310425, rel=0.01)


149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# https://github.com/FlagOpen/FlagEmbedding/blob/6fd176266f2382878bcc69cd656cff425d52f49b/FlagEmbedding/inference/embedder/encoder_only/m3.py#L163
def colbert_score(q_reps: torch.Tensor, p_reps: torch.Tensor) -> torch.Tensor:
    """Score a query against a passage from their per-token vectors.

    Args:
        q_reps: 2-D tensor, one row per query token.
        p_reps: 2-D tensor, one row per passage token, with the same last
            dimension as ``q_reps``.

    Returns:
        A 0-dim tensor: for every query token take its largest dot product
        with any passage token, sum those maxima and divide by the number
        of query tokens.
    """
    # pairwise[i, j] = <q_reps[i], p_reps[j]>
    pairwise = torch.einsum("in,jn->ij", q_reps, p_reps)
    best_per_query_token = pairwise.max(-1).values
    return torch.sum(best_per_query_token) / q_reps.size(0)


@pytest.mark.asyncio
async def test_bge_m3_api_server_multi_vector(client: openai.AsyncOpenAI):
    """Check ColBERT scores from ``token_embed`` pooling against references."""

    async def token_embeddings(texts: list[str]) -> list[torch.Tensor]:
        # One ``../pooling`` request for the whole batch; one tensor per input.
        response = await client.post(
            "../pooling",
            body={"model": MODEL_NAME, "input": texts, "task": "token_embed"},
            cast_to=httpx.Response,
        )
        return [torch.tensor(item["data"]) for item in response.json()["data"]]

    multi_vec_1 = await token_embeddings(sentences_1)
    multi_vec_2 = await token_embeddings(sentences_2)

    # sentences_1[0] against sentences_2[0]
    score_first = colbert_score(multi_vec_1[0], multi_vec_2[0])
    assert score_first == pytest.approx(colbert_score_reference[0], rel=0.01)

    # sentences_1[0] against sentences_2[1]
    score_second = colbert_score(multi_vec_1[0], multi_vec_2[1])
    assert score_second == pytest.approx(colbert_score_reference[1], rel=0.01)