# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Any

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

# Model served by the `server` fixture; also sent as the "model" field of
# every request in this module.
MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
# Value passed to the server via --max-model-len and used as the reference
# point in the truncation assertions below.
max_model_len = 128

# Prompt text sent as the "input" field of every request.
# NOTE(review): this name shadows the built-in input(); the tests reference it
# by this name, so a rename has to update them in the same change.
# NOTE(review): presumably tokenizes to more than max_model_len tokens, since
# test_max_truncation_size expects exactly max_model_len prompt tokens --
# confirm against the model's tokenizer.
input = """Immerse yourself in the enchanting chronicle of calculus, a 
    mathematical domain that has radically transformed our comprehension of 
    change and motion. Despite its roots in ancient civilizations, the 
    formal birth of calculus predominantly occurred in the 17th century, 
    primarily under the influential guidance of Sir Isaac Newton and Gottfried 
    Wilhelm Leibniz. The earliest traces of calculus concepts are found in 
    ancient Greek mathematics,most notably in the works of Eudoxus and 
    Archimedes, around 300 BCE. They utilized the 'method of exhaustion'—a 
    technique for computing areas and volumes through the use of finite sums. 
    This methodology laid crucial foundational work for integral calculus. 
    In the 17th century, both Newton and Leibniz independently pioneered 
    calculus, each contributing unique perspectives that would shape this new 
    field."""


@pytest.fixture(scope="module")
def server():
    """Launch one server for the whole module and yield it to the tests.

    The server is started for ``MODEL_NAME`` with the command-line arguments
    listed below, including ``--max-model-len`` set to the module-level
    ``max_model_len``. It is wrapped in a ``with`` block, so the context
    manager's exit runs once the module's tests have finished with it.

    Yields:
        The ``RemoteOpenAIServer`` instance produced by the context manager.
    """
    args = [
        "--runner",
        "pooling",
        "--dtype",
        "bfloat16",
        "--enforce-eager",
        "--max-model-len",
        # CLI arguments are strings, so the integer limit is converted here.
        str(max_model_len),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    """Yield an async client obtained from the ``server`` fixture.

    The client is used inside ``async with``, so its async context manager
    exit runs after the consuming test is done with it.
    """
    client_cm = server.get_async_client()
    async with client_cm as api_client:
        yield api_client


@pytest.mark.asyncio
async def test_smaller_truncation_size(client: openai.AsyncOpenAI):
    """Check that a small ``truncate_prompt_tokens`` value is honoured.

    Posts an embeddings request with ``truncate_prompt_tokens`` set to 10 and
    asserts that the reported ``usage.prompt_tokens`` equals that value.
    """
    limit = 10
    payload: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": limit,
    }

    result = await client.post(path="embeddings", cast_to=object, body=payload)

    reported_tokens = result["usage"]["prompt_tokens"]
    assert reported_tokens == limit


67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@pytest.mark.asyncio
async def test_zero_truncation_size(client: openai.AsyncOpenAI):
    """Check the error returned when ``truncate_prompt_tokens`` is 0.

    The request is expected to raise ``openai.BadRequestError`` with HTTP
    status 400, an error type of ``"BadRequestError"``, and a message that
    contains each of the context-length fragments listed below.
    """
    payload: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": 0,
    }

    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.post(path="embeddings", cast_to=object, body=payload)

    failure = exc_info.value
    assert failure.status_code == 400

    error_body = failure.response.json()["error"]
    assert error_body["type"] == "BadRequestError"

    # Every fragment must appear somewhere in the server's error message.
    message = error_body["message"]
    expected_fragments = (
        "This model's maximum context length is",
        "tokens in the input for embedding generation",
        "Please reduce the length of the input",
    )
    for fragment in expected_fragments:
        assert fragment in message


89
90
91
92
93
94
95
96
97
98
@pytest.mark.asyncio
async def test_bigger_truncation_size(client: openai.AsyncOpenAI):
    """Check the error returned when the truncation size exceeds the limit.

    Sends ``truncate_prompt_tokens = max_model_len + 1`` and expects
    ``openai.BadRequestError`` with HTTP status 400, an error type of
    ``"BadRequestError"``, and exactly the message asserted below.
    """
    truncation_size = max_model_len + 1
    kwargs: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": truncation_size
    }

    with pytest.raises(openai.BadRequestError) as err:
        await client.post(path="embeddings", cast_to=object, body={**kwargs})

    assert err.value.status_code == 400
    error_details = err.value.response.json()["error"]
    assert error_details["type"] == "BadRequestError"
    # The message is compared for exact equality, not as a substring.
    expected_message = ("truncate_prompt_tokens value is "
                        "greater than max_model_len."
                        " Please, select a smaller truncation size.")
    assert error_details["message"] == expected_message
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123


@pytest.mark.asyncio
async def test_max_truncation_size(client: openai.AsyncOpenAI):
    """Check the behaviour of ``truncate_prompt_tokens = -1``.

    Posts an embeddings request with a truncation size of -1 and asserts that
    the reported ``usage.prompt_tokens`` equals ``max_model_len``.
    """
    payload: dict[str, Any] = {
        "model": MODEL_NAME,
        "input": input,
        "truncate_prompt_tokens": -1,
    }

    result = await client.post(path="embeddings", cast_to=object, body=payload)

    reported_tokens = result["usage"]["prompt_tokens"]
    assert reported_tokens == max_model_len