test_prompt_validation.py 4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import io
5
from unittest.mock import Mock
6

7
# imports for structured outputs tests
8
import openai
9
import pybase64
10
import pytest
11
import regex as re
12
13
import torch

14
from tests.utils import RemoteOpenAIServer
15
from vllm.config import ModelConfig
16
from vllm.renderers.embed_utils import safe_load_prompt_embeds
17
18
19
20
21
22
23
24
25


@pytest.mark.asyncio
async def test_empty_prompt():
    """Check that a completion request without any usable prompt is rejected.

    Starts a ``gpt2`` server with ``--enforce-eager`` and sends a completion
    request whose ``prompt`` is ``None`` and whose ``prompt_embeds`` is an
    empty list. The request must fail with ``openai.BadRequestError`` whose
    text matches the expected validation message.
    """
    model_name = "gpt2"
    expected_msg = "Either prompt or prompt_embeds must be provided and non-empty."

    # Neither a text/token prompt nor any prompt embeddings are supplied.
    request_kwargs = {
        "model": model_name,
        "prompt": None,
        "max_tokens": 5,
        "temperature": 0.0,
        "extra_body": {"prompt_embeds": []},
    }

    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        client = remote_server.get_async_client()

        with pytest.raises(openai.BadRequestError, match=expected_msg):
            await client.completions.create(**request_kwargs)
37
38
39
40
41
42
43
44
45


@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """Check that a token-id prompt outside the vocabulary is rejected.

    Starts a ``gpt2`` server with ``--enforce-eager`` and sends the single
    token id ``999999`` as the prompt (presumably beyond gpt2's vocabulary
    size). The request must fail with ``openai.BadRequestError`` whose text
    mentions "out of vocabulary".
    """
    model_name = "gpt2"
    out_of_vocab_pattern = re.compile(".*out of vocabulary.*").pattern

    with RemoteOpenAIServer(model_name, ["--enforce-eager"]) as remote_server:
        client = remote_server.get_async_client()

        with pytest.raises(openai.BadRequestError, match=out_of_vocab_pattern):
            await client.completions.create(
                model=model_name,
                prompt=[999999],
                max_tokens=5,
                temperature=0.0,
            )
52
53


54
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
)
@pytest.mark.parametrize("seq_len", [2, 10])
@pytest.mark.parametrize("hidden_size", [2, 10])
def test_load_prompt_embeds(
    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
    """Round-trip a serialized tensor through ``safe_load_prompt_embeds``.

    A random ``(seq_len, hidden_size)`` tensor of the given ``dtype`` is
    converted to the requested ``layout``, saved with ``torch.save`` and
    base64-encoded. The loaded result must be a strided CPU tensor equal to
    the dense form of the original.
    """
    model_config = Mock(spec=ModelConfig, enable_prompt_embeds=True)

    # Users may send sparse tensors to shrink the payload, so every layout is
    # exercised here; the loader is expected to hand back a dense (strided)
    # tensor in all cases. Only CPU tensors are used: touching CUDA this early
    # could initialize it and break other tests in the suite that fork
    # processes, and CPU is the one device guaranteed to be available.
    layout_conversions = {
        torch.strided: "contiguous",
        torch.sparse_coo: "to_sparse_coo",
        torch.sparse_csc: "to_sparse_csc",
        torch.sparse_csr: "to_sparse_csr",
    }
    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
    conversion_name = layout_conversions.get(layout)
    if conversion_name is not None:
        tensor = getattr(tensor, conversion_name)()

    # Serialize the tensor in memory and base64-encode the raw bytes.
    serialized = io.BytesIO()
    torch.save(tensor, serialized)
    serialized.seek(0)
    encoded_tensor = pybase64.b64encode(serialized.getvalue())

    loaded_tensor = safe_load_prompt_embeds(model_config, encoded_tensor)

    assert loaded_tensor.device.type == "cpu"
    assert loaded_tensor.layout == torch.strided

    expected = tensor.to("cpu").to_dense()
    torch.testing.assert_close(loaded_tensor, expected, equal_nan=True)
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112


@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
    """Check that loading prompt embeds fails when the feature is disabled.

    With ``enable_prompt_embeds`` set to ``False`` on the mocked model config,
    ``safe_load_prompt_embeds`` must raise a ``ValueError`` whose message
    mentions ``--enable-prompt-embeds``.
    """
    model_config = Mock(spec=ModelConfig, enable_prompt_embeds=False)

    # Build a small base64-encoded payload from a random dense tensor.
    serialized = io.BytesIO()
    torch.save(torch.randn((seq_len, hidden_size), dtype=dtype), serialized)
    serialized.seek(0)
    encoded_tensor = pybase64.b64encode(serialized.getvalue())

    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
        safe_load_prompt_embeds(model_config, encoded_tensor)