# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import io
import os

from unittest.mock import Mock

# imports for structured outputs tests
import openai
import pybase64
import pytest
import regex as re
import torch

from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer

from ...utils import RemoteOpenAIServer, models_path_prefix


@pytest.mark.asyncio
async def test_empty_prompt():
    """Check that an empty prompt plus empty prompt_embeds is a bad request.

    Starts a server for the gpt2 checkpoint located under
    ``models_path_prefix`` and sends a completion request whose ``prompt`` is
    the empty string and whose ``prompt_embeds`` is an empty list. The client
    call is expected to raise ``openai.BadRequestError`` carrying the
    "must be provided and non-empty" message.
    """
    gpt2_path = os.path.join(models_path_prefix, "gpt2")
    launch_flags = ["--enforce-eager"]
    expected_message = "Either prompt or prompt_embeds must be provided and non-empty."

    with RemoteOpenAIServer(gpt2_path, launch_flags) as server:
        async_client = server.get_async_client()

        # Both the text prompt and the embeddings payload are empty, so the
        # request carries nothing to generate from.
        request_kwargs = dict(
            model=gpt2_path,
            prompt="",
            max_tokens=5,
            temperature=0.0,
            extra_body={"prompt_embeds": []},
        )

        with pytest.raises(openai.BadRequestError, match=expected_message):
            await async_client.completions.create(**request_kwargs)
40
41
42
43
44
45
46
47
48


@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """Check that a prompt containing an out-of-vocabulary token ID is rejected.

    Starts a server for the gpt2 checkpoint and sends a completion request
    whose prompt is the single token ID ``999999``. The client call is
    expected to raise ``openai.BadRequestError`` with a message mentioning
    "out of vocabulary".
    """
    # Resolve the checkpoint through models_path_prefix, the same way
    # test_empty_prompt does, so both server tests load the same gpt2 model
    # rather than this one using the bare "gpt2" identifier.
    model_name = os.path.join(models_path_prefix, "gpt2")
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

        with pytest.raises(
            openai.BadRequestError, match=re.compile(".*out of vocabulary.*").pattern
        ):
            # The prompt is given as raw token IDs instead of text.
            await client.completions.create(
                model=model_name, prompt=[999999], max_tokens=5, temperature=0.0
            )
55
56


57
@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "layout", [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr]
)
@pytest.mark.parametrize("seq_len", [2, 10])
@pytest.mark.parametrize("hidden_size", [2, 10])
def test_load_prompt_embeds(
    dtype: torch.dtype, layout: torch.layout, seq_len: int, hidden_size: int
):
    """Round-trip a serialized tensor through ``load_prompt_embeds``.

    A random ``(seq_len, hidden_size)`` tensor of the given dtype is converted
    to the requested layout, written with ``torch.save``, base64-encoded and
    handed to a ``CompletionRenderer`` whose config enables prompt embeds.
    The single returned entry must hold a strided CPU tensor equal to the
    dense form of the input.
    """
    config = Mock(spec=ModelConfig)
    config.enable_prompt_embeds = True
    completion_renderer = CompletionRenderer(config, tokenizer=None)

    # Build arbitrary tensors across dtypes, layouts and sizes. Sparse layouts
    # are covered because a user may send sparse tensors to shrink the payload,
    # and those must come back dense/strided before they reach the engine.
    # Everything stays on CPU: touching a non-CPU device here could initialize
    # cuda early and break other tests in the suite that fork processes, and
    # CPU is the one device guaranteed to exist wherever the test runs.
    source = torch.randn((seq_len, hidden_size), dtype=dtype)
    layout_converters = (
        (torch.strided, lambda t: t.contiguous()),
        (torch.sparse_coo, lambda t: t.to_sparse_coo()),
        (torch.sparse_csc, lambda t: t.to_sparse_csc()),
        (torch.sparse_csr, lambda t: t.to_sparse_csr()),
    )
    for candidate, convert in layout_converters:
        if layout == candidate:
            source = convert(source)
            break

    stream = io.BytesIO()
    torch.save(source, stream)
    stream.seek(0)
    payload = pybase64.b64encode(stream.getvalue())

    results = completion_renderer.load_prompt_embeds(payload)

    assert len(results) == 1
    decoded = results[0]["prompt_embeds"]
    assert decoded.device.type == "cpu"
    assert decoded.layout == torch.strided

    # Compare against the dense form so sparse inputs are checked by value.
    expected = source.to("cpu").to_dense()
    torch.testing.assert_close(decoded, expected, equal_nan=True)
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120


@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int, hidden_size: int):
    """Check that ``load_prompt_embeds`` refuses input when the feature is off.

    With ``enable_prompt_embeds`` set to ``False`` on the mocked config,
    passing a base64-encoded serialized tensor must raise ``ValueError`` whose
    message mentions ``--enable-prompt-embeds``.
    """
    config = Mock(spec=ModelConfig)
    config.enable_prompt_embeds = False
    completion_renderer = CompletionRenderer(config, tokenizer=None)

    # Serialize a small random tensor the same way a client payload would be
    # built: torch.save into memory, then base64-encode the raw bytes.
    with io.BytesIO() as stream:
        torch.save(torch.randn((seq_len, hidden_size), dtype=dtype), stream)
        payload = pybase64.b64encode(stream.getvalue())

    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
        completion_renderer.load_prompt_embeds(payload)