test_prompt_validation.py 4.69 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import io
5
from unittest.mock import Mock
zhuwenwen's avatar
zhuwenwen committed
6
import os
7
# imports for structured outputs tests
8
import openai
9
import pybase64
10
import pytest
11
import regex as re
12
13
import torch

14
15
from vllm.config import ModelConfig
from vllm.entrypoints.renderer import CompletionRenderer
16

17
from ...utils import RemoteOpenAIServer, models_path_prefix
18
19


20
21
22
23
24
@pytest.fixture(autouse=True, scope="function")
def use_v1_only(monkeypatch):
    """Set ``VLLM_USE_V1=1`` in the environment for each test function.

    The fixture is ``autouse`` with function scope, so it runs before every
    test without being requested explicitly. The variable is set through
    pytest's ``monkeypatch`` fixture rather than ``os.environ`` directly.
    """
    monkeypatch.setenv(name="VLLM_USE_V1", value="1")


25
26
@pytest.mark.asyncio
async def test_empty_prompt():
    """Check that a completion request without a usable prompt is rejected.

    Starts a ``RemoteOpenAIServer`` for the ``gpt2`` model located under
    ``models_path_prefix`` and sends a request whose ``prompt`` is the empty
    string and whose ``prompt_embeds`` (passed via ``extra_body``) is an empty
    list. The request is expected to raise ``openai.BadRequestError`` with a
    message saying that one of the two must be provided and non-empty.
    """
    model_name = os.path.join(models_path_prefix, "gpt2")
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

        with pytest.raises(
                openai.BadRequestError,
                match=
                "Either prompt or prompt_embeds must be provided and non-empty."
        ):
            # Both inputs are present but empty, so neither can be used.
            await client.completions.create(model=model_name,
                                            prompt="",
                                            max_tokens=5,
                                            temperature=0.0,
                                            extra_body={"prompt_embeds": []})
42
43
44
45
46
47
48
49
50
51


@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """Check that a prompt containing an out-of-vocabulary token id is rejected.

    Starts a ``RemoteOpenAIServer`` for ``gpt2`` and sends a completion
    request whose prompt is the single token id ``999999``. The request is
    expected to raise ``openai.BadRequestError`` with a message containing
    "out of vocabulary".
    """
    # Resolve the model under models_path_prefix, as the other server-based
    # test in this module does, instead of relying on the bare "gpt2" name.
    model_name = os.path.join(models_path_prefix, "gpt2")
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()

        # pytest.raises takes the pattern as a plain string, so there is no
        # need to compile it first and read the pattern back.
        with pytest.raises(openai.BadRequestError,
                           match=r".*out of vocabulary.*"):
            await client.completions.create(model=model_name,
                                            prompt=[999999],
                                            max_tokens=5,
                                            temperature=0.0)
57
58


59
60
61
62
63
64
65
66
67
@pytest.mark.parametrize("dtype",
                         [torch.float32, torch.bfloat16, torch.float16])
@pytest.mark.parametrize(
    "layout",
    [torch.strided, torch.sparse_coo, torch.sparse_csc, torch.sparse_csr])
@pytest.mark.parametrize("seq_len", [2, 10])
@pytest.mark.parametrize("hidden_size", [2, 10])
def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
                            seq_len: int, hidden_size: int):
    """Round-trip a serialized tensor through ``load_prompt_embeds``.

    A random ``(seq_len, hidden_size)`` tensor of the given ``dtype`` is
    converted to the requested ``layout``, serialized with ``torch.save`` and
    base64-encoded. The renderer must return exactly one entry whose
    ``"prompt_embeds"`` tensor is on the CPU, uses the strided layout, and is
    numerically equal to the dense form of the input.
    """
    model_config = Mock(spec=ModelConfig)
    model_config.enable_prompt_embeds = True
    renderer = CompletionRenderer(model_config, tokenizer=None)

    # Construct arbitrary tensors of various dtypes, layouts, and sizes.
    # Different layouts are covered because a user may send sparse tensors to
    # reduce the transmission size of prompt embeddings; those must come back
    # as dense/strided tensors before being passed into the engine.
    # Only CPU tensors are used: this avoids preemptively initializing CUDA
    # (which would break other tests in the suite that fork processes) and
    # keeps the test independent of the devices available in the environment.
    tensor = torch.randn((seq_len, hidden_size), dtype=dtype)
    if layout == torch.strided:
        tensor = tensor.contiguous()
    elif layout == torch.sparse_coo:
        tensor = tensor.to_sparse_coo()
    elif layout == torch.sparse_csc:
        tensor = tensor.to_sparse_csc()
    elif layout == torch.sparse_csr:
        tensor = tensor.to_sparse_csr()

    # Serialize the tensor in memory and base64-encode the resulting bytes.
    buffer = io.BytesIO()
    torch.save(tensor, buffer)
    buffer.seek(0)
    encoded_tensor = pybase64.b64encode(buffer.getvalue())

    loaded_prompt_embeds = renderer.load_prompt_embeds(encoded_tensor)

    assert len(loaded_prompt_embeds) == 1
    loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
    assert loaded_tensor.device.type == "cpu"
    assert loaded_tensor.layout == torch.strided
    # Compare against the dense form so sparse inputs are checked by value.
    torch.testing.assert_close(loaded_tensor,
                               tensor.to("cpu").to_dense(),
                               equal_nan=True)
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
    

@pytest.mark.parametrize("dtype", [torch.float32])
@pytest.mark.parametrize("seq_len", [2])
@pytest.mark.parametrize("hidden_size", [2])
def test_disable_prompt_embeds(dtype: torch.dtype, seq_len: int,
                               hidden_size: int):
    """Check that loading prompt embeddings fails when the option is off.

    With ``enable_prompt_embeds`` set to ``False`` on the mocked model
    config, ``load_prompt_embeds`` is expected to raise a ``ValueError``
    whose message mentions ``--enable-prompt-embeds``.
    """
    disabled_config = Mock(spec=ModelConfig)
    disabled_config.enable_prompt_embeds = False
    renderer = CompletionRenderer(disabled_config, tokenizer=None)

    # Build a valid payload: a dense random tensor, serialized in memory and
    # base64-encoded.
    embeds = torch.randn((seq_len, hidden_size), dtype=dtype)
    stream = io.BytesIO()
    torch.save(embeds, stream)
    stream.seek(0)
    payload = pybase64.b64encode(stream.getvalue())

    with pytest.raises(ValueError, match="--enable-prompt-embeds"):
        renderer.load_prompt_embeds(payload)