# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import openai
import pytest

7
8
9
10
11
12
from .utils import (
    MESSAGES_WITHOUT_TOOLS,
    WEATHER_TOOL,
    ServerConfig,
    ensure_system_prompt,
)


# test: make sure chat completions work when no tools are provided in the
# request, even when tools are enabled. This makes sure tool call chat
# templates work, AND that the tool parser stream processing doesn't change
# the output of the model.
@pytest.mark.asyncio
async def test_chat_completion_without_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """
    Check a chat completion sent without ``tools``, non-streaming then
    streaming, and require that both responses agree.

    Asserts that text is returned, that no tool calls appear in the message
    or in any streamed delta, that the assistant role is streamed exactly
    once, that exactly one finish reason is streamed and equals the
    non-streaming one, and that the joined streamed content equals the
    non-streaming text.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
    )
    choice = chat_completion.choices[0]
    stop_reason = choice.finish_reason
    output_text = choice.message.content

    # check to make sure we got text
    assert output_text is not None
    assert len(output_text) > 0
    assert stop_reason != "tool_calls"

    # check to make sure no tool calls were returned (None or empty list)
    assert not choice.message.tool_calls

    # make the same request, streaming
    stream = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    # assemble streamed chunks
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta

        # make sure the role is assistant, and that it is only sent once
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        # the streamed finish reason must match the non-streaming one
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1
            assert streamed_choice.finish_reason == stop_reason

        # make sure tool call chunks aren't being streamed
        assert not delta.tool_calls

    # make sure the role was sent, only 1 finish reason was sent, that chunks
    # were in fact sent, and that the chunks match non-streaming
    assert role_sent
    assert finish_reason_count == 1
    assert len(chunks)
    assert "".join(chunks) == output_text


# test: conversation with tools enabled and provided that should not invoke
# tools, to make sure we can still get normal chat completion responses
# and that they won't be parsed as tools
@pytest.mark.asyncio
async def test_chat_completion_with_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """
    Check a chat completion that offers ``WEATHER_TOOL`` but uses the
    ``MESSAGES_WITHOUT_TOOLS`` conversation, non-streaming then streaming.

    Asserts that text is returned, that the response is not reported or
    parsed as a tool call (in the message or in any streamed delta), that
    the assistant role is streamed exactly once, that exactly one finish
    reason is streamed and equals the non-streaming one, and that the joined
    streamed content equals the non-streaming text.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        tools=[WEATHER_TOOL],
        logprobs=False,
    )
    choice = chat_completion.choices[0]
    stop_reason = choice.finish_reason
    output_text = choice.message.content

    # check to make sure we got text
    assert output_text is not None
    assert stop_reason != "tool_calls"
    assert len(output_text) > 0

    # check to make sure no tool calls were returned (None or empty list)
    assert not choice.message.tool_calls

    # make the same request, streaming
    stream = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
        tools=[WEATHER_TOOL],
        stream=True,
    )

    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    # assemble streamed chunks
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta

        # make sure the role is assistant, and that it is only sent once
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        # check the finish reason on the chunk that carries it instead of
        # relying on the loop variable after the loop has ended
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1
            assert streamed_choice.finish_reason == stop_reason
            assert streamed_choice.finish_reason != "tool_calls"

        # make sure tool call chunks aren't being streamed
        assert not delta.tool_calls

    # make sure the role was sent, only 1 finish reason was sent, that chunks
    # were in fact sent, and that the chunks match non-streaming
    assert role_sent
    assert finish_reason_count == 1
    assert len(chunks)
    assert "".join(chunks) == output_text


# Regression test for https://github.com/vllm-project/vllm/issues/32006
# Engine crash when combining response_format: json_object with
# tool_choice: required
@pytest.mark.asyncio
@pytest.mark.timeout(120)
async def test_response_format_with_tool_choice_required(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """
    Send one request that sets both response_format: json_object and
    tool_choice: required, and expect it to finish with tool calls.

    This combination used to crash the engine with the validation error
    "You can only use one kind of structured outputs constraint but multiple
    are specified", because json_object and json (derived from the tool
    schema) were both set in StructuredOutputsParams.
    """
    available_models = await client.models.list()
    served_model: str = available_models.data[0].id

    # The request that previously crashed the engine.
    weather_question = [
        {"role": "user", "content": "What is the weather in Dallas, Texas?"}
    ]
    response = await client.chat.completions.create(
        messages=ensure_system_prompt(weather_question, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=served_model,
        tools=[WEATHER_TOOL],
        tool_choice="required",
        response_format={"type": "json_object"},
    )

    # With the fix, response_format is cleared when tool_choice forces tool
    # calling, so the request must succeed and come back as a tool call.
    first_choice = response.choices[0]
    returned_tool_calls = first_choice.message.tool_calls
    assert first_choice.finish_reason == "tool_calls"
    assert returned_tool_calls is not None
    assert len(returned_tool_calls) > 0