test_chat_completions.py 6.64 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
6
import openai
import pytest

7
8
from .utils import (
    MESSAGES_WITHOUT_TOOLS,
9
    SEED,
10
11
12
13
    WEATHER_TOOL,
    ServerConfig,
    ensure_system_prompt,
)
14
15
16
17
18
19


# test: make sure chat completions without tools provided work even when tools
# are enabled. This makes sure tool call chat templates work, AND that the tool
# parser stream processing doesn't change the output of the model.
@pytest.mark.asyncio
async def test_chat_completion_without_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Check a tool-free chat request in both non-streaming and streaming mode.

    The request carries no ``tools`` argument. The non-streaming response must
    contain non-empty text and no tool calls, and the streamed response must
    reassemble to exactly the same text with a single, matching finish reason.

    Args:
        client: Async OpenAI client used to issue the requests.
        server_config: Server configuration forwarded to
            ``ensure_system_prompt`` when building the message list.
    """
    available_models = await client.models.list()
    model_name: str = available_models.data[0].id

    # --- non-streaming request -------------------------------------------
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        max_completion_tokens=150,
        temperature=0,
        seed=SEED,
        logprobs=False,
    )

    choice = chat_completion.choices[0]
    stop_reason = choice.finish_reason
    output_text = choice.message.content

    # the model must have answered with plain text ...
    assert output_text is not None
    assert output_text != ""
    assert stop_reason != "tool_calls"

    # ... and must not have produced any tool call
    assert not choice.message.tool_calls

    # --- identical request, streamed -------------------------------------
    stream = await client.chat.completions.create(
        model=model_name,
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        max_completion_tokens=150,
        temperature=0,
        seed=SEED,
        logprobs=False,
        stream=True,
    )

    role_sent: bool = False
    finish_reason_count = 0
    chunks: list[str] = []

    # walk the stream, collecting text and validating each chunk as it arrives
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta

        # the role may be announced only once, and it has to be "assistant"
        if delta.role:
            assert not role_sent
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        # whenever a finish reason shows up it must agree with non-streaming
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1
            assert streamed_choice.finish_reason == choice.finish_reason

        # no tool call fragments may appear in the stream
        assert not delta.tool_calls

    # the role was announced, exactly one finish reason arrived, some text was
    # streamed, and the reassembled text equals the non-streaming output
    assert role_sent
    assert finish_reason_count == 1
    assert chunks
    assert "".join(chunks) == output_text


# test: conversation with tools enabled and provided that should not invoke
# tools, to make sure we can still get normal chat completion responses
# and that they won't be parsed as tools
@pytest.mark.asyncio
async def test_chat_completion_with_tools(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """Check that offering a tool does not turn a normal answer into a tool call.

    The request passes ``tools=[WEATHER_TOOL]`` together with
    ``MESSAGES_WITHOUT_TOOLS``. The non-streaming response must be non-empty
    text with no tool calls, and the streamed response must reassemble to
    exactly the same text with a single, matching finish reason.

    Args:
        client: Async OpenAI client used to issue the requests.
        server_config: Server configuration forwarded to
            ``ensure_system_prompt`` when building the message list.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id

    # non-streaming request with the tool offered
    chat_completion = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        tools=[WEATHER_TOOL],
        logprobs=False,
        seed=SEED,
    )
    choice = chat_completion.choices[0]
    stop_reason = choice.finish_reason
    output_text = choice.message.content

    # check to make sure we got text
    assert output_text is not None
    assert stop_reason != "tool_calls"
    assert len(output_text) > 0

    # check to make sure no tool calls were returned
    assert not choice.message.tool_calls

    # make the same request, streaming
    stream = await client.chat.completions.create(
        messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config),
        temperature=0,
        max_completion_tokens=150,
        model=model_name,
        logprobs=False,
        tools=[WEATHER_TOOL],
        seed=SEED,
        stream=True,
    )

    chunks: list[str] = []
    finish_reason_count = 0
    role_sent: bool = False

    # assemble streamed chunks
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta

        # make sure the role is assistant
        if delta.role:
            assert delta.role == "assistant"
            role_sent = True

        if delta.content:
            chunks.append(delta.content)

        # Validate the finish reason on the chunk that actually reports it,
        # instead of reading the loop variable after the loop and assuming
        # the final chunk is the one carrying it.
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1
            assert streamed_choice.finish_reason == stop_reason
            assert streamed_choice.finish_reason != "tool_calls"

        # make sure tool call chunks aren't being streamed
        assert not delta.tool_calls

    # make sure the role was sent, only 1 finish reason was sent, that chunks
    # were in fact sent, and that the chunks match non-streaming
    assert role_sent
    assert finish_reason_count == 1
    assert chunks
    assert "".join(chunks) == output_text
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200


# Regression test for https://github.com/vllm-project/vllm/issues/32006
# Engine crash when combining response_format: json_object with
# tool_choice: required
@pytest.mark.asyncio
@pytest.mark.timeout(120)
async def test_response_format_with_tool_choice_required(
    client: openai.AsyncOpenAI, server_config: ServerConfig
):
    """
    Ensure ``response_format: json_object`` plus ``tool_choice: required``
    completes and yields tool calls instead of crashing the engine.

    Prior to the fix the request was rejected with the validation error
    "You can only use one kind of structured outputs constraint but multiple
    are specified", because StructuredOutputsParams ended up with both
    json_object and json (derived from the tool schema) set.
    """
    listed_models = await client.models.list()
    model_name: str = listed_models.data[0].id

    user_messages = [
        {"role": "user", "content": "What is the weather in Dallas, Texas?"}
    ]

    # Forced tool calling combined with a JSON response format: the
    # combination that used to bring the engine down.
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=ensure_system_prompt(user_messages, server_config),
        tools=[WEATHER_TOOL],
        tool_choice="required",
        response_format={"type": "json_object"},
        max_completion_tokens=150,
        temperature=0,
    )

    # With the fix, response_format is cleared when tool_choice forces tool
    # calling, so the request finishes normally and returns tool calls.
    first_choice = chat_completion.choices[0]
    returned_tool_calls = first_choice.message.tool_calls
    assert first_choice.finish_reason == "tool_calls"
    assert returned_tool_calls is not None
    assert len(returned_tool_calls) > 0