Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9665313c
Unverified
Commit
9665313c
authored
Apr 10, 2025
by
Russell Bryant
Committed by
GitHub
Apr 10, 2025
Browse files
[V1] Set structured output backend to `auto` by default (#15724)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
0c54fc72
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
68 deletions
+22
-68
tests/entrypoints/openai/test_chat.py
tests/entrypoints/openai/test_chat.py
+11
-63
vllm/config.py
vllm/config.py
+2
-2
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-3
vllm/model_executor/guided_decoding/__init__.py
vllm/model_executor/guided_decoding/__init__.py
+6
-0
No files found.
tests/entrypoints/openai/test_chat.py
View file @
9665313c
...
...
@@ -20,8 +20,6 @@ from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here
MODEL_NAME
=
"HuggingFaceH4/zephyr-7b-beta"
GUIDED_DECODING_BACKENDS
=
[
"outlines"
,
"lm-format-enforcer"
,
"xgrammar"
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
monkeypatch_module
():
...
...
@@ -487,20 +485,9 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
assert
last_completion_tokens
==
10
# NOTE: Not sure why, but when I place this after `test_guided_regex_chat`
# (i.e. using the same ordering as in the Completions API tests), the test
# will fail on the second `guided_decoding_backend` even when I swap their order
# (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
async
def
test_guided_choice_chat
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
guided_decoding_backend
:
str
,
sample_guided_choice
):
if
is_v1_server
and
guided_decoding_backend
!=
'xgrammar'
:
pytest
.
skip
(
"Only xgrammar backend is supported with V1"
)
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -515,8 +502,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.7
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
))
choice1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice1
in
sample_guided_choice
...
...
@@ -530,22 +516,16 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages
=
messages
,
max_completion_tokens
=
10
,
temperature
=
0.7
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
))
choice2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
choice2
in
sample_guided_choice
assert
choice1
!=
choice2
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
guided_decoding_backend
:
str
,
async
def
test_guided_json_chat
(
client
:
openai
.
AsyncOpenAI
,
sample_json_schema
):
if
is_v1_server
:
pytest
.
skip
(
"sample_json_schema has features unsupported in V1"
)
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -560,8 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_json
=
sample_json_schema
))
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
json1
=
json
.
loads
(
message
.
content
)
...
...
@@ -578,8 +557,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
1000
,
extra_body
=
dict
(
guided_json
=
sample_json_schema
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_json
=
sample_json_schema
))
message
=
chat_completion
.
choices
[
0
].
message
assert
message
.
content
is
not
None
json2
=
json
.
loads
(
message
.
content
)
...
...
@@ -589,13 +567,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, is_v1_server: bool,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
guided_decoding_backend
:
str
,
sample_regex
):
if
is_v1_server
and
guided_decoding_backend
!=
'xgrammar'
:
pytest
.
skip
(
"Only xgrammar backend is supported with V1"
)
async
def
test_guided_regex_chat
(
client
:
openai
.
AsyncOpenAI
,
sample_regex
):
messages
=
[{
"role"
:
"system"
,
...
...
@@ -610,8 +582,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_regex
=
sample_regex
))
ip1
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip1
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip1
)
is
not
None
...
...
@@ -622,8 +593,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model
=
MODEL_NAME
,
messages
=
messages
,
max_completion_tokens
=
20
,
extra_body
=
dict
(
guided_regex
=
sample_regex
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_regex
=
sample_regex
))
ip2
=
chat_completion
.
choices
[
0
].
message
.
content
assert
ip2
is
not
None
assert
re
.
fullmatch
(
sample_regex
,
ip2
)
is
not
None
...
...
@@ -652,15 +622,9 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
async
def
test_guided_choice_chat_logprobs
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
guided_decoding_backend
:
str
,
sample_guided_choice
):
if
is_v1_server
and
guided_decoding_backend
!=
'xgrammar'
:
pytest
.
skip
(
"Only xgrammar backend is supported with V1"
)
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -676,8 +640,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
max_completion_tokens
=
10
,
logprobs
=
True
,
top_logprobs
=
5
,
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
,
guided_decoding_backend
=
guided_decoding_backend
))
extra_body
=
dict
(
guided_choice
=
sample_guided_choice
))
assert
chat_completion
.
choices
[
0
].
logprobs
is
not
None
assert
chat_completion
.
choices
[
0
].
logprobs
.
content
is
not
None
...
...
@@ -689,14 +652,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
parametrize
(
"guided_decoding_backend"
,
GUIDED_DECODING_BACKENDS
)
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
guided_decoding_backend
:
str
,
sample_json_schema
):
if
is_v1_server
:
pytest
.
skip
(
"sample_json_schema has features unsupported on V1"
)
async
def
test_named_tool_use
(
client
:
openai
.
AsyncOpenAI
,
sample_json_schema
):
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
@@ -728,7 +684,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
"name"
:
"dummy_function_name"
}
},
extra_body
=
dict
(
guided_decoding_backend
=
guided_decoding_backend
)
)
)
message
=
chat_completion
.
choices
[
0
].
message
assert
len
(
message
.
content
)
==
0
json_string
=
message
.
tool_calls
[
0
].
function
.
arguments
...
...
@@ -763,7 +719,6 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, is_v1_server: bool,
"name"
:
"dummy_function_name"
}
},
extra_body
=
dict
(
guided_decoding_backend
=
guided_decoding_backend
),
stream
=
True
)
output
=
[]
...
...
@@ -888,7 +843,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
model
=
model_name
,
tools
=
tools
,
tool_choice
=
"required"
,
extra_body
=
dict
(
guided_decoding_backend
=
"outlines"
),
)
assert
chat_completion
.
choices
[
0
].
message
.
tool_calls
is
not
None
...
...
@@ -900,7 +854,6 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
model
=
model_name
,
tools
=
tools
,
tool_choice
=
"required"
,
extra_body
=
dict
(
guided_decoding_backend
=
"outlines"
),
stream
=
True
,
)
...
...
@@ -914,12 +867,7 @@ async def test_required_tool_use(client: openai.AsyncOpenAI,
@
pytest
.
mark
.
asyncio
async
def
test_inconsistent_tool_choice_and_tools
(
client
:
openai
.
AsyncOpenAI
,
is_v1_server
:
bool
,
sample_json_schema
):
if
is_v1_server
:
pytest
.
skip
(
"sample_json_schema has features unsupported on V1"
)
messages
=
[{
"role"
:
"system"
,
"content"
:
"you are a helpful assistant"
...
...
vllm/config.py
View file @
9665313c
...
...
@@ -2976,7 +2976,7 @@ class DecodingConfig:
# Which guided decoding algo to use.
# 'outlines' / 'lm-format-enforcer' / 'xgrammar'
guided_decoding_backend
:
str
=
'
xgrammar
'
guided_decoding_backend
:
str
=
"auto"
if
envs
.
VLLM_USE_V1
else
"
xgrammar
"
reasoning_backend
:
Optional
[
str
]
=
None
...
...
@@ -3001,7 +3001,7 @@ class DecodingConfig:
def
__post_init__
(
self
):
v0_valid_guided_backends
=
[
'outlines'
,
'lm-format-enforcer'
,
'xgrammar'
'outlines'
,
'lm-format-enforcer'
,
'xgrammar'
,
'auto'
]
v1_valid_guided_backends
=
[
'xgrammar'
,
'guidance'
,
'auto'
]
...
...
vllm/engine/arg_utils.py
View file @
9665313c
...
...
@@ -182,7 +182,7 @@ class EngineArgs:
enable_chunked_prefill
:
Optional
[
bool
]
=
None
disable_chunked_mm_input
:
bool
=
False
guided_decoding_backend
:
str
=
'xgrammar'
guided_decoding_backend
:
str
=
DecodingConfig
.
guided_decoding_backend
logits_processor_pattern
:
Optional
[
str
]
=
None
speculative_config
:
Optional
[
Dict
[
str
,
Any
]]
=
None
...
...
@@ -407,13 +407,13 @@ class EngineArgs:
parser
.
add_argument
(
'--guided-decoding-backend'
,
type
=
str
,
default
=
'xgrammar'
,
default
=
DecodingConfig
.
guided_decoding_backend
,
help
=
'Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support '
'https://github.com/mlc-ai/xgrammar and '
'https://github.com/guidance-ai/llguidance.'
'Valid backend values are "xgrammar", "guidance", and "auto". '
'With "auto", we will make opinionated choices based on request'
'With "auto", we will make opinionated choices based on request
'
'contents and what the backend libraries currently support, so '
'the behavior is subject to change in each release.'
)
parser
.
add_argument
(
...
...
vllm/model_executor/guided_decoding/__init__.py
View file @
9665313c
...
...
@@ -33,6 +33,12 @@ def maybe_backend_fallback(
logger
.
warning
(
"%s Falling back to use %s instead."
,
message
,
fallback
)
guided_params
.
backend
=
fallback
# `auto` was added for V1 to explicitly declare a mode that has fallbacks
# in place. If that is specified with V0, treat it as `xgrammar`, as we have
# fallbacks enabled for that and it is the V0 default.
if
guided_params
.
backend
==
"auto"
:
guided_params
.
backend
=
"xgrammar"
# lm-format-enforce doesn't support grammar, fallback to xgrammar
if
guided_params
.
backend_name
==
"lm-format-enforcer"
:
if
guided_params
.
grammar
is
not
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment