Unverified Commit 29283e89 authored by Aaron Pham's avatar Aaron Pham Committed by GitHub
Browse files

[Chore] Cleanup guided namespace, move to structured outputs config (#22772)


Signed-off-by: default avatarAaron Pham <contact@aarnphm.xyz>
Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 05b044e6
...@@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then ...@@ -167,12 +167,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
--ignore=entrypoints/llm/test_prompt_validation.py "} --ignore=entrypoints/llm/test_prompt_validation.py "}
fi fi
#Obsolete currently
##ignore certain Entrypoints/llm tests
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
# commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
#fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \ # --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \ # --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py # --ignore=entrypoints/openai/test_oot_registration.py
......
...@@ -108,8 +108,7 @@ steps: ...@@ -108,8 +108,7 @@ steps:
- tests/entrypoints/offline_mode - tests/entrypoints/offline_mode
commands: commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
......
...@@ -171,7 +171,7 @@ pull_request_rules: ...@@ -171,7 +171,7 @@ pull_request_rules:
- files=examples/online_serving/openai_chat_completion_structured_outputs.py - files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^tests/v1/structured_output/ - files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
- files~=^vllm/v1/structured_output/ - files~=^vllm/v1/structured_output/
actions: actions:
label: label:
......
...@@ -696,11 +696,11 @@ def evaluate(ret, args): ...@@ -696,11 +696,11 @@ def evaluate(ret, args):
return re.match(args.regex, actual) is not None return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual): def _eval_correctness(expected, actual):
if args.structure_type == "guided_json": if args.structure_type == "json":
return _eval_correctness_json(expected, actual) return _eval_correctness_json(expected, actual)
elif args.structure_type == "guided_regex": elif args.structure_type == "regex":
return _eval_correctness_regex(expected, actual) return _eval_correctness_regex(expected, actual)
elif args.structure_type == "guided_choice": elif args.structure_type == "choice":
return _eval_correctness_choice(expected, actual) return _eval_correctness_choice(expected, actual)
else: else:
return None return None
...@@ -780,18 +780,18 @@ def main(args: argparse.Namespace): ...@@ -780,18 +780,18 @@ def main(args: argparse.Namespace):
) )
if args.dataset == "grammar": if args.dataset == "grammar":
args.structure_type = "guided_grammar" args.structure_type = "grammar"
elif args.dataset == "regex": elif args.dataset == "regex":
args.structure_type = "guided_regex" args.structure_type = "regex"
elif args.dataset == "choice": elif args.dataset == "choice":
args.structure_type = "guided_choice" args.structure_type = "choice"
else: else:
args.structure_type = "guided_json" args.structure_type = "json"
if args.no_structured_output: if args.no_structured_output:
args.structured_output_ratio = 0 args.structured_output_ratio = 0
if args.save_results: if args.save_results:
result_file_name = f"{args.structured_output_ratio}guided" result_file_name = f"{args.structured_output_ratio}so"
result_file_name += f"_{backend}" result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps" result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}" result_file_name += f"_{args.model.split('/')[-1]}"
......
...@@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes. ...@@ -14,7 +14,7 @@ API documentation for vLLM's configuration classes.
- [vllm.config.LoRAConfig][] - [vllm.config.LoRAConfig][]
- [vllm.config.MultiModalConfig][] - [vllm.config.MultiModalConfig][]
- [vllm.config.PoolerConfig][] - [vllm.config.PoolerConfig][]
- [vllm.config.DecodingConfig][] - [vllm.config.StructuredOutputsConfig][]
- [vllm.config.ObservabilityConfig][] - [vllm.config.ObservabilityConfig][]
- [vllm.config.KVTransferConfig][] - [vllm.config.KVTransferConfig][]
- [vllm.config.CompilationConfig][] - [vllm.config.CompilationConfig][]
......
...@@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models: ...@@ -10,12 +10,12 @@ vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | Tool Calling | | Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------|-------------| |--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ | | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `json`, `regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `json`, `regex` | ✅ |
| [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ | | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
| [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ | | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `json`, `regex` | ✅ |
| [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `guided_json`, `guided_regex` | ✅ | | [Hunyuan A13B series](https://huggingface.co/collections/tencent/hunyuan-a13b-685ec38e5b46321e3ea7c4be) | `hunyuan_a13b` | `json`, `regex` | ✅ |
| [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `guided_json`, `guided_regex` | ✅ | | [GLM-4.5 series](https://huggingface.co/collections/zai-org/glm-45-687c621d34bda8c9e4bf503b) | `glm45` | `json`, `regex` | ✅ |
!!! note !!! note
IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`. IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
......
...@@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla ...@@ -12,23 +12,23 @@ You can generate structured outputs using the OpenAI's [Completions](https://pla
The following parameters are supported, which must be added as extra parameters: The following parameters are supported, which must be added as extra parameters:
- `guided_choice`: the output will be exactly one of the choices. - `choice`: the output will be exactly one of the choices.
- `guided_regex`: the output will follow the regex pattern. - `regex`: the output will follow the regex pattern.
- `guided_json`: the output will follow the JSON schema. - `json`: the output will follow the JSON schema.
- `guided_grammar`: the output will follow the context free grammar. - `grammar`: the output will follow the context free grammar.
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text. - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page. You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.
Structured outputs are supported by default in the OpenAI-Compatible Server. You Structured outputs are supported by default in the OpenAI-Compatible Server. You
may choose to specify the backend to use by setting the may choose to specify the backend to use by setting the
`--guided-decoding-backend` flag to `vllm serve`. The default backend is `auto`, `--structured-outputs-config.backend` flag to `vllm serve`. The default backend is `auto`,
which will try to choose an appropriate backend based on the details of the which will try to choose an appropriate backend based on the details of the
request. You may also choose a specific backend, along with request. You may also choose a specific backend, along with
some options. A full set of options is available in the `vllm serve --help` some options. A full set of options is available in the `vllm serve --help`
text. text.
Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: Now let´s see an example for each of the cases, starting with the `choice`, as it´s the easiest one:
??? code ??? code
...@@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic ...@@ -45,12 +45,12 @@ Now let´s see an example for each of the cases, starting with the `guided_choic
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
], ],
extra_body={"guided_choice": ["positive", "negative"]}, extra_body={"structured_outputs": {"choice": ["positive", "negative"]}},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template: The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template:
??? code ??? code
...@@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an ...@@ -63,18 +63,18 @@ The next example shows how to use the `guided_regex`. The idea is to generate an
"content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n",
} }
], ],
extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]}, extra_body={"structured_outputs": {"regex": r"\w+@\w+\.com\n"}, "stop": ["\n"]},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats.
For this we can use the `guided_json` parameter in two different ways: For this we can use the `json` parameter in two different ways:
- Using directly a [JSON Schema](https://json-schema.org/) - Using directly a [JSON Schema](https://json-schema.org/)
- Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option). - Defining a [Pydantic model](https://docs.pydantic.dev/latest/) and then extracting the JSON Schema from it (which is normally an easier option).
The next example shows how to use the `guided_json` parameter with a Pydantic model: The next example shows how to use the `response_format` parameter with a Pydantic model:
??? code ??? code
...@@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo ...@@ -119,7 +119,7 @@ The next example shows how to use the `guided_json` parameter with a Pydantic mo
JSON schema and how the fields should be populated. This can improve the JSON schema and how the fields should be populated. This can improve the
results notably in most cases. results notably in most cases.
Finally we have the `guided_grammar` option, which is probably the most Finally we have the `grammar` option, which is probably the most
difficult to use, but it's really powerful. It allows us to define complete difficult to use, but it's really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar. languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries: As an example, we can use it to define a specific format of simplified SQL queries:
...@@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries: ...@@ -149,7 +149,7 @@ As an example, we can use to define a specific format of simplified SQL queries:
"content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
} }
], ],
extra_body={"guided_grammar": simplified_sql_grammar}, extra_body={"structured_outputs": {"grammar": simplified_sql_grammar}},
) )
print(completion.choices[0].message.content) print(completion.choices[0].message.content)
``` ```
...@@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online ...@@ -292,8 +292,8 @@ An example of using `structural_tag` can be found here: <gh-file:examples/online
## Offline Inference ## Offline Inference
Offline inference allows for the same types of structured outputs. Offline inference allows for the same types of structured outputs.
To use it, we´ll need to configure the guided decoding using the class `GuidedDecodingParams` inside `SamplingParams`. To use it, we´ll need to configure the structured outputs using the class `StructuredOutputsParams` inside `SamplingParams`.
The main available options inside `GuidedDecodingParams` are: The main available options inside `StructuredOutputsParams` are:
- `json` - `json`
- `regex` - `regex`
...@@ -309,12 +309,12 @@ shown below: ...@@ -309,12 +309,12 @@ shown below:
```python ```python
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams from vllm.sampling_params import StructuredOutputsParams
llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) structured_outputs_params = StructuredOutputsParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params) sampling_params = SamplingParams(structured_outputs=structured_outputs_params)
outputs = llm.generate( outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!", prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params, sampling_params=sampling_params,
......
...@@ -71,7 +71,7 @@ This example demonstrates: ...@@ -71,7 +71,7 @@ This example demonstrates:
* Making a request with `tool_choice="auto"` * Making a request with `tool_choice="auto"`
* Handling the structured response and executing the corresponding function * Handling the structured response and executing the corresponding function
You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the structured outputs backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests.
Remember that it's the caller's responsibility to: Remember that it's the caller's responsibility to:
...@@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci ...@@ -83,19 +83,18 @@ For more advanced usage, including parallel tool calls and different model-speci
## Named Function Calling ## Named Function Calling
vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is vLLM supports named function calling in the chat completion API by default. This should work with most structured outputs backends supported by vLLM. You are guaranteed a validly-parsable function call - not a
enabled by default and will work with any supported model. You are guaranteed a validly-parsable function call - not a
high-quality one. high-quality one.
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. vLLM will use structured outputs to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the structured outputs backend.
To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
## Required Function Calling ## Required Function Calling
vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The guided decoding features for `tool_choice='required'` (such as JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses structured outputs, so this is enabled by default and will work with any supported model. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine.
When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
......
...@@ -133,7 +133,7 @@ completion = client.chat.completions.create( ...@@ -133,7 +133,7 @@ completion = client.chat.completions.create(
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
], ],
extra_body={ extra_body={
"guided_choice": ["positive", "negative"] "structured_outputs": {"choice": ["positive", "negative"]}
} }
) )
``` ```
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This file demonstrates the example usage of guided decoding This file demonstrates the example usage of structured outputs
to generate structured outputs using vLLM. It shows how to apply in vLLM. It shows how to apply different constraints such as choice,
different guided decoding techniques such as Choice, Regex, JSON schema, regex, json schema, and grammar to produce structured and formatted
and Grammar to produce structured and formatted results results based on specific prompts.
based on specific prompts.
""" """
from enum import Enum from enum import Enum
...@@ -13,19 +12,23 @@ from enum import Enum ...@@ -13,19 +12,23 @@ from enum import Enum
from pydantic import BaseModel from pydantic import BaseModel
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams from vllm.sampling_params import StructuredOutputsParams
MAX_TOKENS = 50 MAX_TOKENS = 50
# Guided decoding by Choice (list of possible options) # Structured outputs by Choice (list of possible options)
guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"]) structured_outputs_params_choice = StructuredOutputsParams(
sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice) choice=["Positive", "Negative"]
)
sampling_params_choice = SamplingParams(
structured_outputs=structured_outputs_params_choice
)
prompt_choice = "Classify this sentiment: vLLM is wonderful!" prompt_choice = "Classify this sentiment: vLLM is wonderful!"
# Guided decoding by Regex # Structured outputs by Regex
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n") structured_outputs_params_regex = StructuredOutputsParams(regex=r"\w+@\w+\.com\n")
sampling_params_regex = SamplingParams( sampling_params_regex = SamplingParams(
guided_decoding=guided_decoding_params_regex, structured_outputs=structured_outputs_params_regex,
stop=["\n"], stop=["\n"],
max_tokens=MAX_TOKENS, max_tokens=MAX_TOKENS,
) )
...@@ -36,7 +39,7 @@ prompt_regex = ( ...@@ -36,7 +39,7 @@ prompt_regex = (
) )
# Guided decoding by JSON using Pydantic schema # Structured outputs by JSON using Pydantic schema
class CarType(str, Enum): class CarType(str, Enum):
sedan = "sedan" sedan = "sedan"
suv = "SUV" suv = "SUV"
...@@ -51,17 +54,16 @@ class CarDescription(BaseModel): ...@@ -51,17 +54,16 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema() json_schema = CarDescription.model_json_schema()
guided_decoding_params_json = GuidedDecodingParams(json=json_schema) structured_outputs_params_json = StructuredOutputsParams(json=json_schema)
sampling_params_json = SamplingParams( sampling_params_json = SamplingParams(
guided_decoding=guided_decoding_params_json, structured_outputs=structured_outputs_params_json, max_tokens=MAX_TOKENS
max_tokens=MAX_TOKENS,
) )
prompt_json = ( prompt_json = (
"Generate a JSON with the brand, model and car_type of" "Generate a JSON with the brand, model and car_type of "
"the most iconic car from the 90's" "the most iconic car from the 90's"
) )
# Guided decoding by Grammar # Structured outputs by Grammar
simplified_sql_grammar = """ simplified_sql_grammar = """
root ::= select_statement root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition select_statement ::= "SELECT " column " from " table " where " condition
...@@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 " ...@@ -70,13 +72,15 @@ table ::= "table_1 " | "table_2 "
condition ::= column "= " number condition ::= column "= " number
number ::= "1 " | "2 " number ::= "1 " | "2 "
""" """
guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar) structured_outputs_params_grammar = StructuredOutputsParams(
grammar=simplified_sql_grammar
)
sampling_params_grammar = SamplingParams( sampling_params_grammar = SamplingParams(
guided_decoding=guided_decoding_params_grammar, structured_outputs=structured_outputs_params_grammar,
max_tokens=MAX_TOKENS, max_tokens=MAX_TOKENS,
) )
prompt_grammar = ( prompt_grammar = (
"Generate an SQL query to show the 'username' and 'email'from the 'users' table." "Generate an SQL query to show the 'username' and 'email' from the 'users' table."
) )
...@@ -93,16 +97,16 @@ def main(): ...@@ -93,16 +97,16 @@ def main():
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
choice_output = generate_output(prompt_choice, sampling_params_choice, llm) choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
format_output("Guided decoding by Choice", choice_output) format_output("Structured outputs by Choice", choice_output)
regex_output = generate_output(prompt_regex, sampling_params_regex, llm) regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
format_output("Guided decoding by Regex", regex_output) format_output("Structured outputs by Regex", regex_output)
json_output = generate_output(prompt_json, sampling_params_json, llm) json_output = generate_output(prompt_json, sampling_params_json, llm)
format_output("Guided decoding by JSON", json_output) format_output("Structured outputs by JSON", json_output)
grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm) grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
format_output("Guided decoding by Grammar", grammar_output) format_output("Structured outputs by Grammar", grammar_output)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -6,7 +6,7 @@ without any specific flags: ...@@ -6,7 +6,7 @@ without any specific flags:
```bash ```bash
VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \ VLLM_USE_V1=0 vllm serve unsloth/Llama-3.2-1B-Instruct \
--guided-decoding-backend outlines --structured-outputs-config.backend outlines
``` ```
This example demonstrates how to generate chat completions This example demonstrates how to generate chat completions
......
...@@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { ...@@ -86,7 +86,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
"content": "Classify this sentiment: vLLM is wonderful!", "content": "Classify this sentiment: vLLM is wonderful!",
} }
], ],
"extra_body": {"guided_choice": ["positive", "negative"]}, "extra_body": {"structured_outputs": {"choice": ["positive", "negative"]}},
}, },
"regex": { "regex": {
"messages": [ "messages": [
...@@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { ...@@ -96,7 +96,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
} }
], ],
"extra_body": { "extra_body": {
"guided_regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n", "structured_outputs": {"regex": r"[a-z0-9.]{1,20}@\w{6,10}\.com\n"},
}, },
}, },
"json": { "json": {
...@@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = { ...@@ -122,7 +122,8 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
} }
], ],
"extra_body": { "extra_body": {
"guided_grammar": """ "structured_outputs": {
"grammar": """
root ::= select_statement root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition select_statement ::= "SELECT " column " from " table " where " condition
...@@ -135,6 +136,7 @@ condition ::= column "= " number ...@@ -135,6 +136,7 @@ condition ::= column "= " number
number ::= "1 " | "2 " number ::= "1 " | "2 "
""", """,
}
}, },
}, },
"structural_tag": { "structural_tag": {
......
...@@ -184,7 +184,7 @@ def sample_enum_json_schema(): ...@@ -184,7 +184,7 @@ def sample_enum_json_schema():
@pytest.fixture @pytest.fixture
def sample_guided_choice(): def sample_structured_outputs_choices():
return [ return [
"Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
"Ruby", "Swift", "Kotlin" "Ruby", "Swift", "Kotlin"
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
from contextlib import nullcontext
from vllm_test_utils import BlameResult, blame
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.sampling_params import GuidedDecodingParams
def run_normal():
    """Run a baseline generation pass and print every prompt/completion pair.

    The LLM is built without any guided decoding option, used for a single
    `generate` call over four fixed prompts, and then torn down again.
    """
    baseline_prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    baseline_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Baseline engine: no guided decoding configured.
    engine = LLM(
        model="distilbert/distilgpt2",
        enforce_eager=True,
        gpu_memory_utilization=0.3,
    )
    for result in engine.generate(baseline_prompts, baseline_params):
        shown_prompt = result.prompt
        shown_text = result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, Generated text: {shown_text!r}")

    # Drop the engine and release its resources before the next run.
    del engine
    cleanup_dist_env_and_memory()
def run_xgrammar(sample_regex):
    """Generate regex-constrained text with the xgrammar backend and print it.

    Args:
        sample_regex: Regex pattern used both inside the prompt text and as
            the guided decoding constraint.
    """
    # Engine with guided decoding enabled through the xgrammar backend.
    engine = LLM(
        model="distilbert/distilgpt2",
        enforce_eager=True,
        guided_decoding_backend="xgrammar",
        gpu_memory_utilization=0.3,
    )

    request = f"Give an example IPv4 address with this regex: {sample_regex}"
    regex_constraint = GuidedDecodingParams(regex=sample_regex)
    constrained_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        guided_decoding=regex_constraint,
    )

    # The same prompt is submitted twice in one batch.
    results = engine.generate(
        prompts=[request, request],
        sampling_params=constrained_params,
        use_tqdm=True,
    )
    for result in results:
        shown_prompt = result.prompt
        shown_text = result.outputs[0].text
        print(f"Prompt: {shown_prompt!r}, Generated text: {shown_text!r}")
def test_lazy_outlines(sample_regex):
    """Check that `outlines` is never imported by the baseline or xgrammar runs.

    Users who do not use outlines-based guided decoding should not pay for
    importing it.
    """
    watched_module = "outlines"

    # CI only checks at the end whether the module got imported. When it
    # did, rerun locally with `trace_first_import = True`: every function
    # call is then traced to locate the first import, which is too slow to
    # enable by default.
    trace_first_import = False

    if trace_first_import:
        tracer = blame(lambda: watched_module in sys.modules)
    else:
        tracer = nullcontext()

    with tracer as result:
        run_normal()
        run_xgrammar(sample_regex)

    if trace_first_import:
        assert isinstance(result, BlameResult)
        print(f"the first import location is:\n{result.trace_stack}")

    assert watched_module not in sys.modules, (
        f"Module {watched_module} is imported. To see the first"
        f" import location, run the test with `use_blame=True`.")
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for guided decoding tests # imports for structured outputs tests
import json import json
from typing import Optional from typing import Optional
...@@ -480,10 +480,11 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, ...@@ -480,10 +480,11 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_choice_chat(client: openai.AsyncOpenAI, async def test_structured_outputs_choice_chat(
sample_guided_choice, is_v1_server: bool): client: openai.AsyncOpenAI, sample_structured_outputs_choices,
is_v1_server: bool):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("Structured outputs is only supported in v1 engine")
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -498,9 +499,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -498,9 +499,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.7, temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice)) extra_body=dict(
structured_outputs={"choice": sample_structured_outputs_choices}))
choice1 = chat_completion.choices[0].message.content choice1 = chat_completion.choices[0].message.content
assert choice1 in sample_guided_choice assert choice1 in sample_structured_outputs_choices
messages.append({"role": "assistant", "content": choice1}) messages.append({"role": "assistant", "content": choice1})
messages.append({ messages.append({
...@@ -512,17 +514,19 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, ...@@ -512,17 +514,19 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
messages=messages, messages=messages,
max_completion_tokens=10, max_completion_tokens=10,
temperature=0.7, temperature=0.7,
extra_body=dict(guided_choice=sample_guided_choice)) extra_body=dict(
structured_outputs={"choice": sample_structured_outputs_choices}))
choice2 = chat_completion.choices[0].message.content choice2 = chat_completion.choices[0].message.content
assert choice2 in sample_guided_choice assert choice2 in sample_structured_outputs_choices
assert choice1 != choice2 assert choice1 != choice2
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, async def test_structured_outputs_json_chat(client: openai.AsyncOpenAI,
sample_json_schema,
is_v1_server: bool): is_v1_server: bool):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("Structured outputs is only supported in v1 engine")
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -538,7 +542,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, ...@@ -538,7 +542,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=1000, max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema)) extra_body=dict(structured_outputs={"json": sample_json_schema}))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None assert message.content is not None
json1 = json.loads(message.content) json1 = json.loads(message.content)
...@@ -555,7 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, ...@@ -555,7 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=1000, max_completion_tokens=1000,
extra_body=dict(guided_json=sample_json_schema)) extra_body=dict(structured_outputs={"json": sample_json_schema}))
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert message.content is not None assert message.content is not None
json2 = json.loads(message.content) json2 = json.loads(message.content)
...@@ -565,10 +569,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema, ...@@ -565,10 +569,10 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, sample_json_schema,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, async def test_structured_outputs_regex_chat(client: openai.AsyncOpenAI,
is_v1_server: bool): sample_regex, is_v1_server: bool):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("Structured outputs is only supported in v1 engine")
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -583,7 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, ...@@ -583,7 +587,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=20, max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex)) extra_body=dict(structured_outputs={"regex": sample_regex}))
ip1 = chat_completion.choices[0].message.content ip1 = chat_completion.choices[0].message.content
assert ip1 is not None assert ip1 is not None
assert re.fullmatch(sample_regex, ip1) is not None assert re.fullmatch(sample_regex, ip1) is not None
...@@ -594,7 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, ...@@ -594,7 +598,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
model=MODEL_NAME, model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=20, max_completion_tokens=20,
extra_body=dict(guided_regex=sample_regex)) extra_body=dict(structured_outputs={"regex": sample_regex}))
ip2 = chat_completion.choices[0].message.content ip2 = chat_completion.choices[0].message.content
assert ip2 is not None assert ip2 is not None
assert re.fullmatch(sample_regex, ip2) is not None assert re.fullmatch(sample_regex, ip2) is not None
...@@ -602,7 +606,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex, ...@@ -602,7 +606,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, sample_regex,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): async def test_structured_outputs_type_error(client: openai.AsyncOpenAI):
messages = [{ messages = [{
"role": "system", "role": "system",
"content": "you are a helpful assistant" "content": "you are a helpful assistant"
...@@ -614,17 +618,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI): ...@@ -614,17 +618,19 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
}] }]
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(model=MODEL_NAME, _ = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages, messages=messages,
extra_body=dict(guided_regex={ extra_body=dict(
structured_outputs={"regex": {
1: "Python", 1: "Python",
2: "C++" 2: "C++"
})) }}))
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, async def test_structured_outputs_choice_chat_logprobs(
sample_guided_choice): client: openai.AsyncOpenAI, sample_structured_outputs_choices):
messages = [{ messages = [{
"role": "system", "role": "system",
...@@ -641,7 +647,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, ...@@ -641,7 +647,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
max_completion_tokens=10, max_completion_tokens=10,
logprobs=True, logprobs=True,
top_logprobs=5, top_logprobs=5,
extra_body=dict(guided_choice=sample_guided_choice)) extra_body=dict(
structured_outputs={"choice": sample_structured_outputs_choices}))
assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs is not None
assert chat_completion.choices[0].logprobs.content is not None assert chat_completion.choices[0].logprobs.content is not None
...@@ -663,31 +670,32 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, ...@@ -663,31 +670,32 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
}, { }, {
"role": "role":
"user", "user",
"content": "content": ("Give an example JSON for an employee "
f"Give an example JSON for an employee profile that " "profile using the specified tool.")
f"fits this schema: {sample_json_schema}"
}] }]
tools = [{
# non-streaming
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=[{
"type": "function", "type": "function",
"function": { "function": {
"name": "dummy_function_name", "name": "dummy_function_name",
"description": "This is a dummy function", "description": "This is a dummy function",
"parameters": sample_json_schema "parameters": sample_json_schema
} }
}], }]
tool_choice={ tool_choice = {
"type": "function", "type": "function",
"function": { "function": {
"name": "dummy_function_name" "name": "dummy_function_name"
} }
}, }
# non-streaming
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_completion_tokens=1000,
tools=tools,
tool_choice=tool_choice,
) )
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
assert len(message.content) == 0 assert len(message.content) == 0
...@@ -705,24 +713,11 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema, ...@@ -705,24 +713,11 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, sample_json_schema,
# streaming # streaming
stream = await client.chat.completions.create( stream = await client.chat.completions.create(model=MODEL_NAME,
model=MODEL_NAME,
messages=messages, messages=messages,
max_completion_tokens=1000, max_completion_tokens=1000,
tools=[{ tools=tools,
"type": "function", tool_choice=tool_choice,
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}],
tool_choice={
"type": "function",
"function": {
"name": "dummy_function_name"
}
},
stream=True) stream=True)
output = [] output = []
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for guided decoding tests # imports for structured outputs tests
import json import json
import os import os
from typing import Optional from typing import Optional
...@@ -23,8 +23,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" ...@@ -23,8 +23,6 @@ MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model, # technically these adapters use a different base model,
# but we're not testing generation quality here # but we're not testing generation quality here
GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files): def default_server_args(zephyr_lora_files):
...@@ -595,12 +593,13 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): ...@@ -595,12 +593,13 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_structured_outputs_json_completion(
async def test_guided_json_completion(client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
guided_decoding_backend: str, sample_json_schema,
sample_json_schema, is_v1_server: bool): is_v1_server: bool,
):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("structured outputs is only supported in v1 engine")
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -609,8 +608,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, ...@@ -609,8 +608,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
n=3, n=3,
temperature=1.0, temperature=1.0,
max_tokens=500, max_tokens=500,
extra_body=dict(guided_json=sample_json_schema, extra_body=dict(structured_outputs=dict(json=sample_json_schema)))
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None assert completion.id is not None
assert len(completion.choices) == 3 assert len(completion.choices) == 3
...@@ -620,12 +618,13 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, ...@@ -620,12 +618,13 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_structured_outputs_regex_completion(
async def test_guided_regex_completion(client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
guided_decoding_backend: str, sample_regex,
sample_regex, is_v1_server: bool): is_v1_server: bool,
):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("structured outputs is only supported in v1 engine")
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -633,8 +632,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, ...@@ -633,8 +632,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
n=3, n=3,
temperature=1.0, temperature=1.0,
max_tokens=20, max_tokens=20,
extra_body=dict(guided_regex=sample_regex, extra_body=dict(structured_outputs=dict(regex=sample_regex)))
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None assert completion.id is not None
assert len(completion.choices) == 3 assert len(completion.choices) == 3
...@@ -644,13 +642,13 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, ...@@ -644,13 +642,13 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_structured_outputs_choice_completion(
async def test_guided_choice_completion(client: openai.AsyncOpenAI, client: openai.AsyncOpenAI,
guided_decoding_backend: str, sample_structured_outputs_choices,
sample_guided_choice, is_v1_server: bool,
is_v1_server: bool): ):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("structured outputs is only supported in v1 engine")
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -658,20 +656,21 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI, ...@@ -658,20 +656,21 @@ async def test_guided_choice_completion(client: openai.AsyncOpenAI,
n=2, n=2,
temperature=1.0, temperature=1.0,
max_tokens=10, max_tokens=10,
extra_body=dict(guided_choice=sample_guided_choice, extra_body=dict(structured_outputs=dict(
guided_decoding_backend=guided_decoding_backend)) choice=sample_structured_outputs_choices)))
assert completion.id is not None assert completion.id is not None
assert len(completion.choices) == 2 assert len(completion.choices) == 2
for i in range(2): for i in range(2):
assert completion.choices[i].text in sample_guided_choice assert completion.choices[i].text in sample_structured_outputs_choices
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_guided_grammar(client: openai.AsyncOpenAI, async def test_structured_outputs_grammar(client: openai.AsyncOpenAI,
sample_sql_statements, is_v1_server: bool): sample_sql_statements,
is_v1_server: bool):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided grammar is only supported in v1 engine") pytest.skip("grammar is only supported in v1 engine")
completion = await client.completions.create( completion = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -679,7 +678,8 @@ async def test_guided_grammar(client: openai.AsyncOpenAI, ...@@ -679,7 +678,8 @@ async def test_guided_grammar(client: openai.AsyncOpenAI,
"table_1 where it is equals to 1"), "table_1 where it is equals to 1"),
temperature=1.0, temperature=1.0,
max_tokens=500, max_tokens=500,
extra_body=dict(guided_grammar=sample_sql_statements)) extra_body=dict(
structured_outputs=dict(grammar=sample_sql_statements), ))
content = completion.choices[0].text content = completion.choices[0].text
...@@ -730,27 +730,26 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, ...@@ -730,27 +730,26 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_structured_outputs_type_error(client: openai.AsyncOpenAI,
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
guided_decoding_backend: str,
sample_json_schema, sample_regex, sample_json_schema, sample_regex,
is_v1_server: bool): is_v1_server: bool):
if not is_v1_server: if not is_v1_server:
pytest.skip("Guided decoding is only supported in v1 engine") pytest.skip("structured outputs is only supported in v1 engine")
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
_ = await client.completions.create( _ = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
prompt="Give an example JSON that fits this schema: 42", prompt="Give an example JSON that fits this schema: 42",
extra_body=dict(guided_json=42, extra_body=dict(structured_outputs=dict(json=42)))
guided_decoding_backend=guided_decoding_backend))
with pytest.raises(openai.BadRequestError): with pytest.raises(openai.BadRequestError):
_ = await client.completions.create( _ = await client.completions.create(
model=MODEL_NAME, model=MODEL_NAME,
prompt="Give an example string that fits this regex", prompt="Give an example string that fits this regex",
extra_body=dict(guided_regex=sample_regex, extra_body=dict(structured_outputs=dict(
guided_json=sample_json_schema)) regex=sample_regex,
json=sample_json_schema,
)))
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -142,7 +142,7 @@ def server(): # noqa: F811 ...@@ -142,7 +142,7 @@ def server(): # noqa: F811
"--dtype", "--dtype",
"half", "half",
"--enable-auto-tool-choice", "--enable-auto-tool-choice",
"--guided-decoding-backend", "--structured-outputs-config.backend",
"xgrammar", "xgrammar",
"--tool-call-parser", "--tool-call-parser",
"hermes", "hermes",
...@@ -225,7 +225,7 @@ def k2_server(): # noqa: F811 ...@@ -225,7 +225,7 @@ def k2_server(): # noqa: F811
"--dtype", "--dtype",
"half", "half",
"--enable-auto-tool-choice", "--enable-auto-tool-choice",
"--guided-decoding-backend", "--structured-outputs-config.backend",
"xgrammar", "xgrammar",
"--tool-call-parser", "--tool-call-parser",
"hermes", "hermes",
......
...@@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy): ...@@ -102,12 +102,14 @@ def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
if "custom" in tool_call: if "custom" in tool_call:
return False return False
# Sometimes guided_grammar is generated to be empty # Sometimes structured_outputs.grammar is generated to be empty
# Causing a server error in EBNF grammar parsing # Causing a server error in EBNF grammar parsing
# https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421 # https://github.com/vllm-project/vllm/pull/22587#issuecomment-3195253421
guided_grammar = case.body.get("guided_grammar") structured_outputs = case.body.get("structured_outputs", {})
grammar = structured_outputs.get("grammar") if isinstance(
structured_outputs, dict) else None
if guided_grammar == '': if grammar == '':
# Allow None (will be handled as no grammar) # Allow None (will be handled as no grammar)
# But skip empty strings # But skip empty strings
return False return False
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import io import io
# imports for guided decoding tests # imports for structured outputs tests
import openai import openai
import pybase64 import pybase64
import pytest import pytest
......
...@@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -333,7 +333,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
"role": "user", "role": "user",
"content": "what is 1+1?" "content": "what is 1+1?"
}], }],
guided_decoding_backend="outlines",
) )
with suppress(Exception): with suppress(Exception):
...@@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -378,7 +377,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
"role": "user", "role": "user",
"content": "what is 1+1?" "content": "what is 1+1?"
}], }],
guided_decoding_backend="outlines",
) )
with suppress(Exception): with suppress(Exception):
...@@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens(): ...@@ -433,7 +431,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
"role": "user", "role": "user",
"content": "what is 1+1?" "content": "what is 1+1?"
}], }],
guided_decoding_backend="outlines",
) )
with suppress(Exception): with suppress(Exception):
...@@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config(): ...@@ -489,7 +486,6 @@ async def test_serving_chat_could_load_correct_generation_config():
"role": "user", "role": "user",
"content": "what is 1+1?" "content": "what is 1+1?"
}], }],
guided_decoding_backend="outlines",
) )
with suppress(Exception): with suppress(Exception):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment