Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04421dff
Unverified
Commit
04421dff
authored
Mar 10, 2025
by
Russell Bryant
Committed by
GitHub
Mar 10, 2025
Browse files
[V1] Prevent xgrammar from breaking TPU support (#14575)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
432d6dad
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
2 deletions
+11
-2
vllm/v1/engine/processor.py
vllm/v1/engine/processor.py
+4
-0
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+7
-2
No files found.
vllm/v1/engine/processor.py
View file @
04421dff
...
...
@@ -4,6 +4,7 @@ import time
from
collections.abc
import
Mapping
from
typing
import
Optional
,
Union
import
vllm.platforms
from
vllm.config
import
VllmConfig
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
PromptType
,
SingletonInputsAdapter
)
...
...
@@ -133,6 +134,9 @@ class Processor:
if
self
.
vllm_config
.
speculative_config
:
raise
ValueError
(
"Structured output is not supported with "
"speculative decoding."
)
if
vllm
.
platforms
.
current_platform
.
is_tpu
():
raise
ValueError
(
"Structured output is not supported on TPU."
)
validate_structured_output_request
(
params
)
def
process_inputs
(
...
...
vllm/v1/structured_output/__init__.py
View file @
04421dff
...
...
@@ -17,6 +17,7 @@ from vllm.v1.structured_output.grammar import (Grammar, StructuredOutputKey,
if
TYPE_CHECKING
:
import
numpy
as
np
import
numpy.typing
as
npt
import
torch
import
xgrammar
as
xgr
from
vllm.v1.request
import
Request
...
...
@@ -53,8 +54,7 @@ class StructuredOutputManager:
# compilation, so we set it to half the number of CPUs.
max_workers
=
max
(
1
,
(
multiprocessing
.
cpu_count
()
+
1
)
//
2
)
self
.
executor
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
_grammar_bitmask
=
xgr
.
allocate_token_bitmask
(
self
.
vllm_config
.
scheduler_config
.
max_num_seqs
,
self
.
vocab_size
)
self
.
_grammar_bitmask
:
Optional
[
torch
.
Tensor
]
=
None
def
__getitem__
(
self
,
key
:
StructuredOutputKey
)
->
Optional
[
Grammar
]:
# We need to pop and re-insert the grammar here for LRU cache
...
...
@@ -134,6 +134,11 @@ class StructuredOutputManager:
if
not
structured_output_request_ids
:
return
None
if
self
.
_grammar_bitmask
is
None
:
self
.
_grammar_bitmask
=
xgr
.
allocate_token_bitmask
(
self
.
vllm_config
.
scheduler_config
.
max_num_seqs
,
self
.
vocab_size
)
# Fill the bitmask using the index of each request equal to its
# position in the batch. Resize the bitmask down to the size of
# the batch.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment