Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
61a01b27
Unverified
Commit
61a01b27
authored
Mar 11, 2025
by
Russell Bryant
Committed by
GitHub
Mar 11, 2025
Browse files
[V1] Delay all xgrammar usage until needed (#14616)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
53056731
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
20 additions
and
13 deletions
+20
-13
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+20
-13
No files found.
vllm/v1/structured_output/__init__.py
View file @
61a01b27
...
...
@@ -14,7 +14,6 @@ from vllm.v1.structured_output.grammar import Grammar, StructuredOutputOptions
if
TYPE_CHECKING
:
import
numpy
as
np
import
numpy.typing
as
npt
import
torch
import
xgrammar
as
xgr
from
vllm.v1.request
import
Request
...
...
@@ -27,14 +26,18 @@ logger = init_logger(__name__)
class
StructuredOutputManager
:
def
__init__
(
self
,
vllm_config
:
VllmConfig
):
tokenizer_group
=
init_tokenizer_from_configs
(
model_config
=
vllm_config
.
model_config
,
scheduler_config
=
vllm_config
.
scheduler_config
,
parallel_config
=
vllm_config
.
parallel_config
,
lora_config
=
vllm_config
.
lora_config
)
# type: ignore[arg-type]
tokenizer_group
.
ping
()
self
.
vocab_size
=
vllm_config
.
model_config
.
get_vocab_size
()
self
.
vllm_config
=
vllm_config
self
.
init_complete
=
False
def
_delayed_init
(
self
):
"""Initialization delayed until we know it is needed."""
tokenizer_group
=
init_tokenizer_from_configs
(
model_config
=
self
.
vllm_config
.
model_config
,
scheduler_config
=
self
.
vllm_config
.
scheduler_config
,
parallel_config
=
self
.
vllm_config
.
parallel_config
,
lora_config
=
self
.
vllm_config
.
lora_config
)
# type: ignore[arg-type]
tokenizer_group
.
ping
()
tokenizer
=
tokenizer_group
.
get_lora_tokenizer
(
None
)
tokenizer_info
=
xgr
.
TokenizerInfo
.
from_huggingface
(
...
...
@@ -47,12 +50,21 @@ class StructuredOutputManager:
# compilation, so we set it to half the number of CPUs.
max_workers
=
max
(
1
,
(
multiprocessing
.
cpu_count
()
+
1
)
//
2
)
self
.
executor
=
ThreadPoolExecutor
(
max_workers
=
max_workers
)
self
.
_grammar_bitmask
:
Optional
[
torch
.
Tensor
]
=
None
self
.
_grammar_bitmask
=
xgr
.
allocate_token_bitmask
(
self
.
vllm_config
.
scheduler_config
.
max_num_seqs
,
self
.
vocab_size
)
self
.
init_complete
=
True
def
grammar_init
(
self
,
request
:
Request
)
->
None
:
if
request
.
structured_output_request
is
None
:
return
# The first time this is called, we need to finish initialization
# of xgrammar. We defer it to avoid the import of xgrammar and
# initialization cost if it is not going to be used.
if
not
self
.
init_complete
:
self
.
_delayed_init
()
grammar
:
Future
[
Grammar
]
=
self
.
executor
.
submit
(
self
.
_async_create_grammar
,
request
)
request
.
structured_output_request
.
grammar
=
grammar
# type: ignore[assignment]
...
...
@@ -100,11 +112,6 @@ class StructuredOutputManager:
if
not
structured_output_request_ids
:
return
None
if
self
.
_grammar_bitmask
is
None
:
self
.
_grammar_bitmask
=
xgr
.
allocate_token_bitmask
(
self
.
vllm_config
.
scheduler_config
.
max_num_seqs
,
self
.
vocab_size
)
# Fill the bitmask using the index of each request equal to its
# position in the batch. Resize the bitmask down to the size of
# the batch.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment