Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e32a0e86
Unverified
Commit
e32a0e86
authored
Sep 02, 2025
by
Russell Bryant
Committed by
GitHub
Sep 03, 2025
Browse files
Upgrade xgrammar to 0.1.23 (#22988)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
42dc59db
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
2 additions
and
9 deletions
+2
-9
requirements/common.txt
requirements/common.txt
+1
-1
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+1
-8
No files found.
requirements/common.txt
View file @
e32a0e86
...
@@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x"
...
@@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x"
# required for outlines backend disk cache
# required for outlines backend disk cache
diskcache == 5.6.3
diskcache == 5.6.3
lark == 1.2.2
lark == 1.2.2
xgrammar == 0.1.2
1
; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
xgrammar == 0.1.2
3
; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
partial-json-parser # used for parsing partial JSON outputs
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
e32a0e86
...
@@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget,
...
@@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget,
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
import
xgrammar
as
xgr
import
xgrammar
as
xgr
import
xgrammar.kernels.apply_token_bitmask_inplace_torch_compile
as
xgr_torch_compile
# noqa: E501
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.model_executor.model_loader.tensorizer
import
TensorizerConfig
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.core.sched.output
import
SchedulerOutput
else
:
else
:
xgr
=
LazyLoader
(
"xgr"
,
globals
(),
"xgrammar"
)
xgr
=
LazyLoader
(
"xgr"
,
globals
(),
"xgrammar"
)
xgr_torch_compile
=
LazyLoader
(
"xgr_torch_compile"
,
globals
(),
"xgrammar.kernels.apply_token_bitmask_inplace_torch_compile"
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# so we receive it in that format.
# so we receive it in that format.
grammar_bitmask
=
torch
.
from_numpy
(
grammar_bitmask
).
contiguous
()
grammar_bitmask
=
torch
.
from_numpy
(
grammar_bitmask
).
contiguous
()
# Force use of the torch.compile implementation from xgrammar to work
xgr
.
apply_token_bitmask_inplace
(
# around issues with the Triton kernel in concurrent structured output
# scenarios. See PR #19565 and issues #19493, #18376 for details.
xgr_torch_compile
.
apply_token_bitmask_inplace_torch_compile
(
logits
,
logits
,
grammar_bitmask
.
to
(
self
.
device
,
non_blocking
=
True
),
grammar_bitmask
.
to
(
self
.
device
,
non_blocking
=
True
),
indices
=
out_indices
if
not
skip_out_indices
else
None
,
indices
=
out_indices
if
not
skip_out_indices
else
None
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment