Upgrade xgrammar to 0.1.23 (#22988)

Signed-off-by: Russell Bryant <rbryant@redhat.com>

Upgrade xgrammar to 0.1.23 (#22988)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
e32a0e86 · Russell Bryant · GitHub · 42dc59db · e32a0e86 · e32a0e86
Commit e32a0e86 (unverified), authored Sep 02, 2025 by Russell Bryant; committed by GitHub on Sep 03, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 2 additions and 9 deletions

requirements/common.txt requirements/common.txt +1 -1

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +1 -8

No files found.
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -25,7 +25,7 @@ outlines == 0.1.11 ; platform_machine == "s390x"
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -90,15 +90,11 @@ from .utils import (AttentionGroup, MultiModalBudget,
 if TYPE_CHECKING:
    import xgrammar as xgr
-    import xgrammar.kernels.apply_token_bitmask_inplace_torch_compile as xgr_torch_compile  # noqa: E501
    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
    from vllm.v1.core.sched.output import SchedulerOutput
 else:
    xgr = LazyLoader("xgr", globals(), "xgrammar")
-    xgr_torch_compile = LazyLoader(
-        "xgr_torch_compile", globals(),
-        "xgrammar.kernels.apply_token_bitmask_inplace_torch_compile")
 logger = init_logger(__name__)
@@ -1333,10 +1329,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
        # so we receive it in that format.
        grammar_bitmask = torch.from_numpy(grammar_bitmask).contiguous()
-        # Force use of the torch.compile implementation from xgrammar to work
-        # around issues with the Triton kernel in concurrent structured output
-        # scenarios. See PR #19565 and issues #19493, #18376 for details.
-        xgr_torch_compile.apply_token_bitmask_inplace_torch_compile(
+        xgr.apply_token_bitmask_inplace(
            logits,
            grammar_bitmask.to(self.device, non_blocking=True),
            indices=out_indices if not skip_out_indices else None,