Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cb84e45a
Unverified
Commit
cb84e45a
authored
Apr 08, 2025
by
Russell Bryant
Committed by
GitHub
Apr 08, 2025
Browse files
[Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
4716377f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
3 deletions
+22
-3
requirements/common.txt
requirements/common.txt
+1
-1
vllm/envs.py
vllm/envs.py
+7
-0
vllm/model_executor/guided_decoding/xgrammar_decoding.py
vllm/model_executor/guided_decoding/xgrammar_decoding.py
+7
-1
vllm/v1/structured_output/backend_xgrammar.py
vllm/v1/structured_output/backend_xgrammar.py
+7
-1
No files found.
requirements/common.txt
View file @
cb84e45a
...
...
@@ -22,7 +22,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
...
...
vllm/envs.py
View file @
cb84e45a
...
...
@@ -106,6 +106,7 @@ if TYPE_CHECKING:
VLLM_TPU_DISABLE_TOPK_TOPP_OPTIMIZATION
:
bool
=
False
VLLM_TPU_BUCKET_PADDING_GAP
:
int
=
0
VLLM_USE_DEEP_GEMM
:
bool
=
False
VLLM_XGRAMMAR_CACHE_MB
:
int
=
0
def
get_default_cache_root
():
...
...
@@ -697,6 +698,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Allow use of DeepGemm kernels for fused moe ops.
"VLLM_USE_DEEP_GEMM"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_DEEP_GEMM"
,
"0"
))),
# Control the cache size used by the xgrammar compiler. The default
# of 512 MB should be enough for roughly 1000 JSON schemas.
# It can be changed with this variable if needed for some reason.
"VLLM_XGRAMMAR_CACHE_MB"
:
lambda
:
int
(
os
.
getenv
(
"VLLM_XGRAMMAR_CACHE_MB"
,
"512"
)),
}
# end-env-vars-definition
...
...
vllm/model_executor/guided_decoding/xgrammar_decoding.py
View file @
cb84e45a
...
...
@@ -10,6 +10,7 @@ from typing import TYPE_CHECKING, Any, List
import
torch
import
vllm.envs
from
vllm.logger
import
init_logger
try
:
...
...
@@ -131,8 +132,13 @@ class GrammarCompilerCache:
encoded_vocab
=
config_data
.
encoded_vocab
,
metadata
=
config_data
.
metadata
,
)
cache_size
=
vllm
.
envs
.
VLLM_XGRAMMAR_CACHE_MB
*
1024
*
1024
cls
.
_cache
[
cache_key
]
=
xgr
.
GrammarCompiler
(
tokenizer_info
,
max_threads
=
config
.
max_threads
)
tokenizer_info
,
max_threads
=
config
.
max_threads
,
cache_enabled
=
True
,
cache_limit_bytes
=
cache_size
,
)
return
cls
.
_cache
[
cache_key
]
...
...
vllm/v1/structured_output/backend_xgrammar.py
View file @
cb84e45a
...
...
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
import
torch
import
vllm.envs
from
vllm.config
import
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.tokenizer_group
import
init_tokenizer_from_configs
...
...
@@ -76,7 +77,12 @@ class XgrammarBackend(StructuredOutputBackend):
tokenizer
,
vocab_size
=
self
.
vocab_size
,
)
self
.
compiler
=
xgr
.
GrammarCompiler
(
tokenizer_info
,
max_threads
=
8
)
self
.
compiler
=
xgr
.
GrammarCompiler
(
tokenizer_info
,
max_threads
=
8
,
cache_enabled
=
True
,
cache_limit_bytes
=
vllm
.
envs
.
VLLM_XGRAMMAR_CACHE_MB
*
1024
*
1024
,
)
def
compile_grammar
(
self
,
request_type
:
StructuredOutputOptions
,
grammar_spec
:
str
)
->
StructuredOutputGrammar
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment