Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8d0e36b5
"csrc/vscode:/vscode.git/clone" did not exist on "91feb245d2fc177f30fe64c1273d5182eaa42497"
Commit
8d0e36b5
authored
Dec 18, 2025
by
zhuwenwen
Browse files
skip static_scaled_fp8_quant and set VLLM_USE_BYTECODE_HOOK=0
parent
b66c8e4b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
7 additions
and
7 deletions
+7
-7
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+3
-3
vllm/compilation/matcher_utils.py
vllm/compilation/matcher_utils.py
+3
-3
vllm/envs.py
vllm/envs.py
+1
-1
No files found.
csrc/cpu/torch_bindings.cpp
View file @
8d0e36b5
...
@@ -284,9 +284,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
...
@@ -284,9 +284,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
&
cpu_attention_with_kv_cache
);
&
cpu_attention_with_kv_cache
);
// placeholders
// placeholders
ops
.
def
(
"static_scaled_fp8_quant() -> ()"
,
placeholder_op
);
//
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
ops
.
def
(
"dynamic_scaled_fp8_quant() -> ()"
,
placeholder_op
);
//
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
ops
.
def
(
"dynamic_per_token_scaled_fp8_quant() -> ()"
,
placeholder_op
);
//
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
// WNA16
// WNA16
#if defined(__AVX512F__)
#if defined(__AVX512F__)
...
...
vllm/compilation/matcher_utils.py
View file @
8d0e36b5
...
@@ -27,9 +27,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
...
@@ -27,9 +27,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP
=
torch
.
ops
.
vllm
.
flashinfer_rotary_embedding
.
default
FLASHINFER_ROTARY_OP
=
torch
.
ops
.
vllm
.
flashinfer_rotary_embedding
.
default
QUANT_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
QUANT_OPS
:
dict
[
QuantKey
,
OpOverload
]
=
{
kFp8StaticTensorSym
:
torch
.
ops
.
_C
.
static_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTensorSym
:
torch
.
ops
.
_C
.
dynamic_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTokenSym
:
torch
.
ops
.
_C
.
dynamic_per_token_scaled_fp8_quant
.
default
,
# noqa: E501
#
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
}
}
if
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"scaled_fp4_quant"
):
if
current_platform
.
is_cuda
()
and
hasattr
(
torch
.
ops
.
_C
,
"scaled_fp4_quant"
):
...
...
vllm/envs.py
View file @
8d0e36b5
...
@@ -612,7 +612,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -612,7 +612,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Feature flag to enable/disable bytecode in
# Feature flag to enable/disable bytecode in
# TorchCompileWithNoGuardsWrapper.
# TorchCompileWithNoGuardsWrapper.
"VLLM_USE_BYTECODE_HOOK"
:
lambda
:
bool
(
"VLLM_USE_BYTECODE_HOOK"
:
lambda
:
bool
(
int
(
os
.
environ
.
get
(
"VLLM_USE_BYTECODE_HOOK"
,
"
1
"
))
int
(
os
.
environ
.
get
(
"VLLM_USE_BYTECODE_HOOK"
,
"
0
"
))
),
),
# Force vllm to always load AOT compiled models from disk. Failure
# Force vllm to always load AOT compiled models from disk. Failure
# to load will result in a hard error when this is enabled.
# to load will result in a hard error when this is enabled.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment