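Skip the scaled FP8 quant placeholder ops (static_scaled_fp8_quant, dynamic_scaled_fp8_quant, dynamic_per_token_scaled_fp8_quant) and default VLLM_USE_BYTECODE_HOOK to 0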

8d0e36b5 · zhuwenwen · b66c8e4b · 8d0e36b5 · 8d0e36b5 · 8d0e36b5
Commit 8d0e36b5 authored Dec 18, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing 3 changed files with 7 additions and 7 deletions

csrc/cpu/torch_bindings.cpp +3 -3

vllm/compilation/matcher_utils.py +3 -3

vllm/envs.py +1 -1

No files found.
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -284,9 +284,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      &cpu_attention_with_kv_cache);
  // placeholders
-  ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
+//   ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
-  ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
+//   ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
-  ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
+//   ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
  // WNA16
 #if defined(__AVX512F__)

--- a/vllm/compilation/matcher_utils.py
+++ b/vllm/compilation/matcher_utils.py
@@ -27,9 +27,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
 FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
 QUANT_OPS: dict[QuantKey, OpOverload] = {
-    kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
-    kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
-    kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
 }
 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -612,7 +612,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Feature flag to enable/disable bytecode in
    # TorchCompileWithNoGuardsWrapper.
    "VLLM_USE_BYTECODE_HOOK": lambda: bool(
-        int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1"))
+        int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "0"))
    ),
    # Force vllm to always load AOT compiled models from disk. Failure
    # to load will result in a hard error when this is enabled.