Commit 8d0e36b5 authored by zhuwenwen
Browse files

skip static_scaled_fp8_quant and set VLLM_USE_BYTECODE_HOOK=0

parent b66c8e4b
...@@ -284,9 +284,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -284,9 +284,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
&cpu_attention_with_kv_cache); &cpu_attention_with_kv_cache);
// placeholders // placeholders
ops.def("static_scaled_fp8_quant() -> ()", placeholder_op); // ops.def("static_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op); // ops.def("dynamic_scaled_fp8_quant() -> ()", placeholder_op);
ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op); // ops.def("dynamic_per_token_scaled_fp8_quant() -> ()", placeholder_op);
// WNA16 // WNA16
#if defined(__AVX512F__) #if defined(__AVX512F__)
......
...@@ -27,9 +27,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default ...@@ -27,9 +27,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default
QUANT_OPS: dict[QuantKey, OpOverload] = { QUANT_OPS: dict[QuantKey, OpOverload] = {
kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501 # kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501 # kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default, # noqa: E501
kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501 # kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default, # noqa: E501
} }
if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"): if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):
......
...@@ -612,7 +612,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -612,7 +612,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Feature flag to enable/disable bytecode in # Feature flag to enable/disable bytecode in
# TorchCompileWithNoGuardsWrapper. # TorchCompileWithNoGuardsWrapper.
"VLLM_USE_BYTECODE_HOOK": lambda: bool( "VLLM_USE_BYTECODE_HOOK": lambda: bool(
int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1")) int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "0"))
), ),
# Force vllm to always load AOT compiled models from disk. Failure # Force vllm to always load AOT compiled models from disk. Failure
# to load will result in a hard error when this is enabled. # to load will result in a hard error when this is enabled.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment