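Skip setting fp32_precision, comment out the FP8 QUANT_OPS entries (static_scaled_fp8_quant and the two dynamic variants), disable the default AITER FA fallback on ROCm, and change the VLLM_USE_BYTECODE_HOOK default to 0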

e9660f3a · zhuwenwen · c98b6a8f · e9660f3a · e9660f3a · e9660f3a
Commit e9660f3a authored Dec 18, 2025 by zhuwenwen
4 changed files
--- a/vllm/compilation/matcher_utils.py
+++ b/vllm/compilation/matcher_utils.py
@@ -29,9 +29,9 @@ ROTARY_OP = torch.ops._C.rotary_embedding.default
 FLASHINFER_ROTARY_OP = torch.ops.vllm.flashinfer_rotary_embedding.default

 QUANT_OPS: dict[QuantKey, OpOverload] = {
-    kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
-    kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
-    kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8StaticTensorSym: torch.ops._C.static_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTensorSym: torch.ops._C.dynamic_scaled_fp8_quant.default,  # noqa: E501
+    # kFp8DynamicTokenSym: torch.ops._C.dynamic_per_token_scaled_fp8_quant.default,  # noqa: E501
 }

 if current_platform.is_cuda() and hasattr(torch.ops._C, "scaled_fp4_quant"):

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -589,7 +589,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Feature flag to enable/disable bytecode in
    # TorchCompileWithNoGuardsWrapper.
    "VLLM_USE_BYTECODE_HOOK": lambda: bool(
-        int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "1"))
+        int(os.environ.get("VLLM_USE_BYTECODE_HOOK", "0"))
    ),
    # Force vllm to always load AOT compiled models from disk. Failure
    # to load will result in a hard error when this is enabled.

--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -195,7 +195,7 @@ class RocmPlatform(Platform):
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
    ) -> str:
-        from vllm._aiter_ops import rocm_aiter_ops
+        # from vllm._aiter_ops import rocm_aiter_ops

        block_size = attn_selector_config.block_size
        kv_cache_dtype = attn_selector_config.kv_cache_dtype
@@ -285,13 +285,13 @@ class RocmPlatform(Platform):

            # Priority 4: Check for AITER enabled without specific flags
            # This defaults to AITER FA only if MHA is not explicitly disabled
-            if (
-                envs.VLLM_ROCM_USE_AITER
-                and on_gfx9()
-                and envs.VLLM_ROCM_USE_AITER_MHA is not False
-            ):
-                logger.info("Using Aiter Flash Attention backend on V1 engine.")
-                return AttentionBackendEnum.ROCM_AITER_FA.get_path()
+            # if (
+            #     envs.VLLM_ROCM_USE_AITER
+            #     and on_gfx9()
+            #     and envs.VLLM_ROCM_USE_AITER_MHA is not False
+            # ):
+            #     logger.info("Using Aiter Flash Attention backend on V1 engine.")
+            #     return AttentionBackendEnum.ROCM_AITER_FA.get_path()

            # Default: Triton Unified Attention
            logger.info("Using Triton Attention backend on V1 engine.")

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -81,8 +81,8 @@ class Worker(WorkerBase):
        )

        # configure float32 matmul precision according to vLLM env.
-        precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
-        torch.backends.cuda.matmul.fp32_precision = precision
+        # precision = envs.VLLM_FLOAT32_MATMUL_PRECISION
+        # torch.backends.cuda.matmul.fp32_precision = precision

        if self.model_config.trust_remote_code:
            # note: lazy import to avoid importing torch before initializing