[BugFix] Make FP8 Linear compatible with torch.compile (#13918)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[BugFix] Make FP8 Linear compatible with torch.compile (#13918)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
b382a7f2 · Woosuk Kwon · GitHub · 4cb6fa0a · b382a7f2 · b382a7f2
Unverified commit b382a7f2, authored Feb 26, 2025 by Woosuk Kwon; committed by GitHub on Feb 26, 2025
2 changed files
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -369,12 +369,9 @@ class Fp8LinearMethod(LinearMethodBase):
                size_k=layer.input_size_per_partition,
                bias=bias)
-        # Note: lazy import to avoid triton import error.
-        from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-            apply_w8a8_block_fp8_linear)
        if self.block_quant:
            assert self.quant_config.weight_block_size is not None
-            return apply_w8a8_block_fp8_linear(
+            return torch.ops.vllm.apply_w8a8_block_fp8_linear(
                input=x,
                weight=layer.weight,
                block_size=self.quant_config.weight_block_size,

--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear)
 from vllm.platforms import current_platform
+from vllm.utils import direct_register_custom_op
 logger = init_logger(__name__)
@@ -81,6 +82,25 @@ def apply_w8a8_block_fp8_linear(
    return output.to(dtype=input.dtype).view(*output_shape)
+def apply_w8a8_block_fp8_linear_fake(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Shape-only stand-in for `apply_w8a8_block_fp8_linear`.
+
+    Performs no quantization or matmul: it only allocates an uninitialized
+    tensor describing the result, so it can serve as the `fake_impl` of the
+    custom op registered below.
+
+    Args:
+        input: Activation tensor; supplies the leading output dims, the
+            output dtype and the output device.
+        weight: Weight tensor; `weight.shape[0]` becomes the last output dim.
+        block_size: Unused here; kept so the signature matches the real op.
+        weight_scale: Unused here; kept so the signature matches the real op.
+        input_scale: Unused here; kept so the signature matches the real op.
+
+    Returns:
+        An uninitialized tensor of shape `[*input.shape[:-1], weight.shape[0]]`
+        with `input`'s dtype and device.
+    """
+    # NOTE(review): presumably identical to the shape the real implementation
+    # views its output as -- confirm against `apply_w8a8_block_fp8_linear`.
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+    return torch.empty(output_shape, dtype=input.dtype, device=input.device)
+# Register `apply_w8a8_block_fp8_linear` as a custom op so the FP8 linear path
+# works under torch.compile (the purpose stated in this commit's title).
+# `fake_impl` supplies the shape/dtype-only version, and the empty
+# `mutates_args` declares that no argument is modified in place.
+# NOTE(review): the fp8.py caller invokes this as
+# `torch.ops.vllm.apply_w8a8_block_fp8_linear`; presumably
+# `direct_register_custom_op` places ops in the `vllm` namespace -- confirm.
+direct_register_custom_op(
+    op_name="apply_w8a8_block_fp8_linear",
+    op_func=apply_w8a8_block_fp8_linear,
+    mutates_args=[],
+    fake_impl=apply_w8a8_block_fp8_linear_fake,
+)
 # Unify the interface between `apply_w8a8_block_fp8_linear` and
 # `apply_fp8_linear`
 # NOTE(lucas): this is quite messy, we should think through this more formally