[XPU][CT] support per-channel quantization in xpu fp8 linear method (#38316)

Signed-off-by: Yan Ma <yan.ma@intel.com>

[XPU][CT] support per-channel quantization in xpu fp8 linear method (#38316)
Signed-off-by: Yan Ma <yan.ma@intel.com>
394ff869 · Yan Ma · GitHub · df1e30e7 · 394ff869 · 394ff869
Unverified commit 394ff869 · authored Apr 12, 2026 by Yan Ma · committed by GitHub Apr 12, 2026
Showing 2 changed files with 14 additions and 1 deletion

vllm/model_executor/kernels/linear/__init__.py +1 -1

vllm/model_executor/kernels/linear/scaled_mm/xpu.py +13 -0

No files found.
--- a/vllm/model_executor/kernels/linear/__init__.py
+++ b/vllm/model_executor/kernels/linear/__init__.py
@@ -204,7 +204,7 @@ _POSSIBLE_WFP8A16_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]
        # To be added
    ],
    PlatformEnum.XPU: [
-        # To be added
+        XPUFP8ScaledMMLinearKernel,
    ],
 }

--- a/vllm/model_executor/kernels/linear/scaled_mm/xpu.py
+++ b/vllm/model_executor/kernels/linear/scaled_mm/xpu.py
@@ -9,6 +9,11 @@ from vllm.model_executor.kernels.linear import (  # noqa: E501
    FP8ScaledMMLinearKernel,
    FP8ScaledMMLinearLayerConfig,
 )
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    kFp8StaticChannelSym,
+    kFp8StaticTensorSym,
+)
+from vllm.model_executor.utils import replace_parameter
 from vllm.platforms import current_platform
@@ -23,6 +28,11 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
    @classmethod
    def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]:
+        if c.weight_quant_key not in {kFp8StaticChannelSym, kFp8StaticTensorSym}:
+            return (
+                False,
+                "XPUFP8ScaledMM only support per-channel and per-tensor quantization",
+            )
        if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}:
            return False, "XPUFP8ScaledMM only support FP8 weight dtype"
        return True, None
@@ -35,6 +45,9 @@ class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
        self.config = c
        self.layer_param_names = layer_param_names
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        replace_parameter(layer, "weight", layer.weight.data.t())
    def apply_weights(
        self,
        layer: torch.nn.Module,