[XPU] fix MoE triton backend in online fp8 quantization (#40109)

Signed-off-by: Yan Ma <yan.ma@intel.com>

[XPU] fix MoE triton backend in online fp8 quantization (#40109)
Signed-off-by: Yan Ma <yan.ma@intel.com>
59556265 · Yan Ma · GitHub · 3a30eaa1 · 59556265 · 59556265
Unverified Commit 59556265 authored Apr 20, 2026 by Yan Ma Committed by GitHub Apr 20, 2026
3 changed files
--- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py
@@ -471,6 +471,12 @@ def convert_to_fp8_moe_kernel_format(
            w2_input_scale=w2_input_scale,
            is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
        )
+    elif fp8_backend == Fp8MoeBackend.XPU:
+        from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
+            prepare_fp8_moe_layer_for_xpu,
+        )
+
+        w13, w2 = prepare_fp8_moe_layer_for_xpu(w13, w2)
    else:
        if fp8_backend not in [
            Fp8MoeBackend.TRITON,

--- a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
@@ -24,6 +24,13 @@ if current_platform.is_xpu():
    from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe


+def prepare_fp8_moe_layer_for_xpu(
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Swap the last two dimensions of both MoE weight tensors.
+
+    Each input is transposed over its last two dimensions and then
+    materialized with ``.contiguous()``, so the returned tensors are
+    contiguous in the swapped layout. The tensors passed in are not
+    rebound or assigned to; the caller must use the returned pair.
+
+    NOTE(review): presumably this is the weight layout the XPU fused-MoE
+    kernel expects -- confirm against ``xpu_fused_moe``.
+
+    Args:
+        w13: First MoE weight tensor; must have at least two dimensions.
+        w2: Second MoE weight tensor; must have at least two dimensions.
+
+    Returns:
+        ``(w13, w2)`` with the last two dimensions of each swapped.
+    """
+    return w13.transpose(-1, -2).contiguous(), w2.transpose(-1, -2).contiguous()
+
+
 class XPUExperts(mk.FusedMoEExpertsModular):
    def __init__(
        self,

--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1019,10 +1019,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
                layer.w2_weight[expert, :, :]
            )

-        if current_platform.is_xpu():
-            w13.data = w13.transpose(-1, -2).contiguous()
-            w2.data = w2.transpose(-1, -2).contiguous()
-
        # Shuffle weights to runtime format and setup kernel.
        self._setup_kernel(
            layer,