Unverified commit 59556265, authored by Yan Ma, committed by GitHub
Browse files

[XPU] fix MoE triton backend in online fp8 quantization (#40109)


Signed-off-by: Yan Ma <yan.ma@intel.com>
parent 3a30eaa1
......@@ -471,6 +471,12 @@ def convert_to_fp8_moe_kernel_format(
w2_input_scale=w2_input_scale,
is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
)
elif fp8_backend == Fp8MoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
prepare_fp8_moe_layer_for_xpu,
)
w13, w2 = prepare_fp8_moe_layer_for_xpu(w13, w2)
else:
if fp8_backend not in [
Fp8MoeBackend.TRITON,
......
......@@ -24,6 +24,13 @@ if current_platform.is_xpu():
from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe
def prepare_fp8_moe_layer_for_xpu(
    w13: torch.Tensor,
    w2: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Return both MoE weight tensors with their last two dimensions swapped.

    Each input has its trailing two axes exchanged and is then returned in a
    contiguous memory layout. The arguments themselves are not reassigned;
    callers are expected to use the returned pair.

    Args:
        w13: First expert weight tensor (at least 2-D).
        w2: Second expert weight tensor (at least 2-D).

    Returns:
        A ``(w13, w2)`` tuple of the transposed, contiguous tensors.
    """
    # NOTE(review): presumably this is the layout the XPU fused-MoE kernel
    # expects -- confirm against vllm_xpu_kernels.
    w13_swapped = torch.transpose(w13, -2, -1).contiguous()
    w2_swapped = torch.transpose(w2, -2, -1).contiguous()
    return w13_swapped, w2_swapped
class XPUExperts(mk.FusedMoEExpertsModular):
def __init__(
self,
......
......@@ -1019,10 +1019,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
layer.w2_weight[expert, :, :]
)
if current_platform.is_xpu():
w13.data = w13.transpose(-1, -2).contiguous()
w2.data = w2.transpose(-1, -2).contiguous()
# Shuffle weights to runtime format and setup kernel.
self._setup_kernel(
layer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment