Unverified commit 59556265, authored by Yan Ma, committed by GitHub
Browse files

[XPU] fix MoE triton backend in online fp8 quantization (#40109)


Signed-off-by: Yan Ma <yan.ma@intel.com>
parent 3a30eaa1
...@@ -471,6 +471,12 @@ def convert_to_fp8_moe_kernel_format( ...@@ -471,6 +471,12 @@ def convert_to_fp8_moe_kernel_format(
w2_input_scale=w2_input_scale, w2_input_scale=w2_input_scale,
is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM), is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
) )
elif fp8_backend == Fp8MoeBackend.XPU:
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
prepare_fp8_moe_layer_for_xpu,
)
w13, w2 = prepare_fp8_moe_layer_for_xpu(w13, w2)
else: else:
if fp8_backend not in [ if fp8_backend not in [
Fp8MoeBackend.TRITON, Fp8MoeBackend.TRITON,
......
...@@ -24,6 +24,13 @@ if current_platform.is_xpu(): ...@@ -24,6 +24,13 @@ if current_platform.is_xpu():
from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe
def prepare_fp8_moe_layer_for_xpu(
    w13: torch.Tensor,
    w2: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Swap the last two dimensions of both MoE weight tensors.

    Each input is transposed over its final two dimensions and the result
    is made contiguous in memory. The input tensors keep their own shape
    and strides.

    NOTE(review): presumably this is the weight layout the XPU fused-MoE
    kernel expects -- confirm against the kernel's interface.

    Args:
        w13: Weight tensor with at least two dimensions.
        w2: Weight tensor with at least two dimensions.

    Returns:
        A ``(w13, w2)`` pair of transposed, contiguous tensors, in the
        same order as the arguments.
    """
    w13_swapped, w2_swapped = (
        weight.transpose(-1, -2).contiguous() for weight in (w13, w2)
    )
    return w13_swapped, w2_swapped
class XPUExperts(mk.FusedMoEExpertsModular): class XPUExperts(mk.FusedMoEExpertsModular):
def __init__( def __init__(
self, self,
......
...@@ -1019,10 +1019,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod): ...@@ -1019,10 +1019,6 @@ class Fp8OnlineMoEMethod(Fp8MoEMethod):
layer.w2_weight[expert, :, :] layer.w2_weight[expert, :, :]
) )
if current_platform.is_xpu():
w13.data = w13.transpose(-1, -2).contiguous()
w2.data = w2.transpose(-1, -2).contiguous()
# Shuffle weights to runtime format and setup kernel. # Shuffle weights to runtime format and setup kernel.
self._setup_kernel( self._setup_kernel(
layer, layer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment