[XPU] Enable Expert parallel for MoE models (#28263)

Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>

[XPU] Enable Expert parallel for MoE models (#28263)
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
1aaecda0 · Kunshang Ji · GitHub · 811df41e · 1aaecda0 · 1aaecda0
Unverified commit 1aaecda0, authored Nov 08, 2025 by Kunshang Ji; committed by GitHub on Nov 08, 2025.
3 changed files
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -642,10 +642,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        if current_platform.is_xpu():
            import intel_extension_for_pytorch as ipex
+            ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
            layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
                layer.w13_weight,
                layer.w2_weight,
                use_prepack=True,
+                experts_start_id=ep_rank_start,
            )
        elif current_platform.is_cpu():
            from vllm.model_executor.layers.fused_moe import cpu_fused_moe

--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -399,6 +399,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
        import intel_extension_for_pytorch as ipex
+        ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
            layer.w13_weight,
            layer.w2_weight,
@@ -407,6 +408,7 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
            a1_scale_inv=layer.w13_input_scale,
            a2_scale_inv=layer.w2_input_scale,
            use_prepack=True,
+            experts_start_id=ep_rank_start,
        )
    def get_fused_moe_quant_config(

--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1113,6 +1113,7 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
        layer.w13_weight.data = layer.w13_weight.data.view(torch.int32)
        layer.w2_weight.data = layer.w2_weight.data.view(torch.int32)
+        ep_rank_start = self.moe_config.ep_rank * self.moe_config.num_local_experts
        layer.ipex_fusion = ipex.llm.modules.GatedMLPMOE(
            layer.w13_weight,
            layer.w2_weight,
@@ -1121,6 +1122,7 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod):
            w13_bias=layer.w13_bias,
            w2_bias=layer.w2_bias,
            is_mxfp4=True,
+            experts_start_id=ep_rank_start,
        )
    def apply(