[TPU][Bugfix] fix moe layer (#21340)

Signed-off-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: Simon Mo <simon.mo@hey.com>

[TPU][Bugfix] fix moe layer (#21340)
Signed-off-by: Chengji Yao <chengjiyao@google.com> Co-authored-by: Simon Mo <simon.mo@hey.com>
e74bfc70 · Chengji Yao · GitHub · 90eeea8f · e74bfc70 · e74bfc70
Unverified Commit e74bfc70 authored Jul 24, 2025 by Chengji Yao Committed by GitHub Jul 24, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 19 additions and 1 deletion

tests/v1/tpu/test_basic.py tests/v1/tpu/test_basic.py +1 -0

vllm/model_executor/layers/fused_moe/layer.py vllm/model_executor/layers/fused_moe/layer.py +18 -1

No files found.
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -18,6 +18,7 @@ if TYPE_CHECKING:

 MODELS = [
    "Qwen/Qwen2.5-1.5B-Instruct",
+    "Qwen/Qwen1.5-MoE-A2.7B",
    # TODO: Enable this models with v6e
    # "Qwen/Qwen2-7B-Instruct",
    # "meta-llama/Llama-3.1-8B",

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -481,8 +481,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
-        **kwargs,
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
    ):
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for CPU.")
        return layer.cpu_fused_moe(
            layer,
            x,
@@ -518,6 +526,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        e_score_correction_bias: Optional[torch.Tensor] = None,
        apply_router_weight_on_input: bool = False,
        activation: str = "silu",
+        enable_eplb: bool = False,
+        expert_load_view: Optional[torch.Tensor] = None,
+        logical_to_physical_map: Optional[torch.Tensor] = None,
+        logical_replica_count: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        assert not use_grouped_topk
        assert num_expert_group is None
@@ -531,6 +543,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            raise NotImplementedError(
                "Expert score correction bias is not supported for TPU.")
        assert activation == "silu", f"{activation} is not supported for TPU."
+        if enable_eplb is not False or expert_load_view is not None or \
+                logical_to_physical_map is not None or \
+                logical_replica_count is not None:
+            raise NotImplementedError("Expert load balancing is not supported "
+                                      "for TPU.")
        return fused_moe_pallas(hidden_states=x,
                                w1=layer.w13_weight,
                                w2=layer.w2_weight,