Unverified commit e74bfc70, authored by Chengji Yao and committed by GitHub
Browse files

[TPU][Bugfix] fix moe layer (#21340)


Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
parent 90eeea8f
...@@ -18,6 +18,7 @@ if TYPE_CHECKING: ...@@ -18,6 +18,7 @@ if TYPE_CHECKING:
MODELS = [ MODELS = [
"Qwen/Qwen2.5-1.5B-Instruct", "Qwen/Qwen2.5-1.5B-Instruct",
"Qwen/Qwen1.5-MoE-A2.7B",
# TODO: Enable this models with v6e # TODO: Enable this models with v6e
# "Qwen/Qwen2-7B-Instruct", # "Qwen/Qwen2-7B-Instruct",
# "meta-llama/Llama-3.1-8B", # "meta-llama/Llama-3.1-8B",
......
...@@ -481,8 +481,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -481,8 +481,16 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
e_score_correction_bias: Optional[torch.Tensor] = None, e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False, apply_router_weight_on_input: bool = False,
activation: str = "silu", activation: str = "silu",
**kwargs, enable_eplb: bool = False,
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
): ):
if enable_eplb is not False or expert_load_view is not None or \
logical_to_physical_map is not None or \
logical_replica_count is not None:
raise NotImplementedError("Expert load balancing is not supported "
"for CPU.")
return layer.cpu_fused_moe( return layer.cpu_fused_moe(
layer, layer,
x, x,
...@@ -518,6 +526,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -518,6 +526,10 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
e_score_correction_bias: Optional[torch.Tensor] = None, e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False, apply_router_weight_on_input: bool = False,
activation: str = "silu", activation: str = "silu",
enable_eplb: bool = False,
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
) -> torch.Tensor: ) -> torch.Tensor:
assert not use_grouped_topk assert not use_grouped_topk
assert num_expert_group is None assert num_expert_group is None
...@@ -531,6 +543,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -531,6 +543,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
raise NotImplementedError( raise NotImplementedError(
"Expert score correction bias is not supported for TPU.") "Expert score correction bias is not supported for TPU.")
assert activation == "silu", f"{activation} is not supported for TPU." assert activation == "silu", f"{activation} is not supported for TPU."
if enable_eplb is not False or expert_load_view is not None or \
logical_to_physical_map is not None or \
logical_replica_count is not None:
raise NotImplementedError("Expert load balancing is not supported "
"for TPU.")
return fused_moe_pallas(hidden_states=x, return fused_moe_pallas(hidden_states=x,
w1=layer.w13_weight, w1=layer.w13_weight,
w2=layer.w2_weight, w2=layer.w2_weight,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment