Unverified Commit 2f47d710 authored by Xiaoyu Zhang, committed by GitHub

refine some typos (#3473)

parent 4fe92bfc
@@ -30,7 +30,7 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
-    elif config.architectures[0] == "DeepseekV2ForCausalLM":
+    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
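
A minimal standalone sketch (not the benchmark file itself) of what the updated branch does: DeepseekV2ForCausalLM and DeepseekV3ForCausalLM configs expose the same fields, so one code path can size the MoE layers for both. The helper name deepseek_moe_shape is hypothetical; the config field names are taken from the diff above.

from transformers import AutoConfig


def deepseek_moe_shape(model_name: str, tp_size: int):
    # Hypothetical helper mirroring the branch above, not sglang's get_model_config.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    if config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
        E = config.n_routed_experts            # number of routed experts
        topk = config.num_experts_per_tok      # experts activated per token
        intermediate_size = config.intermediate_size
        # Each tensor-parallel rank holds a slice of the gate/up projections.
        shard_intermediate_size = 2 * intermediate_size // tp_size
        return E, topk, shard_intermediate_size
    raise ValueError(f"unsupported architecture: {config.architectures[0]}")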

@@ -1094,7 +1094,7 @@ def fused_moe(
     - num_expert_group: Optional[int]: additional parameter for grouped_topk
     - topk_group: Optional[int]: additional parameter for grouped_topk
     - use_grouped_topk: If True, use grouped_topk instead of fused_topk
-        note: Deepseekv2 model uses grouped_topk
+        note: Deepseek V2/V3/R1 series models use grouped_topk
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
     - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner

@@ -75,7 +75,7 @@ def fused_topk(
     return topk_weights, topk_ids


-# This is used by the Deepseek-V2 model
+# This is used by the Deepseek V2/V3/R1 series models
 @torch.compile(dynamic=True, backend=get_compiler_backend())
 def grouped_topk(
     hidden_states: torch.Tensor,
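
For context, grouped_topk implements the group-limited routing used by the DeepSeek MoE models: experts are partitioned into num_expert_group groups, only the best topk_group groups are kept per token, and the final top-k experts are chosen inside those groups. Below is a simplified eager-mode reference of that idea, assuming a softmax gate and renormalized weights; it is a sketch, not the compiled sglang kernel.

import torch


def grouped_topk_reference(
    gating_logits: torch.Tensor,  # [num_tokens, num_experts]
    topk: int,
    num_expert_group: int,
    topk_group: int,
    renormalize: bool = True,
):
    scores = torch.softmax(gating_logits, dim=-1)
    num_tokens, num_experts = scores.shape
    # Score each group by its best expert and keep only the top `topk_group` groups.
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False).indices
    group_mask = torch.zeros_like(group_scores)
    group_mask.scatter_(1, group_idx, 1)
    # Zero out experts outside the selected groups, then take the final top-k.
    score_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_tokens, num_expert_group, num_experts // num_expert_group)
        .reshape(num_tokens, num_experts)
    )
    masked_scores = scores.masked_fill(score_mask == 0, 0.0)
    topk_weights, topk_ids = torch.topk(masked_scores, k=topk, dim=-1, sorted=False)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


# Example: 4 tokens routed over 64 experts split into 8 groups.
weights, ids = grouped_topk_reference(torch.randn(4, 64), topk=6, num_expert_group=8, topk_group=3)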

@@ -795,7 +795,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
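
As a standalone illustration (not the ServerArgs class itself), the flag is a plain argparse store_true switch, so passing --disable-mla just sets a boolean on the parsed server arguments.

import argparse

# Minimal sketch of the flag's behavior; the real flag is registered inside the
# ServerArgs class shown in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-mla",
    action="store_true",
    help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
)
args = parser.parse_args(["--disable-mla"])
assert args.disable_mla is True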