Unverified Commit 2f47d710 authored by Xiaoyu Zhang, committed by GitHub

refine some typos (#3473)

parent 4fe92bfc
@@ -30,7 +30,7 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
-    elif config.architectures[0] == "DeepseekV2ForCausalLM":
+    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
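
A minimal standalone sketch (not the benchmark file itself) of what the updated branch does: DeepseekV2ForCausalLM and DeepseekV3ForCausalLM configs expose the same fields, so one code path can size the MoE layers for both. The helper name deepseek_moe_shape is hypothetical; the config field names are taken from the diff above.

from transformers import AutoConfig


def deepseek_moe_shape(model_name: str, tp_size: int):
    # Hypothetical helper mirroring the branch above, not sglang's get_model_config.
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    if config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
        E = config.n_routed_experts            # number of routed experts
        topk = config.num_experts_per_tok      # experts activated per token
        intermediate_size = config.intermediate_size
        # Each tensor-parallel rank holds a slice of the gate/up projections.
        shard_intermediate_size = 2 * intermediate_size // tp_size
        return E, topk, shard_intermediate_size
    raise ValueError(f"unsupported architecture: {config.architectures[0]}")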

@@ -1094,7 +1094,7 @@ def fused_moe(
     - num_expert_group: Optional[int]: additional parameter for grouped_topk
     - topk_group: Optional[int]: additional parameter for grouped_topk
     - use_grouped_topk: If True, use grouped_topk instead of fused_topk
-        note: Deepseekv2 model uses grouped_topk
+        note: Deepseek V2/V3/R1 series models use grouped_topk
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
         products for w1 and w2. Defaults to False.
     - use_int8_w8a16 (bool): If True, use fp8 arithmetic to compute the inner

@@ -75,7 +75,7 @@ def fused_topk(
     return topk_weights, topk_ids


-# This is used by the Deepseek-V2 model
+# This is used by the Deepseek V2/V3/R1 series models
 @torch.compile(dynamic=True, backend=get_compiler_backend())
 def grouped_topk(
     hidden_states: torch.Tensor,
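
For context, grouped_topk implements the group-limited routing used by the DeepSeek MoE models: experts are partitioned into num_expert_group groups, only the best topk_group groups are kept per token, and the final top-k experts are chosen inside those groups. Below is a simplified eager-mode reference of that idea, assuming a softmax gate and renormalized weights; it is a sketch, not the compiled sglang kernel.

import torch


def grouped_topk_reference(
    gating_logits: torch.Tensor,  # [num_tokens, num_experts]
    topk: int,
    num_expert_group: int,
    topk_group: int,
    renormalize: bool = True,
):
    scores = torch.softmax(gating_logits, dim=-1)
    num_tokens, num_experts = scores.shape
    # Score each group by its best expert and keep only the top `topk_group` groups.
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False).indices
    group_mask = torch.zeros_like(group_scores)
    group_mask.scatter_(1, group_idx, 1)
    # Zero out experts outside the selected groups, then take the final top-k.
    score_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_tokens, num_expert_group, num_experts // num_expert_group)
        .reshape(num_tokens, num_experts)
    )
    masked_scores = scores.masked_fill(score_mask == 0, 0.0)
    topk_weights, topk_ids = torch.topk(masked_scores, k=topk, dim=-1, sorted=False)
    if renormalize:
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids


# Example: 4 tokens routed over 64 experts split into 8 groups.
weights, ids = grouped_topk_reference(torch.randn(4, 64), topk=6, num_expert_group=8, topk_group=3)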

@@ -795,7 +795,7 @@ class ServerArgs:
         parser.add_argument(
             "--disable-mla",
             action="store_true",
-            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
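
As a standalone illustration (not the ServerArgs class itself), the flag is a plain argparse store_true switch, so passing --disable-mla just sets a boolean on the parsed server arguments.

import argparse

# Minimal sketch of the flag's behavior; the real flag is registered inside the
# ServerArgs class shown in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-mla",
    action="store_true",
    help="Disable Multi-head Latent Attention (MLA) for DeepSeek V2/V3/R1 series models.",
)
args = parser.parse_args(["--disable-mla"])
assert args.disable_mla is True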