feat: support MQA + MoE (Qwen3 MoE) TEP/DEP in Planner Profiler (#4612)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: support MQA + MoE (Qwen3 MoE) TEP/DEP in Planner Profiler (#4612)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
5652d670 · Hongkuan Zhou · GitHub · 81ad38ef · 5652d670 · 5652d670
Unverified Commit 5652d670 authored Nov 25, 2025 by Hongkuan Zhou Committed by GitHub Nov 26, 2025
7 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -135,19 +135,6 @@ async def run_profile(args):
        args.aic_backend = args.backend

    try:
-        # Log MoE model support
-        if args.model_info.is_moe:
-            logger.info(
-                "MoE (Mixture of Experts) model profiling, sweeping TEP/DEP size for prefill and decode"
-            )
-            assert args.backend in [
-                "sglang"
-            ], "MoE model support is only available for SGLang"
-        else:
-            logger.info(
-                "Dense model profiling, sweeping TP size for prefill and decode"
-            )
-
        config_modifier = CONFIG_MODIFIERS[args.backend]

        with open(args.config, "r") as f:
@@ -162,11 +149,7 @@ async def run_profile(args):
            for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
            if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine
        ]
-        if args.model_info.is_moe:
-            logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}")
-        else:
-            logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}")
-
+        logger.info(f"Profiling GPU counts: {profile_num_gpus}")
        os.makedirs(args.output_dir, exist_ok=True)

        model_name = config_modifier.get_model_name(config)
@@ -722,11 +705,10 @@ async def run_profile(args):
        config = generate_dgd_config_with_planner(
            config_path=args.config,
            config_modifier=config_modifier,
-            best_prefill_gpus=best_prefill_gpus,
-            best_decode_gpus=best_decode_gpus,
            output_dir=args.output_dir,
            args=args,
-            is_moe_model=args.model_info.is_moe,
+            best_prefill_mapping=best_prefill_mapping,
+            best_decode_mapping=best_decode_mapping,
            num_gpus_per_node=args.num_gpus_per_node,
        )
        logger.debug(f"Final DGD config with planner: {config}")

--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -49,8 +49,8 @@ class PodSpec(BaseModel):


 class ServiceResources(BaseModel):
-    requests: Optional[dict[str, str]] = None
-    limits: Optional[dict[str, str]] = None
+    requests: Optional[dict[str, str | dict]] = None
+    limits: Optional[dict[str, str | dict]] = None


 class Service(BaseModel):
@@ -298,21 +298,37 @@ def setup_worker_service_resources(
    if worker_service.resources is None:
        worker_service.resources = ServiceResources()

-    # Ensure requests exists
-    if worker_service.resources.requests is None:
-        worker_service.resources.requests = {}
+    # Ensure limits exists
+    if worker_service.resources.limits is None:
+        worker_service.resources.limits = {}

-    # Set GPU requests
+    # Calculate GPU value
    gpu_value = (
        min(gpu_count, num_gpus_per_node)
        if num_gpus_per_node is not None
        else gpu_count
    )
-    worker_service.resources.requests["gpu"] = str(gpu_value)

-    # Update limits if they exist
-    if worker_service.resources.limits is not None:
-        worker_service.resources.limits["gpu"] = str(gpu_value)
+    def _update_resource_dict(resource_dict: dict[str, str], gpu_value: int):
+        """Helper function to update gpu and custom rdma/ib fields in a resource dictionary.
+
+        Args:
+            resource_dict: The resource dictionary (either limits or requests) to update
+            gpu_value: The GPU value to set
+        """
+        resource_dict["gpu"] = str(gpu_value)
+
+        # also update custom rdma/ib if it exists (some cluster requires this)
+        if "custom" in resource_dict:
+            if isinstance(resource_dict["custom"], dict):
+                if "rdma/ib" in resource_dict["custom"]:
+                    resource_dict["custom"]["rdma/ib"] = str(gpu_value)
+
+    # Update limits
+    _update_resource_dict(worker_service.resources.limits, gpu_value)
+    # Also update requests if they exist
+    if worker_service.resources.requests is not None:
+        _update_resource_dict(worker_service.resources.requests, gpu_value)


 def validate_and_get_worker_args(worker_service, backend):

--- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
+++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
@@ -7,7 +7,10 @@ from dataclasses import dataclass
 from enum import Enum

 from benchmarks.profiler.utils.defaults import PREFILL_MAX_NUM_TOKENS, EngineType
-from benchmarks.profiler.utils.model_info import ModelInfo
+from benchmarks.profiler.utils.model_info import (
+    MOE_ADDITIONAL_TP_ARCHITECTURES,
+    ModelInfo,
+)

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -83,6 +86,23 @@ class ParallelizationMapping:
        """
        return self.dep if self.dep is not None else 1  # TP and TEP → 1

+    def get_num_gpus(self) -> int:
+        """
+        Get the total number of GPUs for this parallelization mapping.
+
+        Returns:
+            The total number of GPUs used
+        """
+        if self.tp is not None:
+            return self.tp
+        if self.tep is not None:
+            return self.tep
+        if self.dep is not None:
+            return self.dep
+        raise ValueError(
+            "Invalid ParallelizationMapping: no parallelization strategy set"
+        )
+

 def _check_divisibility(
    value: int | None,
@@ -166,16 +186,12 @@ def get_candidate_parallel_mappings(

    candidates: list[ParallelizationMapping] = []
    if is_moe:
-        if phase == EngineType.PREFILL:
-            candidates = [
-                ParallelizationMapping(tep=num_gpus),
-                ParallelizationMapping(dep=num_gpus),
-            ]
-        elif phase == EngineType.DECODE:
-            candidates = [
-                ParallelizationMapping(dep=num_gpus),
-                ParallelizationMapping(tep=num_gpus),
-            ]
+        candidates = [
+            ParallelizationMapping(tep=num_gpus),
+            ParallelizationMapping(dep=num_gpus),
+        ]
+        if model_info.architecture in MOE_ADDITIONAL_TP_ARCHITECTURES:
+            candidates.append(ParallelizationMapping(tp=num_gpus))
    else:
        candidates = [ParallelizationMapping(tp=num_gpus)]


--- a/benchmarks/profiler/utils/config_modifiers/sglang.py
+++ b/benchmarks/profiler/utils/config_modifiers/sglang.py
@@ -205,6 +205,18 @@ class SGLangConfigModifier:

        # Set --tp argument
        args = set_argument_value(args, "--tp", str(tp_size))
+        args = remove_valued_arguments(args, "--tp-size")
+        args = remove_valued_arguments(args, "--tensor-parallel-size")
+
+        # Remove --ep if present
+        args = remove_valued_arguments(args, "--ep")
+        args = remove_valued_arguments(args, "--ep-size")
+        args = remove_valued_arguments(args, "--expert-parallel-size")
+
+        # remove --dp if present
+        args = remove_valued_arguments(args, "--dp")
+        args = remove_valued_arguments(args, "--dp-size")
+        args = remove_valued_arguments(args, "--data-parallel-size")

        worker_service.extraPodSpec.mainContainer.args = args
        return cfg.model_dump()
@@ -230,12 +242,18 @@ class SGLangConfigModifier:

        # 1. Set --tp=tep_size, if not present add it
        args = set_argument_value(args, "--tp", str(tep_size))
+        args = remove_valued_arguments(args, "--tp-size")
+        args = remove_valued_arguments(args, "--tensor-parallel-size")

-        # 2. Set --ep-size=tep_size, if not present add it
-        args = set_argument_value(args, "--ep-size", str(tep_size))
+        # 2. Set --ep=tep_size, if not present add it
+        args = set_argument_value(args, "--ep", str(tep_size))
+        args = remove_valued_arguments(args, "--ep-size")
+        args = remove_valued_arguments(args, "--expert-parallel-size")

        # 3. Remove --dp if present
        args = remove_valued_arguments(args, "--dp")
+        args = remove_valued_arguments(args, "--dp-size")
+        args = remove_valued_arguments(args, "--data-parallel-size")

        # 4. Remove --enable-dp-attention if present
        if "--enable-dp-attention" in args:
@@ -265,16 +283,21 @@ class SGLangConfigModifier:

        # 1. Set --tp=dep_size
        args = set_argument_value(args, "--tp", str(dep_size))
+        args = remove_valued_arguments(args, "--tp-size")
+        args = remove_valued_arguments(args, "--tensor-parallel-size")

        # 2. Set --dp=dep_size (data parallelism across experts)
        args = set_argument_value(args, "--dp", str(dep_size))
+        args = remove_valued_arguments(args, "--dp-size")
+        args = remove_valued_arguments(args, "--data-parallel-size")

        # 3. Enable --enable-dp-attention
-        if "--enable-dp-attention" not in args:
-            args = append_argument(args, "--enable-dp-attention")
+        args = append_argument(args, "--enable-dp-attention")

-        # 4. Set --ep-size=dep_size (expert parallelism size)
-        args = set_argument_value(args, "--ep-size", str(dep_size))
+        # 4. Set --ep=dep_size (expert parallelism size)
+        args = set_argument_value(args, "--ep", str(dep_size))
+        args = remove_valued_arguments(args, "--ep-size")
+        args = remove_valued_arguments(args, "--expert-parallel-size")

        worker_service.extraPodSpec.mainContainer.args = args
        return cfg.model_dump()

--- a/benchmarks/profiler/utils/dgd_generation.py
+++ b/benchmarks/profiler/utils/dgd_generation.py
@@ -20,6 +20,9 @@ import numpy as np
 import yaml

 from benchmarks.profiler.utils.config import Config, DgdPlannerServiceConfig
+from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
+    ParallelizationMapping,
+)
 from benchmarks.profiler.utils.planner_utils import build_planner_args_from_namespace
 from dynamo.common.utils.paths import get_workspace_dir
 from dynamo.planner.defaults import SubComponentType
@@ -28,11 +31,10 @@ from dynamo.planner.defaults import SubComponentType
 def generate_dgd_config_with_planner(
    config_path: str,
    config_modifier,
-    best_prefill_gpus: int,
-    best_decode_gpus: int,
    output_dir: str,
    args,
-    is_moe_model: bool = False,
+    best_prefill_mapping: ParallelizationMapping,
+    best_decode_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
 ):
    """Generate DGD config with planner based on profiling results.
@@ -40,12 +42,11 @@ def generate_dgd_config_with_planner(
    Args:
        config_path: Path to the YAML config file
        config_modifier: Config modifier instance (e.g., SGLangConfigModifier)
-        best_prefill_gpus: Number of GPUs for prefill engine
-        best_decode_gpus: Number of GPUs for decode engine
        output_dir: Output directory for profile results
        args: Parsed arguments namespace from profile_sla
-        is_moe_model: Whether this is an MoE model
-        num_gpus_per_node: Number of GPUs per node (for MoE models)
+        best_prefill_mapping: Parallel mapping for prefill (TP/TEP/DEP)
+        best_decode_mapping: Parallel mapping for decode (TP/TEP/DEP)
+        num_gpus_per_node: Number of GPUs per node (for TEP/DEP models)

    Returns:
        list[dict] | dict: If a ConfigMap is generated for planner data, returns a list
@@ -61,28 +62,52 @@ def generate_dgd_config_with_planner(
    if args.dgd_image:
        config = config_modifier.update_image(config, args.dgd_image)

-    if not is_moe_model:
-        # dense model, use TP for both prefill and decode
+    # Apply prefill parallelization based on the actual mapping used in profiling
+    if best_prefill_mapping.tp is not None:
+        # Dense model or TP for prefill
        config = config_modifier.set_config_tp_size(
-            config, best_prefill_gpus, SubComponentType.PREFILL
+            config, best_prefill_mapping.tp, SubComponentType.PREFILL
        )
+    elif best_prefill_mapping.tep is not None:
+        # MoE model with TEP for prefill
+        config = config_modifier.set_config_tep_size(
+            config,
+            best_prefill_mapping.tep,
+            num_gpus_per_node,
+            SubComponentType.PREFILL,
+        )
+    elif best_prefill_mapping.dep is not None:
+        # MoE model with DEP for prefill
+        config = config_modifier.set_config_dep_size(
+            config,
+            best_prefill_mapping.dep,
+            num_gpus_per_node,
+            SubComponentType.PREFILL,
+        )
+
+    # Apply decode parallelization based on the actual mapping used in profiling
+    if best_decode_mapping.tp is not None:
+        # Dense model or TP for decode
        config = config_modifier.set_config_tp_size(
-            config, best_decode_gpus, SubComponentType.DECODE
+            config, best_decode_mapping.tp, SubComponentType.DECODE
        )
-    else:
-        # MoE model, use TEP for prefill and DEP for decode
+    elif best_decode_mapping.tep is not None:
+        # MoE model with TEP for decode
        config = config_modifier.set_config_tep_size(
            config,
-            best_prefill_gpus,
+            best_decode_mapping.tep,
            num_gpus_per_node,
-            SubComponentType.PREFILL,
+            SubComponentType.DECODE,
        )
+    elif best_decode_mapping.dep is not None:
+        # MoE model with DEP for decode
        config = config_modifier.set_config_dep_size(
            config,
-            best_decode_gpus,
+            best_decode_mapping.dep,
            num_gpus_per_node,
            SubComponentType.DECODE,
        )
+
    config = Config.model_validate(config)

    # add the planner service
@@ -121,8 +146,8 @@ def generate_dgd_config_with_planner(
    planner_args.extend(
        [
            f"--namespace={frontend_namespace}",
-            f"--prefill-engine-num-gpu={best_prefill_gpus}",
-            f"--decode-engine-num-gpu={best_decode_gpus}",
+            f"--prefill-engine-num-gpu={best_prefill_mapping.get_num_gpus()}",
+            f"--decode-engine-num-gpu={best_decode_mapping.get_num_gpus()}",
            f"--profile-results-dir={cm_mount_path}",
        ]
    )

--- a/benchmarks/profiler/utils/model_info.py
+++ b/benchmarks/profiler/utils/model_info.py
@@ -28,8 +28,16 @@ CONTEXT_LENGTH_ATTRS = [
    "sliding_window",  # Mistral with sliding window attention
 ]

-# only for MLA + MoE models, treat other MoE models as dense models
-MOE_ARCHITECTURES = {"DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"}
+# supported MoE architectures
+MOE_ARCHITECTURES = {
+    "DeepseekV3ForCausalLM",
+    "DeepseekV32ForCausalLM",  # MLA + MoE
+    "Qwen3MoeForCausalLM",  # GQA + MoE
+}
+# MoE architectures that sweeps TP additionally to TEP/DEP
+MOE_ADDITIONAL_TP_ARCHITECTURES = {
+    "Qwen3MoeForCausalLM",  # GQA + MoE
+}


 def get_local_model_weight_size(

--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -21,15 +21,20 @@ This document covers:

 ## Support Matrix

-| Backend | Dense Models (P:TP, D:TP) | MoE Models (P:TEP, D:DEP) |
+| Backend | Dense Models | MoE Models |
 |---------|-------------|------------|
 | vLLM | ✅ | 🚧 |
 | SGLang | ✅ | ✅ |
 | TensorRT-LLM | ✅ | 🚧 |

+Specifically, the profiler sweeps over the following parallelization mapping for prefill and decode:
+| Model Architecture | Prefill Parallelization Mapping | Decode Parallelization Mapping |
+|---------|-------------|------------|
+| MLA+MoE (DeepseekV3ForCausalLM, DeepseekV32ForCausalLM) | TEP, DEP | TEP, DEP |
+| GQA+MoE (Qwen3MoeForCausalLM) | TP, TEP, DEP | TP, TEP, DEP |
+| Other Models | TP | TP |
+
 > [!NOTE]
-> - We only support multi-node engines for MoE models.
-> - For MoE models, we currently only support deepseek-style MLA+MoE models. For other MoE models like GQA+MoE, please use the dense mode (sweep over TP sizes) instead.
 > - Exact model x parallelization mapping support is dependent on the backend. The profiler does not guarantee that the recommended P/D engine configuration is supported and bug-free by the backend.

 ## Using DGDR for Profiling (Recommended)
@@ -269,7 +274,7 @@ profilingConfig:
 **When to use:**
 - **min_num_gpus_per_engine**: Skip small TP sizes if your model is large
 - **max_num_gpus_per_engine**: Limit search space or work around constraints (e.g., [AIC attention heads](#ai-configurator-attention-head-constraint-error))
- **num_gpus_per_node**: Required for MoE models with TEP/DEP sizing
+- **num_gpus_per_node**: Determine the upper bound of number of GPUs per node for dense models and configure Grove for multi-node MoE engines.
 - **gpu_type**: Informational, auto-detected by controller

 > [!TIP]