feat: support MoE model in SLA Planner Sglang (#3185)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>

feat: support MoE model in SLA Planner Sglang (#3185)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
6243bcbe · Hongkuan Zhou · GitHub · 8f338a63 · 6243bcbe · 6243bcbe
Unverified Commit 6243bcbe authored Sep 23, 2025 by Hongkuan Zhou Committed by GitHub Sep 23, 2025
19 changed files
--- a/benchmarks/profiler/deploy/profile_sla_moe_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_moe_job.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: profile-sla
+  namespace: ${NAMESPACE}
+spec:
+  template:
+    spec:
+      serviceAccountName: dynamo-sa
+      containers:
+      - name: profile-sla
+        image: ${DOCKER_IMAGE}
+        resources:
+          requests:
+            cpu: "32"
+            memory: "50Gi"
+        env:
+          - name: HUGGING_FACE_HUB_TOKEN
+            valueFrom:
+              secretKeyRef:
+                name: hf-token-secret
+                key: HF_TOKEN
+          - name: NATS_SERVER
+            value: nats://${NAMESPACE}-nats:4222
+          - name: ETCD_ENDPOINTS
+            value: ${NAMESPACE}-etcd:2379
+        workingDir: /sgl-workspace/dynamo
+        command: ["python", "-m", "benchmarks.profiler.profile_sla"]
+        args:
+          - --config
+          - /sgl-workspace/dynamo/recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml
+          - --output-dir
+          - /data/profiling_results
+          - --namespace
+          - ${NAMESPACE}
+          - --backend
+          - sglang
+          - --is-moe-model
+          - --min-num-gpus-per-engine
+          - "8"
+          - --max-num-gpus-per-engine
+          - "16"
+          - --isl
+          - "3000"
+          - --osl
+          - "150"
+          - --ttft
+          - "200"
+          - --itl
+          - "20"
+        volumeMounts:
+          - name: output-volume
+            mountPath: /data
+      restartPolicy: Never
+      volumes:
+        - name: output-volume
+          persistentVolumeClaim:
+            claimName: dynamo-pvc
+  backoffLimit: 0
--- a/benchmarks/profiler/profile_endpoint.py
+++ b/benchmarks/profiler/profile_endpoint.py
@@ -22,6 +22,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
+    # TODO: use kebab case
    parser.add_argument(
        "--mode",
        type=str,
@@ -79,6 +80,12 @@ if __name__ == "__main__":
        default=8,
        help="interpolation granularity for the results",
    )
+    parser.add_argument(
+        "--attention_dp_size",
+        type=int,
+        default=1,
+        help="attention dp size of the endpoint for MoE models",
+    )
    args = parser.parse_args()
    os.makedirs(args.work_dir, exist_ok=True)
@@ -105,6 +112,7 @@ if __name__ == "__main__":
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
+            args.attention_dp_size,
        )
    else:
        raise ValueError(f"Invalid mode: {args.mode}")
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -15,6 +15,7 @@
 import json
 import logging
+import math
 import re
 import shlex
 from typing import Literal, Optional, Protocol
@@ -79,6 +80,10 @@ class Config(BaseModel):
    model_config = {"extra": "allow"}
+class MultinodeConfig(BaseModel):
+    nodeCount: int
 def break_arguments(args: list[str] | None) -> list[str]:
    ans: list[str] = []
    if args is None:
@@ -159,15 +164,114 @@ def parse_override_engine_args(args: list[str]) -> tuple[dict, list[str]]:
    return override_dict, args
+def set_multinode_config(worker_service, gpu_count: int, num_gpus_per_node: int):
+    """Helper function to set multinode configuration based on GPU count and GPUs per node."""
+    if gpu_count <= num_gpus_per_node:
+        # Single node: remove multinode configuration if present
+        if (
+            hasattr(worker_service, "multinode")
+            and worker_service.multinode is not None
+        ):
+            worker_service.multinode = None
+    else:
+        # Multi-node: set nodeCount = math.ceil(gpu_count / num_gpus_per_node)
+        node_count = math.ceil(gpu_count / num_gpus_per_node)
+        if not hasattr(worker_service, "multinode") or worker_service.multinode is None:
+            # Create multinode configuration if it doesn't exist
+            worker_service.multinode = MultinodeConfig(nodeCount=node_count)
+        else:
+            # Handle both dict (from YAML) and MultinodeConfig object cases
+            if isinstance(worker_service.multinode, dict):
+                worker_service.multinode["nodeCount"] = node_count
+            else:
+                worker_service.multinode.nodeCount = node_count
+# TODO: make it work for all frameworks
+def get_worker_service_from_config(config: dict):
+    """Helper function to get the SGLang decode worker service from config."""
+    cfg = Config.model_validate(config)
+    return cfg.spec.services[WORKER_COMPONENT_NAMES["sglang"].decode_worker_k8s_name]
+# TODO: make it work for all frameworks
+def setup_worker_service_resources(
+    worker_service, gpu_count: int, num_gpus_per_node: Optional[int] = None
+):
+    """Helper function to set up worker service resources (requests and limits)."""
+    # Handle multinode configuration if num_gpus_per_node is provided
+    if num_gpus_per_node is not None:
+        set_multinode_config(worker_service, gpu_count, num_gpus_per_node)
+    # Ensure resources exists
+    if worker_service.resources is None:
+        worker_service.resources = ServiceResources()
+    # Ensure requests exists
+    if worker_service.resources.requests is None:
+        worker_service.resources.requests = {}
+    # Set GPU requests
+    gpu_value = (
+        min(gpu_count, num_gpus_per_node)
+        if num_gpus_per_node is not None
+        else gpu_count
+    )
+    worker_service.resources.requests["gpu"] = str(gpu_value)
+    # Update limits if they exist
+    if worker_service.resources.limits is not None:
+        worker_service.resources.limits["gpu"] = str(gpu_value)
+# TODO: make it work for all frameworks
+def validate_and_get_worker_args(worker_service):
+    """Helper function to validate worker service and get its arguments."""
+    if not worker_service.extraPodSpec or not worker_service.extraPodSpec.mainContainer:
+        raise ValueError(
+            f"Missing extraPodSpec or mainContainer in SGLang decode worker service '{WORKER_COMPONENT_NAMES['sglang'].decode_worker_k8s_name}'"
+        )
+    args = worker_service.extraPodSpec.mainContainer.args
+    return break_arguments(args)
+def set_argument_value(args: list, arg_name: str, value: str):
+    """Helper function to set an argument value, adding it if not present."""
+    try:
+        idx = args.index(arg_name)
+        args[idx + 1] = value
+    except ValueError:
+        args = append_argument(args, [arg_name, value])
+    return args
 class ConfigModifierProtocol(Protocol):
    @classmethod
-    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
+    def convert_config(
+        cls,
+        config: dict,
+        target: Literal["prefill", "decode"],
+        is_moe_model: bool = False,
+    ) -> dict:
        ...
    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int) -> dict:
        ...
+    @classmethod
+    def set_config_tep_size(
+        cls, config: dict, tep_size: int, num_gpus_per_node: int
+    ) -> dict:
+        ...
+    @classmethod
+    def set_config_dep_size(
+        cls, config: dict, dep_size: int, num_gpus_per_node: int
+    ) -> dict:
+        ...
    @classmethod
    def get_model_name(cls, config: dict) -> str:
        ...
@@ -177,13 +281,25 @@ class ConfigModifierProtocol(Protocol):
        ...
    @classmethod
-    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
+    def get_kv_cache_size_from_dynamo_log(
+        cls, dynamo_log_fn: str, attention_dp_size: int = 1
+    ) -> int:
        ...
 class VllmV1ConfigModifier:
    @classmethod
-    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
+    def convert_config(
+        cls,
+        config: dict,
+        target: Literal["prefill", "decode"],
+        is_moe_model: bool = False,
+    ) -> dict:
+        if is_moe_model:
+            raise NotImplementedError(
+                "MoE model support is not implemented for VLLM backend"
+            )
        cfg = Config.model_validate(config)
        # set metadata name
@@ -308,6 +424,18 @@ class VllmV1ConfigModifier:
        return cfg.model_dump()
+    @classmethod
+    def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
+        raise NotImplementedError(
+            "TEP (Tensor Expert Parallelism) is not implemented for VLLM backend"
+        )
+    @classmethod
+    def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
+        raise NotImplementedError(
+            "DEP (Data Expert Parallelism) is not implemented for VLLM backend"
+        )
    @classmethod
    def get_model_name(cls, config: dict) -> str:
        cfg = Config.model_validate(config)
@@ -365,8 +493,9 @@ class VllmV1ConfigModifier:
            return DYNAMO_RUN_DEFAULT_PORT
    @classmethod
-    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
+    def get_kv_cache_size_from_dynamo_log(
-        # TODO
+        cls, dynamo_log_fn: str, attention_dp_size: int = 1
+    ) -> int:
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
@@ -390,7 +519,12 @@ class VllmV1ConfigModifier:
 class SGLangConfigModifier:
    @classmethod
-    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
+    def convert_config(
+        cls,
+        config: dict,
+        target: Literal["prefill", "decode"],
+        is_moe_model: bool = False,
+    ) -> dict:
        cfg = Config.model_validate(config)
        # set metadata name
@@ -425,9 +559,10 @@ class SGLangConfigModifier:
            args = break_arguments(args)
-            # remove `--disaggregation-mode` and `--disaggregation-transfer-backend`
+            # remove disagg flags
            args = remove_valued_arguments(args, "--disaggregation-mode")
            args = remove_valued_arguments(args, "--disaggregation-transfer-backend")
+            args = remove_valued_arguments(args, "--disaggregation-bootstrap-port")
            # disable prefix caching
            if "--disable-radix-cache" not in args:
@@ -455,14 +590,25 @@ class SGLangConfigModifier:
            args = break_arguments(args)
-            # remove `--disaggregation-mode` and `--disaggregation-transfer-backend`
+            # remove disagg flags
            args = remove_valued_arguments(args, "--disaggregation-mode")
            args = remove_valued_arguments(args, "--disaggregation-transfer-backend")
+            args = remove_valued_arguments(args, "--disaggregation-bootstrap-port")
            # enable prefix caching
            if "--disable-radix-cache" in args:
                args.remove("--disable-radix-cache")
+            if is_moe_model:
+                # need to use round_robin dp attention routing for MoE models to ensure kv reuse can skip prefill
+                if "--load-balance-method" in args:
+                    idx = args.index("--load-balance-method")
+                    args[idx + 1] = "round_robin"
+                else:
+                    args = append_argument(
+                        args, ["--load-balance-method", "round_robin"]
+                    )
            worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
        # set num workers to 1
@@ -471,49 +617,77 @@ class SGLangConfigModifier:
        ]
        decode_worker_config["replicas"] = 1
-        return config
+        return cfg.model_dump()
    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int):
        cfg = Config.model_validate(config)
+        worker_service = get_worker_service_from_config(config)
-        worker_service = cfg.spec.services[
+        # Set up resources
-            WORKER_COMPONENT_NAMES["sglang"].decode_worker_k8s_name
+        setup_worker_service_resources(worker_service, tp_size)
-        ]
-        # Ensure resources exists
+        # Get and validate args
-        if worker_service.resources is None:
+        args = validate_and_get_worker_args(worker_service)
-            worker_service.resources = ServiceResources()
-        # Ensure requests exists
+        # Set --tp argument
-        if worker_service.resources.requests is None:
+        args = set_argument_value(args, "--tp", str(tp_size))
-            worker_service.resources.requests = {}
-        worker_service.resources.requests["gpu"] = str(tp_size)
+        worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
+        return cfg.model_dump()
-        # Update limits if they exist
+    @classmethod
-        if worker_service.resources.limits is not None:
+    def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
-            worker_service.resources.limits["gpu"] = str(tp_size)
+        cfg = Config.model_validate(config)
+        worker_service = get_worker_service_from_config(config)
-        if (
+        # Set up resources with multinode configuration
-            not worker_service.extraPodSpec
+        setup_worker_service_resources(worker_service, tep_size, num_gpus_per_node)
-            or not worker_service.extraPodSpec.mainContainer
-        ):
-            raise ValueError(
-                f"Missing extraPodSpec or mainContainer in SGLang decode worker service '{WORKER_COMPONENT_NAMES['sglang'].decode_worker_k8s_name}'"
-            )
-        args = worker_service.extraPodSpec.mainContainer.args
-        args = break_arguments(args)
+        # Get and validate args
+        args = validate_and_get_worker_args(worker_service)
-        try:
+        # 1. Set --tp=tep_size, if not present add it
-            idx = args.index("--tp")
+        args = set_argument_value(args, "--tp", str(tep_size))
-            args[idx + 1] = str(tp_size)
-        except ValueError:
+        # 2. Set --ep-size=tep_size, if not present add it
-            args = append_argument(args, ["--tp", str(tp_size)])
+        args = set_argument_value(args, "--ep-size", str(tep_size))
+        # 3. Remove --dp if present
+        args = remove_valued_arguments(args, "--dp")
+        # 4. Remove --enable-dp-attention if present
+        if "--enable-dp-attention" in args:
+            args.remove("--enable-dp-attention")
        worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
+        return cfg.model_dump()
+    @classmethod
+    def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
+        cfg = Config.model_validate(config)
+        worker_service = get_worker_service_from_config(config)
+        # Set up resources with multinode configuration
+        setup_worker_service_resources(worker_service, dep_size, num_gpus_per_node)
+        # Get and validate args
+        args = validate_and_get_worker_args(worker_service)
+        # 1. Set --tp=dep_size
+        args = set_argument_value(args, "--tp", str(dep_size))
+        # 2. Set --dp=dep_size (data parallelism across experts)
+        args = set_argument_value(args, "--dp", str(dep_size))
+        # 3. Enable --enable-dp-attention
+        if "--enable-dp-attention" not in args:
+            args = append_argument(args, "--enable-dp-attention")
+        # 4. Set --ep-size=dep_size (expert parallelism size)
+        args = set_argument_value(args, "--ep-size", str(dep_size))
+        worker_service.extraPodSpec.mainContainer.args = join_arguments(args)
        return cfg.model_dump()
    @classmethod
@@ -573,8 +747,9 @@ class SGLangConfigModifier:
            return DYNAMO_RUN_DEFAULT_PORT
    @classmethod
-    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
+    def get_kv_cache_size_from_dynamo_log(
-        # TODO
+        cls, dynamo_log_fn: str, attention_dp_size: int = 1
+    ) -> int:
        try:
            with open(dynamo_log_fn, "r") as f:
                for line in f:
@@ -582,7 +757,7 @@ class SGLangConfigModifier:
                        # Extract the number after "#tokens:"
                        match = re.search(r"#tokens:\s*(\d+)", line)
                        if match:
-                            return int(match.group(1))
+                            return int(match.group(1)) * attention_dp_size
        except Exception as e:
            logger.warning(f"Failed to parse KV cache size from log file. Error: {e}")
        return 0
@@ -590,7 +765,17 @@ class SGLangConfigModifier:
 class TrtllmConfigModifier:
    @classmethod
-    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
+    def convert_config(
+        cls,
+        config: dict,
+        target: Literal["prefill", "decode"],
+        is_moe_model: bool = False,
+    ) -> dict:
+        if is_moe_model:
+            raise NotImplementedError(
+                "MoE model support is not implemented for TrtLLM backend"
+            )
        cfg = Config.model_validate(config)
        # set metadata name
@@ -752,6 +937,18 @@ class TrtllmConfigModifier:
        return cfg.model_dump()
+    @classmethod
+    def set_config_tep_size(cls, config: dict, tep_size: int, num_gpus_per_node: int):
+        raise NotImplementedError(
+            "TEP (Tensor Expert Parallelism) is not implemented for TrtLLM backend"
+        )
+    @classmethod
+    def set_config_dep_size(cls, config: dict, dep_size: int, num_gpus_per_node: int):
+        raise NotImplementedError(
+            "DEP (Data Expert Parallelism) is not implemented for TrtLLM backend"
+        )
    @classmethod
    def get_model_name(cls, config: dict) -> str:
        cfg = Config.model_validate(config)
@@ -810,7 +1007,9 @@ class TrtllmConfigModifier:
        return DYNAMO_RUN_DEFAULT_PORT
    @classmethod
-    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
+    def get_kv_cache_size_from_dynamo_log(
+        cls, dynamo_log_fn: str, attention_dp_size: int = 1
+    ) -> int:
        # TRT-LLM log parsing for KV cache size
        # Format: [TensorRT-LLM][INFO] [MemUsageChange] Allocated XX GiB for max tokens in paged KV cache (XXXXXX).
        try:

--- a/benchmarks/profiler/utils/defaults.py
+++ b/benchmarks/profiler/utils/defaults.py
@@ -13,22 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-DECODE_NUM_REQUESTS_RANGE = [
-    1,
-    5,
-    10,
-    25,
-    50,
-    100,
-    150,
-    200,
-    250,
-    300,
-    350,
-    400,
-    450,
-    500,
-]
 DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B"
 DYNAMO_RUN_DEFAULT_PORT = 8000
+# set a decode maximum concurrency due to limits of profiling tools
+# for MoE models with attn-dp, we might hit this limit
+DECODE_MAX_CONCURRENCY = 2000
--- a/benchmarks/profiler/utils/plot.py
+++ b/benchmarks/profiler/utils/plot.py
@@ -32,13 +32,13 @@ logger.addHandler(console_handler)
 def plot_prefill_performance(
-    prefill_tp_size, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
+    prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu, target_ttft, output_dir
 ):
    """
-    Plot prefill performance as a 2D scatter plot with TP size annotations.
+    Plot prefill performance as a 2D scatter plot with GPU count annotations.
    Args:
-        prefill_tp_size: list of TP sizes
+        prefill_num_gpu: list of GPU counts
        prefill_ttft: list of time to first token values
        prefill_thpt_per_gpu: list of throughput per GPU values
        target_ttft: target TTFT value for the vertical line
@@ -46,9 +46,9 @@ def plot_prefill_performance(
    """
    plt.figure(figsize=(10, 6))
    plt.scatter(prefill_ttft, prefill_thpt_per_gpu, s=100)
-    for i, tp in enumerate(prefill_tp_size):
+    for i, num_gpu in enumerate(prefill_num_gpu):
        plt.annotate(
-            f"TP{tp}",
+            f"{num_gpu} GPU(s)",
            (prefill_ttft[i], prefill_thpt_per_gpu[i]),
            xytext=(10, 0),
            textcoords="offset points",
@@ -73,17 +73,17 @@ def plot_prefill_performance(
 def plot_decode_performance(decode_results, target_itl, output_dir):
    """
-    Plot decode performance with multiple TP size lines.
+    Plot decode performance with multiple GPU count lines.
    Args:
-        decode_results: list of tuples (tp_size, itl_list, thpt_per_gpu_list)
+        decode_results: list of tuples (num_gpu, itl_list, thpt_per_gpu_list)
        target_itl: target ITL value for the vertical line
        output_dir: directory to save the plot
    """
    plt.figure(figsize=(10, 6))
-    for tp_size, itl_list, thpt_per_gpu_list in decode_results:
+    for num_gpu, itl_list, thpt_per_gpu_list in decode_results:
-        plt.plot(itl_list, thpt_per_gpu_list, label=f"TP{tp_size}")
+        plt.plot(itl_list, thpt_per_gpu_list, label=f"{num_gpu} GPU(s)")
    plt.axvline(
        x=target_itl, color="r", linestyle="--", label=f"Target ITL: {target_itl} ms"

--- a/benchmarks/profiler/utils/profile_decode.py
+++ b/benchmarks/profiler/utils/profile_decode.py
@@ -6,6 +6,7 @@ from typing import Callable, Optional, Tuple
 import numpy as np
+from benchmarks.profiler.utils.defaults import DECODE_MAX_CONCURRENCY
 from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator
 from benchmarks.profiler.utils.genai_perf import benchmark_decode
 from benchmarks.profiler.utils.plot import plot_decode_3d_surface
@@ -21,6 +22,21 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
+def get_num_request_range(attn_dp_size, engine_max_concurrency, granularity):
+    # for MoE models with attn-dp, we want the num_request to be a multiple of attn_dp_size
+    # so that we can make sure the request is sent to the same dp rank as the warmup request
+    # this is guaranteed because the dp scheduler is scheduling round-robin
+    max_concurrency = min(engine_max_concurrency, DECODE_MAX_CONCURRENCY)
+    conc_per_dp = max_concurrency // attn_dp_size
+    if conc_per_dp < granularity:
+        ans = list(range(attn_dp_size, conc_per_dp * attn_dp_size + 1, attn_dp_size))
+    else:
+        step = (conc_per_dp - 1) * attn_dp_size / (granularity - 1)
+        ans = [attn_dp_size + int(i * step) * attn_dp_size for i in range(granularity)]
+    return ans
 def _profile_decode_helper(
    work_dir,
    num_gpus,
@@ -30,6 +46,7 @@ def _profile_decode_helper(
    get_itl_and_thpt_per_gpu: Callable[
        [int, int, int], Tuple[Optional[float], Optional[float]]
    ],
+    attention_dp_size,
 ):
    """interpolate ITL - Active_KV_Cache - Decode_Context_Length"""
    x_kv_usage = []
@@ -51,18 +68,9 @@ def _profile_decode_helper(
                f" isl {isl} + osl {osl}, skipping."
            )
            break
-        elif max_concurrency < interpolation_granularity:
-            logger.warning(
-                f"max_concurrency {max_concurrency} is too small for"
-                f" interpolation granularity {interpolation_granularity}."
-                f" max_kv_tokens {max_kv_tokens}, isl {isl}, osl {osl}"
-            )
-            sweep_num_request = range(1, max_concurrency + 1)
        else:
-            sweep_num_request = range(
+            sweep_num_request = get_num_request_range(
-                1,
+                attention_dp_size, max_concurrency, interpolation_granularity
-                max_concurrency,
-                max_concurrency // interpolation_granularity,
            )
        for num_request in sweep_num_request:
            itl, thpt_per_gpu = get_itl_and_thpt_per_gpu(isl, osl, num_request)
@@ -102,6 +110,7 @@ def profile_decode(
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
+    attention_dp_size,
 ):
    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
@@ -127,6 +136,7 @@ def profile_decode(
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
+        attention_dp_size,
    )
@@ -137,6 +147,7 @@ def profile_decode_aiconfigurator(
    max_context_length,
    interpolation_granularity,
    ai_configurator_perf_estimator: AIConfiguratorPerfEstimator,
+    attention_dp_size,
    **model_config_kwargs,
 ):
    def get_itl_and_thpt_per_gpu(isl, osl, num_request):
@@ -156,4 +167,5 @@ def profile_decode_aiconfigurator(
        max_context_length,
        interpolation_granularity,
        get_itl_and_thpt_per_gpu,
+        attention_dp_size,
    )
--- a/components/backends/sglang/deploy/agg.yaml
+++ b/components/backends/sglang/deploy/agg.yaml
@@ -14,7 +14,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
-    SGLangDecodeWorker:
+    decode:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg
      componentType: worker

--- a/components/backends/sglang/deploy/agg_logging.yaml
+++ b/components/backends/sglang/deploy/agg_logging.yaml
@@ -17,7 +17,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
-    SGLangDecodeWorker:
+    decode:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg
      componentType: worker

--- a/components/backends/sglang/deploy/agg_router.yaml
+++ b/components/backends/sglang/deploy/agg_router.yaml
@@ -17,7 +17,7 @@ spec:
      envs:
        - name: DYN_ROUTER_MODE
          value: kv
-    SGLangDecodeWorker:
+    decode:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-agg-router
      componentType: worker

--- a/components/backends/sglang/deploy/disagg.yaml
+++ b/components/backends/sglang/deploy/disagg.yaml
@@ -14,7 +14,7 @@ spec:
      extraPodSpec:
        mainContainer:
          image: my-registry/sglang-runtime:my-tag
-    SGLangDecodeWorker:
+    decode:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-disagg
      componentType: worker
@@ -41,7 +41,7 @@ spec:
              --disaggregation-mode decode
              --disaggregation-transfer-backend nixl
-    SGLangPrefillWorker:
+    prefill:
      envFromSecret: hf-token-secret
      dynamoNamespace: sglang-disagg
      componentType: worker

--- a/components/backends/sglang/deploy/disagg_planner.yaml
+++ b/components/backends/sglang/deploy/disagg_planner.yaml
@@ -98,7 +98,7 @@ spec:
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
-    SGLangDecodeWorker:
+    decode:
      dynamoNamespace: dynamo
      envFromSecret: hf-token-secret
      componentType: worker
@@ -129,7 +129,7 @@ spec:
            - decode
            - --disaggregation-transfer-backend
            - nixl
-    SGLangPrefillWorker:
+    prefill:
      dynamoNamespace: dynamo
      envFromSecret: hf-token-secret
      componentType: worker

--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -93,10 +93,14 @@ class VllmComponentName:
 class SGLangComponentName:
-    prefill_worker_k8s_name = "SGLangPrefillWorker"
+    prefill_worker_k8s_name = (
+        "prefill"  # use short name to stay within k8s limits with grove
+    )
    prefill_worker_component_name = "prefill"
    prefill_worker_endpoint = "generate"
-    decode_worker_k8s_name = "SGLangDecodeWorker"
+    decode_worker_k8s_name = (
+        "decode"  # use short name to stay within k8s limits with grove
+    )
    decode_worker_component_name = "backend"
    decode_worker_endpoint = "generate"

--- a/components/planner/src/dynamo/planner/utils/planner_argparse.py
+++ b/components/planner/src/dynamo/planner/utils/planner_argparse.py
@@ -86,12 +86,6 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
        default=SLAPlannerDefaults.profile_results_dir,
        help="Profile results directory",
    )
-    parser.add_argument(
-        "--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length"
-    )
-    parser.add_argument(
-        "--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length"
-    )
    parser.add_argument(
        "--ttft",
        type=float,

--- a/container/Dockerfile.sglang-wideep
+++ b/container/Dockerfile.sglang-wideep
@@ -58,7 +58,7 @@ ENV LD_LIBRARY_PATH=/usr/lib:/usr/local/ucx/lib:$LD_LIBRARY_PATH
 # Dynamo
 WORKDIR /sgl-workspace
-RUN git clone https://github.com/ai-dynamo/dynamo.git
+COPY . /sgl-workspace/dynamo
 ENV RUSTUP_HOME=/usr/local/rustup \
    CARGO_HOME=/usr/local/cargo \
@@ -87,6 +87,10 @@ RUN cd dynamo/lib/bindings/python && \
 RUN pip install --break-system-packages sglang-router==0.1.9
+# Install dependencies
+RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
+    pip install --break-system-packages --requirement /tmp/requirements.txt
 RUN wget --tries=3 --waitretry=5 \
      https://github.com/nats-io/nats-server/releases/download/v2.10.28/\
 nats-server-v2.10.28-${ARCH}.deb && \

--- a/deploy/utils/dynamo_deployment.py
+++ b/deploy/utils/dynamo_deployment.py
@@ -21,6 +21,7 @@ import socket
 import subprocess
 import sys
 import time
+import uuid
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -114,9 +115,9 @@ class DynamoDeploymentClient:
            service_name: Service name for connecting to the service, defaults to {deployment_name}-frontend
        """
        self.namespace = namespace
-        self.deployment_name = deployment_name
+        self.deployment_name = f"{deployment_name}-{str(uuid.uuid4())[:4]}"
        self.model_name = model_name
-        self.service_name = service_name or f"{deployment_name}-frontend"
+        self.service_name = service_name or f"{self.deployment_name}-frontend"
        self.components: List[str] = []  # Will store component names from CR
        self.deployment_spec: Optional[
            Dict[str, Any]
@@ -247,13 +248,6 @@ class DynamoDeploymentClient:
        self.deployment_spec["metadata"]["name"] = self.deployment_name
        self.deployment_spec["metadata"]["namespace"] = self.namespace
-        # Disable grove as it will cause the deployment to not report ready
-        if "annotations" not in self.deployment_spec["metadata"]:
-            self.deployment_spec["metadata"]["annotations"] = {}
-        self.deployment_spec["metadata"]["annotations"][
-            "nvidia.com/enable-grove"
-        ] = "false"
        try:
            await self.custom_api.create_namespaced_custom_object(
                group="nvidia.com",

--- a/docs/benchmarks/pre_deployment_profiling.md
+++ b/docs/benchmarks/pre_deployment_profiling.md
@@ -13,7 +13,7 @@ Support matrix:
 | vLLM | Dense | ✅ |
 | vLLM | MoE | 🚧 |
 | SGLang | Dense | ✅ |
-| SGLang | MoE | 🚧 |
+| SGLang | MoE | ✅ |
 | TensorRT-LLM | Dense | ✅ |
 | TensorRT-LLM | MoE | 🚧 |
@@ -63,9 +63,15 @@ After finding the best TP size for prefill and decode, the script will then inte
 In the prefill engine, prefills are usually done with batch size=1 and only the ISL (excluding prefix cache hit) affects the iteration time. The script profiles the selected prefill TP configuration across different ISLs and records the TTFT and prefill throughput per GPU under those ISLs.
+For dense models, the script profiles different TP sizes.
+For MoE models, the script only profiles different TEP sizes, since DEP is generally not the optimal prefill configuration.
 ### Decode Interpolation Data
 In the decode engine, decode requests are added inflight and the iteration time (or ITL) depends on both the context length and the real-time load of the engine. We capture the real-time load of the engine with active kv usage and average context length. The active kv usage determines the complexity of the memory-bound attention kernel, while the active kv usage divided by the average context length determines the complexity of the compute-bound MLP kernel. For example, the figure below shows the ITL of the DS-Distilled Llama 8b model on H100 TP4. The ITL grows near-linearly with active kv usage under a fixed context length, and the slope increases as the context length decreases.
+For dense models, the script profiles different TP sizes.
+For MoE models, the script profiles different DEP sizes. TEP decode engines for low latency will be supported in the future.
 ![images](../../docs/images/itl_interpolation.png)
 The script profiles the selected decode TP configuration across different active kv blocks and average context length.
@@ -96,7 +102,7 @@ Set up your Kubernetes namespace for profiling (one-time per namespace). First e
 pip install -r deploy/utils/requirements.txt
 ```
-### Step 1: Inject your DGD configuration
+**Step 1: Inject your DGD configuration**
 Use the injector utility to place your DGD manifest into the PVC. The profiling job will read the path you specify.
@@ -113,11 +119,14 @@ Use the injector utility to place your DGD manifest into the PVC. The profiling
   > **Note**: All paths must start with `/data/` for security reasons. If you forget this prefix, the script will show a helpful error message with the correct path.
-> **Important**: For profiling, disagg configs should be run with Grove disabled by adding the annotation `nvidia.com/enable-grove: "false"` to avoid alpha Grove status issues.
 **Step 2: Set SLA target**
-Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to `vllm` or `sglang`. The backend type must match the dynamo deployment in the `DGD_CONFIG_FILE`.
+For dense models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL. Also, set the backend type to match the dynamo deployment in the `DGD_CONFIG_FILE`.
+For MoE models, edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_moe_job.yaml` to set the target TEP, DEP, TTFT, and ITL.
+> [!NOTE]
+> If the model is too large to be downloaded every time, you can create a multi-attach PVC to cache the model. Refer to [recipes](../../recipes/README.md) for more details.
 ```yaml
 spec:
@@ -145,7 +154,7 @@ spec:
   export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.4.1 # or any existing image tag (TODO: update to 0.5.0 upon release as profiling with 0.4.1 is broken)
   ```
-3. **Set the config path for the profiling job:**
+2. **Set the config path for the profiling job:**
   ```bash
   export DGD_CONFIG_FILE=/data/configs/disagg.yaml # should be the same path you set for --dest in Step 1
   ```
@@ -153,7 +162,11 @@ spec:
 **Step 4: Run profiling (required)**
 ```bash
+# for dense models
 envsubst < benchmarks/profiler/deploy/profile_sla_job.yaml | kubectl apply -f -
+# for MoE models
+envsubst < benchmarks/profiler/deploy/profile_sla_moe_job.yaml | kubectl apply -f -
 ```
 **Step 5: Wait for profiling to complete**

--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -26,27 +26,30 @@ class TestProfileSlaAiconfigurator:
    @pytest.fixture
    def trtllm_args(self):
        class Args:
-            backend = "trtllm"
+            def __init__(self):
-            config = "components/backends/trtllm/deploy/disagg.yaml"
+                self.backend = "trtllm"
-            output_dir = "/tmp/test_profiling_results"
+                self.config = "components/backends/trtllm/deploy/disagg.yaml"
-            namespace = "test-namespace"
+                self.output_dir = "/tmp/test_profiling_results"
-            min_num_gpus_per_engine = 1
+                self.namespace = "test-namespace"
-            max_num_gpus_per_engine = 8
+                self.min_num_gpus_per_engine = 1
-            skip_existing_results = False
+                self.max_num_gpus_per_engine = 8
-            force_rerun = False
+                self.skip_existing_results = False
-            isl = 3000
+                self.force_rerun = False
-            osl = 500
+                self.isl = 3000
-            ttft = 50
+                self.osl = 500
-            itl = 10
+                self.ttft = 50
-            max_context_length = 16384
+                self.itl = 10
-            prefill_interpolation_granularity = 16
+                self.max_context_length = 16384
-            decode_interpolation_granularity = 6
+                self.prefill_interpolation_granularity = 16
-            service_name = ""
+                self.decode_interpolation_granularity = 6
-            dry_run = False
+                self.service_name = ""
-            use_ai_configurator = True
+                self.is_moe_model = False
-            aic_system = "h200_sxm"
+                self.dry_run = False
-            aic_model_name = "QWEN3_32B"
+                self.use_ai_configurator = True
-            backend_version = "0.20.0"
+                self.aic_system = "h200_sxm"
+                self.aic_model_name = "QWEN3_32B"
+                self.backend_version = "0.20.0"
+                self.num_gpus_per_node = 8
        return Args()

--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -28,27 +28,30 @@ class TestProfileSLADryRun:
        """Create arguments for vllm backend dry-run test."""
        class Args:
-            backend = "vllm"
+            def __init__(self):
-            config = "components/backends/vllm/deploy/disagg.yaml"
+                self.backend = "vllm"
-            output_dir = "/tmp/test_profiling_results"
+                self.config = "components/backends/vllm/deploy/disagg.yaml"
-            namespace = "test-namespace"
+                self.output_dir = "/tmp/test_profiling_results"
-            min_num_gpus_per_engine = 1
+                self.namespace = "test-namespace"
-            max_num_gpus_per_engine = 8
+                self.min_num_gpus_per_engine = 1
-            skip_existing_results = False
+                self.max_num_gpus_per_engine = 8
-            force_rerun = False
+                self.skip_existing_results = False
-            isl = 3000
+                self.force_rerun = False
-            osl = 500
+                self.isl = 3000
-            ttft = 50
+                self.osl = 500
-            itl = 10
+                self.ttft = 50
-            max_context_length = 16384
+                self.itl = 10
-            prefill_interpolation_granularity = 16
+                self.max_context_length = 16384
-            decode_interpolation_granularity = 6
+                self.prefill_interpolation_granularity = 16
-            service_name = ""
+                self.decode_interpolation_granularity = 6
-            dry_run = True
+                self.service_name = ""
-            use_ai_configurator = False
+                self.is_moe_model = False
-            aic_system = None
+                self.dry_run = True
-            aic_model_name = None
+                self.use_ai_configurator = False
-            backend_version = None
+                self.aic_system = None
+                self.aic_model_name = None
+                self.backend_version = None
+                self.num_gpus_per_node = 8
        return Args()
@@ -57,27 +60,30 @@ class TestProfileSLADryRun:
        """Create arguments for sglang backend dry-run test."""
        class Args:
-            backend = "sglang"
+            def __init__(self):
-            config = "components/backends/sglang/deploy/disagg.yaml"
+                self.backend = "sglang"
-            output_dir = "/tmp/test_profiling_results"
+                self.config = "components/backends/sglang/deploy/disagg.yaml"
-            namespace = "test-namespace"
+                self.output_dir = "/tmp/test_profiling_results"
-            min_num_gpus_per_engine = 1
+                self.namespace = "test-namespace"
-            max_num_gpus_per_engine = 8
+                self.min_num_gpus_per_engine = 1
-            skip_existing_results = False
+                self.max_num_gpus_per_engine = 8
-            force_rerun = False
+                self.skip_existing_results = False
-            isl = 3000
+                self.force_rerun = False
-            osl = 500
+                self.isl = 3000
-            ttft = 50
+                self.osl = 500
-            itl = 10
+                self.ttft = 50
-            max_context_length = 16384
+                self.itl = 10
-            prefill_interpolation_granularity = 16
+                self.max_context_length = 16384
-            decode_interpolation_granularity = 6
+                self.prefill_interpolation_granularity = 16
-            service_name = ""
+                self.decode_interpolation_granularity = 6
-            dry_run = True
+                self.service_name = ""
-            use_ai_configurator = False
+                self.is_moe_model = False
-            aic_system = None
+                self.dry_run = True
-            aic_model_name = None
+                self.use_ai_configurator = False
-            backend_version = None
+                self.aic_system = None
+                self.aic_model_name = None
+                self.backend_version = None
+                self.num_gpus_per_node = 8
        return Args()
@@ -100,27 +106,30 @@ class TestProfileSLADryRun:
        """Create arguments for trtllm backend dry-run test."""
        class Args:
-            backend = "trtllm"
+            def __init__(self):
-            config = "components/backends/trtllm/deploy/disagg.yaml"
+                self.backend = "trtllm"
-            output_dir = "/tmp/test_profiling_results"
+                self.config = "components/backends/trtllm/deploy/disagg.yaml"
-            namespace = "test-namespace"
+                self.output_dir = "/tmp/test_profiling_results"
-            min_num_gpus_per_engine = 1
+                self.namespace = "test-namespace"
-            max_num_gpus_per_engine = 8
+                self.min_num_gpus_per_engine = 1
-            skip_existing_results = False
+                self.max_num_gpus_per_engine = 8
-            force_rerun = False
+                self.skip_existing_results = False
-            isl = 3000
+                self.force_rerun = False
-            osl = 500
+                self.isl = 3000
-            ttft = 50
+                self.osl = 500
-            itl = 10
+                self.ttft = 50
-            max_context_length = 16384
+                self.itl = 10
-            prefill_interpolation_granularity = 16
+                self.max_context_length = 16384
-            decode_interpolation_granularity = 6
+                self.prefill_interpolation_granularity = 16
-            service_name = ""
+                self.decode_interpolation_granularity = 6
-            dry_run = True
+                self.service_name = ""
-            use_ai_configurator = False
+                self.is_moe_model = False
-            aic_system = None
+                self.dry_run = True
-            aic_model_name = None
+                self.use_ai_configurator = False
-            backend_version = None
+                self.aic_system = None
+                self.aic_model_name = None
+                self.backend_version = None
+                self.num_gpus_per_node = 8
        return Args()
@@ -130,3 +139,44 @@ class TestProfileSLADryRun:
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
        await run_profile(trtllm_args)
+    @pytest.fixture
+    def sglang_moe_args(self):
+        """Create arguments for sglang backend MoE dry-run test."""
+        class Args:
+            def __init__(self):
+                self.backend = "sglang"
+                self.config = (
+                    "recipes/deepseek-r1/sglang-wideep/tep16p-dep16d-disagg.yaml"
+                )
+                self.output_dir = "/tmp/test_profiling_results"
+                self.namespace = "test-namespace"
+                self.min_num_gpus_per_engine = 8
+                self.max_num_gpus_per_engine = 32
+                self.skip_existing_results = False
+                self.force_rerun = False
+                self.isl = 3000
+                self.osl = 500
+                self.ttft = 50
+                self.itl = 10
+                self.max_context_length = 16384
+                self.prefill_interpolation_granularity = 16
+                self.decode_interpolation_granularity = 6
+                self.service_name = ""
+                self.is_moe_model = True
+                self.dry_run = True
+                self.use_ai_configurator = False
+                self.aic_system = None
+                self.aic_model_name = None
+                self.backend_version = None
+                self.num_gpus_per_node = 8
+        return Args()
+    @pytest.mark.pre_merge
+    @pytest.mark.asyncio
+    async def test_sglang_moe_dryrun(self, sglang_moe_args):
+        """Test that profile_sla dry-run works for sglang backend with MoE config."""
+        # Run the profile in dry-run mode - should complete without errors
+        await run_profile(sglang_moe_args)