refactor: preview config in profiler webui use planner dgd generation logic (#4940)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

refactor: preview config in profiler webui use planner dgd generation logic (#4940)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
e49834c9 · Hongkuan Zhou · GitHub · cceeb8e3 · e49834c9 · e49834c9
Unverified Commit e49834c9 authored Dec 31, 2025 by Hongkuan Zhou Committed by GitHub Dec 31, 2025
8 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -59,7 +59,7 @@ from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
 )
-from dynamo.planner.defaults import WORKER_COMPONENT_NAMES
+from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SubComponentType


 @dataclass
@@ -239,7 +239,7 @@ async def run_profile(args):
                prefill_config = apply_parallel_mapping_to_config(
                    base_prefill_config,
                    mapping,
-                    EngineType.PREFILL,
+                    SubComponentType.PREFILL,
                    config_modifier,
                    args.num_gpus_per_node,
                )
@@ -344,7 +344,7 @@ async def run_profile(args):
                decode_config = apply_parallel_mapping_to_config(
                    base_decode_config,
                    mapping,
-                    EngineType.DECODE,
+                    SubComponentType.DECODE,
                    config_modifier,
                    args.num_gpus_per_node,
                )
@@ -493,6 +493,9 @@ async def run_profile(args):
                selected_prefill_idx, selected_decode_idx = pick_config_with_webui(
                    prefill_data, decode_data, args
                )
+                # update TTFT/ITL SLA based on selected config
+                args.ttft = prefill_data.ttft[selected_prefill_idx]
+                args.itl = decode_data.itl[selected_decode_idx]
            else:
                # automatically select P/D config within SLA with the highest throughput/GPU
                # select best parallel mapping for prefill
@@ -563,7 +566,7 @@ async def run_profile(args):
        prefill_config = apply_parallel_mapping_to_config(
            prefill_config,
            best_prefill_mapping,
-            EngineType.PREFILL,
+            SubComponentType.PREFILL,
            config_modifier,
            args.num_gpus_per_node,
        )
@@ -647,7 +650,7 @@ async def run_profile(args):
        decode_config = apply_parallel_mapping_to_config(
            decode_config,
            best_decode_mapping,
-            EngineType.DECODE,
+            SubComponentType.DECODE,
            config_modifier,
            args.num_gpus_per_node,
        )
@@ -738,17 +741,17 @@ async def run_profile(args):
        # save DGD config with planner; support multi-document output when a ConfigMap is included
        with open(f"{args.output_dir}/config_with_planner.yaml", "w") as f:
            if isinstance(config, list):
-                yaml.dump_all(config, f)
+                yaml.safe_dump_all(config, f, sort_keys=False)
            else:
-                yaml.dump(config, f)
+                yaml.safe_dump(config, f, sort_keys=False)

        # save mocker config with planner for testing purposes
        logger.debug(f"Mocker config with planner: {mocker_config}")
        with open(f"{args.output_dir}/mocker_config_with_planner.yaml", "w") as f:
            if isinstance(mocker_config, list):
-                yaml.dump_all(mocker_config, f)
+                yaml.safe_dump_all(mocker_config, f, sort_keys=False)
            else:
-                yaml.dump(mocker_config, f)
+                yaml.safe_dump(mocker_config, f, sort_keys=False)

    except Exception as e:
        logger.error(f"Profile job failed with error: {e}")

--- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
+++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
@@ -6,11 +6,12 @@ import logging
 from dataclasses import dataclass
 from enum import Enum

-from benchmarks.profiler.utils.defaults import PREFILL_MAX_NUM_TOKENS, EngineType
+from benchmarks.profiler.utils.defaults import PREFILL_MAX_NUM_TOKENS
 from benchmarks.profiler.utils.model_info import (
    MOE_ADDITIONAL_TP_ARCHITECTURES,
    ModelInfo,
 )
+from dynamo.planner.defaults import SubComponentType

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -222,27 +223,44 @@ def get_candidate_parallel_mappings(
 def apply_parallel_mapping_to_config(
    base_config: dict,
    mapping: ParallelizationMapping,
-    phase: str,
+    phase: SubComponentType,
    config_modifier,
    num_gpus_per_node: int | None,
+    is_aggregated_config: bool = True,
 ) -> dict:
    cfg = copy.deepcopy(base_config)
+
+    # In aggregated configs (used for profiling individual phases), the worker service we mutate
+    # is always the decode worker (prefill is converted to decode in convert_config()).
+    # In disaggregated configs (final DGD), we mutate the service matching the requested phase.
+    component_type = SubComponentType.DECODE if is_aggregated_config else phase
+
    if mapping.tp is not None:
-        cfg = config_modifier.set_config_tp_size(cfg, mapping.tp)
+        cfg = config_modifier.set_config_tp_size(cfg, mapping.tp, component_type)
    elif mapping.tep is not None:
-        cfg = config_modifier.set_config_tep_size(cfg, mapping.tep, num_gpus_per_node)
+        cfg = config_modifier.set_config_tep_size(
+            cfg, mapping.tep, num_gpus_per_node, component_type
+        )
    elif mapping.dep is not None:
-        cfg = config_modifier.set_config_dep_size(cfg, mapping.dep, num_gpus_per_node)
+        cfg = config_modifier.set_config_dep_size(
+            cfg, mapping.dep, num_gpus_per_node, component_type
+        )
    else:
        raise ValueError(f"Invalid mapping: {mapping.label()}")

-    # for prefill,set batch size to attention_dp_size
+    # For prefill, set batch size to attention_dp_size
    # (this assume prompt is long enough to saturate the GPU, which is usually valid in disagg)
-    if phase == EngineType.PREFILL:
+    if phase == SubComponentType.PREFILL:
+        prefill_component_type = (
+            SubComponentType.DECODE
+            if is_aggregated_config
+            else SubComponentType.PREFILL
+        )
        cfg = config_modifier.set_prefill_config(
            cfg,
            max_batch_size=mapping.get_attn_dp_size(),
            # max num tokens is shared by all attention dp ranks
            max_num_tokens=PREFILL_MAX_NUM_TOKENS * mapping.get_attn_dp_size(),
+            component_type=prefill_component_type,
        )
    return cfg
--- a/benchmarks/profiler/utils/config_modifiers/protocol.py
+++ b/benchmarks/profiler/utils/config_modifiers/protocol.py
@@ -64,7 +64,11 @@ class ConfigModifierProtocol(Protocol):

    @classmethod
    def set_prefill_config(
-        cls, config: dict, max_batch_size: int, max_num_tokens: int
+        cls,
+        config: dict,
+        max_batch_size: int,
+        max_num_tokens: int,
+        component_type: SubComponentType = SubComponentType.DECODE,
    ) -> dict:
        ...


--- a/benchmarks/profiler/utils/config_modifiers/sglang.py
+++ b/benchmarks/profiler/utils/config_modifiers/sglang.py
@@ -379,7 +379,11 @@ class SGLangConfigModifier:

    @classmethod
    def set_prefill_config(
-        cls, config: dict, max_batch_size: int, max_num_tokens: int
+        cls,
+        config: dict,
+        max_batch_size: int,
+        max_num_tokens: int,
+        component_type: SubComponentType = SubComponentType.DECODE,
    ) -> dict:
        """
        Configure prefill-related limits for aggregated prefill runs.
@@ -388,7 +392,7 @@ class SGLangConfigModifier:
        """
        cfg = Config.model_validate(config)
        worker_service = get_worker_service_from_config(
-            cfg, backend="sglang", sub_component_type=SubComponentType.DECODE
+            cfg, backend="sglang", sub_component_type=component_type
        )
        args = validate_and_get_worker_args(worker_service, backend="sglang")
        args = break_arguments(args)

--- a/benchmarks/profiler/utils/config_modifiers/trtllm.py
+++ b/benchmarks/profiler/utils/config_modifiers/trtllm.py
@@ -350,7 +350,11 @@ class TrtllmConfigModifier:

    @classmethod
    def set_prefill_config(
-        cls, config: dict, max_batch_size: int, max_num_tokens: int
+        cls,
+        config: dict,
+        max_batch_size: int,
+        max_num_tokens: int,
+        component_type: SubComponentType = SubComponentType.DECODE,
    ) -> dict:
        """
        Configure prefill-related limits for aggregated prefill runs.
@@ -360,7 +364,7 @@ class TrtllmConfigModifier:
        """
        cfg = Config.model_validate(config)
        worker_service = get_worker_service_from_config(
-            cfg, backend="trtllm", sub_component_type=SubComponentType.DECODE
+            cfg, backend="trtllm", sub_component_type=component_type
        )
        args = validate_and_get_worker_args(worker_service, backend="trtllm")
        args = break_arguments(args)

--- a/benchmarks/profiler/utils/config_modifiers/vllm.py
+++ b/benchmarks/profiler/utils/config_modifiers/vllm.py
@@ -307,7 +307,11 @@ class VllmV1ConfigModifier:

    @classmethod
    def set_prefill_config(
-        cls, config: dict, max_batch_size: int, max_num_tokens: int
+        cls,
+        config: dict,
+        max_batch_size: int,
+        max_num_tokens: int,
+        component_type: SubComponentType = SubComponentType.DECODE,
    ) -> dict:
        """
        Configure prefill-related limits for aggregated prefill runs.
@@ -316,7 +320,7 @@ class VllmV1ConfigModifier:
        """
        cfg = Config.model_validate(config)
        worker_service = get_worker_service_from_config(
-            cfg, backend="vllm", sub_component_type=SubComponentType.DECODE
+            cfg, backend="vllm", sub_component_type=component_type
        )
        args = validate_and_get_worker_args(worker_service, backend="vllm")
        args = break_arguments(args)

--- a/benchmarks/profiler/utils/dgd_generation.py
+++ b/benchmarks/profiler/utils/dgd_generation.py
@@ -26,8 +26,10 @@ from benchmarks.profiler.utils.config import (
    DgdPlannerServiceConfig,
    set_argument_value,
 )
+from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
 from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
    ParallelizationMapping,
+    apply_parallel_mapping_to_config,
 )
 from benchmarks.profiler.utils.planner_utils import build_planner_args_from_namespace
 from dynamo.common.utils.paths import get_workspace_dir
@@ -37,14 +39,196 @@ from dynamo.planner.defaults import MockerComponentName, SubComponentType
 MOCKER_DISAGG_CONFIG_PATH = "examples/backends/mocker/deploy/disagg.yaml"


-def generate_dgd_config_with_planner(
+def _get_config_modifier_from_args(args):
+    """Return an instantiated config modifier for args.backend."""
+    config_modifier_cls = CONFIG_MODIFIERS[args.backend]
+    return config_modifier_cls()
+
+
+def _find_service_name_for_subcomponent(
+    config: Config, subcomponent: SubComponentType
+) -> str:
+    """Find the service name in a DGD config for a given subComponentType."""
+    for service_name, service_cfg in config.spec.services.items():
+        if getattr(service_cfg, "subComponentType", None) == subcomponent:
+            return service_name
+    raise KeyError(f"Could not find service with subComponentType={subcomponent!r}")
+
+
+def _load_and_apply_mappings(
+    *,
    config_path: str,
+    args,
    config_modifier,
-    output_dir: str,
+    best_prefill_mapping: ParallelizationMapping | None,
+    best_decode_mapping: ParallelizationMapping | None,
+    num_gpus_per_node: int,
+) -> Config:
+    """Load a DGD config file and apply optional prefill/decode parallel mappings (single source of truth)."""
+    with open(config_path, "r") as f:
+        raw = yaml.safe_load(f)
+
+    # Update container image if provided (overrides config file images)
+    if getattr(args, "dgd_image", None):
+        raw = config_modifier.update_image(raw, args.dgd_image)
+
+    if best_prefill_mapping is not None:
+        raw = apply_parallel_mapping_to_config(
+            raw,
+            best_prefill_mapping,
+            SubComponentType.PREFILL,
+            config_modifier,
+            num_gpus_per_node,
+            is_aggregated_config=False,
+        )
+    if best_decode_mapping is not None:
+        raw = apply_parallel_mapping_to_config(
+            raw,
+            best_decode_mapping,
+            SubComponentType.DECODE,
+            config_modifier,
+            num_gpus_per_node,
+            is_aggregated_config=False,
+        )
+
+    return Config.model_validate(raw)
+
+
+def build_prefill_service_config(
+    *,
+    config_path: str,
+    args,
+    best_prefill_mapping: ParallelizationMapping,
+    num_gpus_per_node: int = 8,
+) -> tuple[str, dict]:
+    """Return (service_name, service_dict) for the prefill worker after applying mapping."""
+    return _build_single_worker_service_config(
+        config_path=config_path,
+        args=args,
+        mapping=best_prefill_mapping,
+        subcomponent=SubComponentType.PREFILL,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+
+
+def build_decode_service_config(
+    *,
+    config_path: str,
+    args,
+    best_decode_mapping: ParallelizationMapping,
+    num_gpus_per_node: int = 8,
+) -> tuple[str, dict]:
+    """Return (service_name, service_dict) for the decode worker after applying mapping."""
+    return _build_single_worker_service_config(
+        config_path=config_path,
+        args=args,
+        mapping=best_decode_mapping,
+        subcomponent=SubComponentType.DECODE,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+
+
+def _build_single_worker_service_config(
+    *,
+    config_path: str,
+    args,
+    mapping: ParallelizationMapping,
+    subcomponent: SubComponentType,
+    num_gpus_per_node: int,
+) -> tuple[str, dict]:
+    """Shared helper for building a single worker service dict (prefill or decode)."""
+    config_modifier = _get_config_modifier_from_args(args)
+    config = _load_and_apply_mappings(
+        config_path=config_path,
+        args=args,
+        config_modifier=config_modifier,
+        best_prefill_mapping=mapping
+        if subcomponent == SubComponentType.PREFILL
+        else None,
+        best_decode_mapping=mapping
+        if subcomponent == SubComponentType.DECODE
+        else None,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+    service_name = _find_service_name_for_subcomponent(config, subcomponent)
+    config_dict = config.model_dump(exclude_unset=False)
+    return service_name, config_dict["spec"]["services"][service_name]
+
+
+def generate_prefill_service_config_preview(
+    *,
+    config_path: str,
+    args,
+    best_prefill_mapping: ParallelizationMapping,
+    num_gpus_per_node: int = 8,
+) -> dict:
+    """Generate a prefill-only service config object for WebUI 'Show Config'."""
+    service_name, service_dict = build_prefill_service_config(
+        config_path=config_path,
+        args=args,
+        best_prefill_mapping=best_prefill_mapping,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+    return {service_name: service_dict}
+
+
+def generate_decode_service_config_preview(
+    *,
+    config_path: str,
+    args,
+    best_decode_mapping: ParallelizationMapping,
+    num_gpus_per_node: int = 8,
+) -> dict:
+    """Generate a decode-only service config object for WebUI 'Show Config'."""
+    service_name, service_dict = build_decode_service_config(
+        config_path=config_path,
+        args=args,
+        best_decode_mapping=best_decode_mapping,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+    return {service_name: service_dict}
+
+
+def generate_prefill_decode_services_config_preview(
+    *,
+    config_path: str,
    args,
    best_prefill_mapping: ParallelizationMapping,
    best_decode_mapping: ParallelizationMapping,
    num_gpus_per_node: int = 8,
+) -> dict[str, dict]:
+    """Generate a (prefill+decode)-only services config object for WebUI 'Show Config'."""
+    config_modifier = _get_config_modifier_from_args(args)
+    config = _load_and_apply_mappings(
+        config_path=config_path,
+        args=args,
+        config_modifier=config_modifier,
+        best_prefill_mapping=best_prefill_mapping,
+        best_decode_mapping=best_decode_mapping,
+        num_gpus_per_node=num_gpus_per_node,
+    )
+    prefill_service_name = _find_service_name_for_subcomponent(
+        config, SubComponentType.PREFILL
+    )
+    decode_service_name = _find_service_name_for_subcomponent(
+        config, SubComponentType.DECODE
+    )
+    config_dict = config.model_dump(exclude_unset=False)
+    services = {
+        prefill_service_name: config_dict["spec"]["services"][prefill_service_name],
+        decode_service_name: config_dict["spec"]["services"][decode_service_name],
+    }
+    return services
+
+
+def generate_dgd_config_with_planner(
+    config_path: str,
+    config_modifier,
+    output_dir: str | None,
+    args,
+    best_prefill_mapping: ParallelizationMapping | None,
+    best_decode_mapping: ParallelizationMapping | None,
+    num_gpus_per_node: int = 8,
 ) -> tuple[list[dict] | dict, list[dict] | dict]:
    """Generate DGD config with planner based on profiling results.

@@ -65,62 +249,14 @@ def generate_dgd_config_with_planner(
              If a ConfigMap is generated, returns [ConfigMap, DGD]; otherwise returns a single DGD dict.
    """

-    # Load config from file
-    with open(config_path, "r") as f:
-        config = yaml.safe_load(f)
-
-    # Update container image if provided
-    # This overrides the default image in the config file for all DGD components
-    if args.dgd_image:
-        config = config_modifier.update_image(config, args.dgd_image)
-
-    # Apply prefill parallelization based on the actual mapping used in profiling
-    if best_prefill_mapping.tp is not None:
-        # Dense model or TP for prefill
-        config = config_modifier.set_config_tp_size(
-            config, best_prefill_mapping.tp, SubComponentType.PREFILL
-        )
-    elif best_prefill_mapping.tep is not None:
-        # MoE model with TEP for prefill
-        config = config_modifier.set_config_tep_size(
-            config,
-            best_prefill_mapping.tep,
-            num_gpus_per_node,
-            SubComponentType.PREFILL,
-        )
-    elif best_prefill_mapping.dep is not None:
-        # MoE model with DEP for prefill
-        config = config_modifier.set_config_dep_size(
-            config,
-            best_prefill_mapping.dep,
-            num_gpus_per_node,
-            SubComponentType.PREFILL,
-        )
-
-    # Apply decode parallelization based on the actual mapping used in profiling
-    if best_decode_mapping.tp is not None:
-        # Dense model or TP for decode
-        config = config_modifier.set_config_tp_size(
-            config, best_decode_mapping.tp, SubComponentType.DECODE
-        )
-    elif best_decode_mapping.tep is not None:
-        # MoE model with TEP for decode
-        config = config_modifier.set_config_tep_size(
-            config,
-            best_decode_mapping.tep,
-            num_gpus_per_node,
-            SubComponentType.DECODE,
-        )
-    elif best_decode_mapping.dep is not None:
-        # MoE model with DEP for decode
-        config = config_modifier.set_config_dep_size(
-            config,
-            best_decode_mapping.dep,
-            num_gpus_per_node,
-            SubComponentType.DECODE,
-        )
-
-    config = Config.model_validate(config)
+    config = _load_and_apply_mappings(
+        config_path=config_path,
+        args=args,
+        config_modifier=config_modifier,
+        best_prefill_mapping=best_prefill_mapping,
+        best_decode_mapping=best_decode_mapping,
+        num_gpus_per_node=num_gpus_per_node,
+    )

    # add the planner service
    planner_config = DgdPlannerServiceConfig()
@@ -157,66 +293,69 @@ def generate_dgd_config_with_planner(

    # Add arguments determined by profiling results
    cm_mount_path = f"{get_workspace_dir()}/profiling_results"
-    planner_args.extend(
-        [
-            f"--prefill-engine-num-gpu={best_prefill_mapping.get_num_gpus()}",
-            f"--decode-engine-num-gpu={best_decode_mapping.get_num_gpus()}",
-            f"--profile-results-dir={cm_mount_path}",
-        ]
-    )
+    if best_prefill_mapping is not None:
+        planner_args.append(
+            f"--prefill-engine-num-gpu={best_prefill_mapping.get_num_gpus()}"
+        )
+    if best_decode_mapping is not None:
+        planner_args.append(
+            f"--decode-engine-num-gpu={best_decode_mapping.get_num_gpus()}"
+        )

-    if (
-        planner_config.extraPodSpec.mainContainer
-        and planner_config.extraPodSpec.mainContainer.args is not None
-    ):
-        planner_config.extraPodSpec.mainContainer.args.extend(planner_args)
-    # Convert planner config to dict first, then the entire config to dict
+    # Work with plain dicts for PodSpec/Container extras (e.g. volumes, volumeMounts)
+    # because those fields are stored as "extra" and aren't exposed as pydantic attributes.
    planner_dict = planner_config.model_dump(exclude_unset=False)
    config_dict = config.model_dump(exclude_unset=False)

-    # Build a ConfigMap from NPZ profiling outputs and mount it into the Planner
-    # We store data as plain JSON (lists/float/int) to avoid binary artifacts.
-    prefill_npz = f"{output_dir}/selected_prefill_interpolation/raw_data.npz"
-    decode_npz = f"{output_dir}/selected_decode_interpolation/raw_data.npz"
-
    config_map_obj: Optional[dict] = None
-    try:
-        with np.load(prefill_npz) as p_raw:
-            prefill_json = {
-                "prefill_isl": p_raw["prefill_isl"].tolist(),
-                "prefill_ttft": p_raw["prefill_ttft"].tolist(),
-                "prefill_thpt_per_gpu": p_raw["prefill_thpt_per_gpu"].tolist(),
-            }
-    except FileNotFoundError:
-        prefill_json = None
-
-    try:
-        with np.load(decode_npz) as d_raw:
-            # max_kv_tokens saved as array; convert to int
-            max_kv_tokens = d_raw["max_kv_tokens"]
-            if hasattr(max_kv_tokens, "tolist"):
-                max_kv_tokens_val = max_kv_tokens.tolist()
-                # Handle [value] vs value
-                if isinstance(max_kv_tokens_val, list):
-                    max_kv_tokens_val = (
-                        int(max_kv_tokens_val[0]) if max_kv_tokens_val else 0
-                    )
+    prefill_json = None
+    decode_json = None
+    if output_dir is not None:
+        # Build a ConfigMap from NPZ profiling outputs and mount it into the Planner
+        # We store data as plain JSON (lists/float/int) to avoid binary artifacts.
+        prefill_npz = f"{output_dir}/selected_prefill_interpolation/raw_data.npz"
+        decode_npz = f"{output_dir}/selected_decode_interpolation/raw_data.npz"
+
+        try:
+            with np.load(prefill_npz) as p_raw:
+                prefill_json = {
+                    "prefill_isl": p_raw["prefill_isl"].tolist(),
+                    "prefill_ttft": p_raw["prefill_ttft"].tolist(),
+                    "prefill_thpt_per_gpu": p_raw["prefill_thpt_per_gpu"].tolist(),
+                }
+        except FileNotFoundError:
+            prefill_json = None
+
+        try:
+            with np.load(decode_npz) as d_raw:
+                # max_kv_tokens saved as array; convert to int
+                max_kv_tokens = d_raw["max_kv_tokens"]
+                if hasattr(max_kv_tokens, "tolist"):
+                    max_kv_tokens_val = max_kv_tokens.tolist()
+                    # Handle [value] vs value
+                    if isinstance(max_kv_tokens_val, list):
+                        max_kv_tokens_val = (
+                            int(max_kv_tokens_val[0]) if max_kv_tokens_val else 0
+                        )
+                    else:
+                        max_kv_tokens_val = int(max_kv_tokens_val)
                else:
-                    max_kv_tokens_val = int(max_kv_tokens_val)
-            else:
-                max_kv_tokens_val = int(max_kv_tokens)
-
-            decode_json = {
-                "x_kv_usage": d_raw["x_kv_usage"].tolist(),
-                "y_context_length": d_raw["y_context_length"].tolist(),
-                "z_itl": d_raw["z_itl"].tolist(),
-                "z_thpt_per_gpu": d_raw["z_thpt_per_gpu"].tolist(),
-                "max_kv_tokens": max_kv_tokens_val,
-            }
-    except FileNotFoundError:
-        decode_json = None
+                    max_kv_tokens_val = int(max_kv_tokens)
+
+                decode_json = {
+                    "x_kv_usage": d_raw["x_kv_usage"].tolist(),
+                    "y_context_length": d_raw["y_context_length"].tolist(),
+                    "z_itl": d_raw["z_itl"].tolist(),
+                    "z_thpt_per_gpu": d_raw["z_thpt_per_gpu"].tolist(),
+                    "max_kv_tokens": max_kv_tokens_val,
+                }
+        except FileNotFoundError:
+            decode_json = None

    if prefill_json is not None and decode_json is not None:
+        # Only override planner profile directory when we actually have data to mount.
+        planner_args.append(f"--profile-results-dir={cm_mount_path}")
+
        config_map_obj = {
            "apiVersion": "v1",
            "kind": "ConfigMap",
@@ -249,6 +388,13 @@ def generate_dgd_config_with_planner(
            }
        )

+    # Attach planner args (always)
+    mc_dict = planner_dict.setdefault("extraPodSpec", {}).setdefault(
+        "mainContainer", {}
+    )
+    mc_args = mc_dict.setdefault("args", [])
+    mc_args.extend(planner_args)
+
    # Finalize DGD services
    config_dict["spec"]["services"]["Planner"] = planner_dict

@@ -310,7 +456,7 @@ def _generate_mocker_config_with_planner(
                    "image"
                ] = args.dgd_image

-    # Update worker args: --planner-profile-data, --model-path, --model-name
+    # Update worker args: --planner-profile-data (if available), --model-path, --model-name
    mocker_worker_names = [
        MockerComponentName.prefill_worker_k8s_name,
        MockerComponentName.decode_worker_k8s_name,
@@ -324,9 +470,10 @@ def _generate_mocker_config_with_planner(
                "mainContainer", {}
            )
            args_list = main_container.get("args", [])
-            args_list = set_argument_value(
-                args_list, "--planner-profile-data", cm_mount_path
-            )
+            if config_map_obj is not None:
+                args_list = set_argument_value(
+                    args_list, "--planner-profile-data", cm_mount_path
+                )
            # Update model path and name if available in args
            args_list = set_argument_value(args_list, "--model-path", args.model)
            args_list = set_argument_value(args_list, "--model-name", args.model)

--- a/benchmarks/profiler/webui/utils.py
+++ b/benchmarks/profiler/webui/utils.py
@@ -20,6 +20,11 @@ from aiconfigurator.webapp.components.profiling import (
    load_profiling_javascript,
 )

+from benchmarks.profiler.utils.dgd_generation import (
+    generate_decode_service_config_preview,
+    generate_prefill_decode_services_config_preview,
+    generate_prefill_service_config_preview,
+)
 from benchmarks.profiler.utils.pareto import compute_pareto

 logger = logging.getLogger(__name__)
@@ -56,149 +61,87 @@ def clear_profiling_errors() -> None:
    _profiling_errors.clear()


-def generate_dgd_worker_config_yaml(
-    parallel_mapping,
-    engine_type: str,
-    model: str | None = None,
-    backend: str | None = None,
-    ttft_or_itl: float | None = None,
-    thpt_per_gpu: float | None = None,
-) -> str:
-    """
-    Generate a DGD worker service config snippet for display in the WebUI.
+def dump_yaml_with_header(header_lines: list[str], obj: dict) -> str:
+    """Dump YAML with a leading comment header (used for WebUI config previews)."""
+    header = "\n".join(header_lines + ["#"])
+    body = yaml.safe_dump(obj, sort_keys=False)
+    return f"{header}\n{body}"

-    Uses ParallelizationMapping.label() for display and shows the service structure
-    that would be used in the final DynamoGraphDeployment.

-    Args:
-        parallel_mapping: ParallelizationMapping instance
-        engine_type: "prefill" or "decode"
-        model: Model name/path
-        backend: Backend name (sglang, vllm, trtllm)
-        ttft_or_itl: TTFT (prefill) or ITL (decode) in ms
-        thpt_per_gpu: Throughput per GPU in tokens/s/GPU
+def _maybe_add_model_backend_header_lines(header_lines: list[str], args) -> None:
+    model = getattr(args, "model", None)
+    backend = getattr(args, "backend", None)
+    if model:
+        header_lines.append(f"# Model: {model}")
+    if backend:
+        header_lines.append(f"# Backend: {backend}")

-    Returns:
-        YAML string representation of the DGD worker config
-    """
-    num_gpus = parallel_mapping.get_num_gpus()
-
-    # Build the worker config in DGD style
-    # Note: Actual args vary by backend; this shows the structure
-    worker_config = {
-        "componentType": "worker",
-        "subComponentType": engine_type,
-        "replicas": 1,
-        "resources": {
-            "limits": {
-                "gpu": str(num_gpus),
-            }
-        },
-    }

-    # Build header comments with profiling metadata
+def build_single_service_preview_header_lines(
+    *,
+    service_name: str,
+    engine_type: str,
+    mapping,
+    ttft_or_itl_ms: float | None,
+    thpt_per_gpu: float | None,
+    args,
+) -> list[str]:
    header_lines = [
-        "# DynamoGraphDeployment Worker Config",
+        "# DynamoGraphDeployment Service Config Preview",
+        f"# Service: {service_name}",
        f"# Engine: {engine_type}",
-        f"# Num GPUs: {num_gpus}",
-        f"# Parallelization: {parallel_mapping.label()}",
+        f"# Num GPUs: {mapping.get_num_gpus()}",
+        f"# Parallelization: {mapping.label()}",
    ]
-
-    if engine_type == "prefill" and ttft_or_itl is not None:
-        header_lines.append(f"# Profiled TTFT: {round(ttft_or_itl, 2)} ms")
-    elif engine_type == "decode" and ttft_or_itl is not None:
-        header_lines.append(f"# Profiled ITL: {round(ttft_or_itl, 2)} ms")
-
+    if engine_type == "prefill" and ttft_or_itl_ms is not None:
+        header_lines.append(f"# Profiled TTFT: {round(ttft_or_itl_ms, 2)} ms")
+    if engine_type == "decode" and ttft_or_itl_ms is not None:
+        header_lines.append(f"# Profiled ITL: {round(ttft_or_itl_ms, 2)} ms")
    if thpt_per_gpu is not None:
        header_lines.append(
            f"# Profiled Throughput: {round(thpt_per_gpu, 2)} tokens/s/GPU"
        )
-
-    if model:
-        header_lines.append(f"# Model: {model}")
-    if backend:
-        header_lines.append(f"# Backend: {backend}")
-
-    header_lines.append("#")
-    header_lines.append("# Note: Final config generated after selection includes")
-    header_lines.append("# backend-specific args and planner configuration.")
-
-    # Add the actual config
-    service_name = f"{engine_type.capitalize()}Worker"
-    body = yaml.dump(
-        {service_name: worker_config}, default_flow_style=False, sort_keys=False
+    _maybe_add_model_backend_header_lines(header_lines, args)
+    header_lines.append(
+        "# Note: This is a service-only preview. Final config includes planner."
    )
-
-    return "\n".join(header_lines) + "\n" + body
+    return header_lines


-def generate_dgd_config_yaml_for_display(
+def build_two_service_preview_header_lines(
+    *,
+    prefill_service_name: str,
+    decode_service_name: str,
    prefill_mapping,
    decode_mapping,
-    model: str | None = None,
-    backend: str | None = None,
-) -> str:
-    """
-    Generate a DGD config snippet for display in the WebUI.
-
-    This shows the combined prefill + decode DynamoGraphDeployment structure.
-    Uses ParallelizationMapping.label() for parallelization info.
-
-    Args:
-        prefill_mapping: ParallelizationMapping for prefill
-        decode_mapping: ParallelizationMapping for decode
-        model: Model name/path
-        backend: Backend name
-
-    Returns:
-        YAML string representation of the DGD configuration
-    """
-    prefill_gpus = prefill_mapping.get_num_gpus()
-    decode_gpus = decode_mapping.get_num_gpus()
-
-    # Build DGD-style config showing the service structure
-    config = {
-        "apiVersion": "nvidia.com/v1alpha1",
-        "kind": "DynamoGraphDeployment",
-        "spec": {
-            "services": {
-                "PrefillWorker": {
-                    "componentType": "worker",
-                    "subComponentType": "prefill",
-                    "replicas": 1,
-                    "resources": {
-                        "limits": {"gpu": str(prefill_gpus)},
-                    },
-                },
-                "DecodeWorker": {
-                    "componentType": "worker",
-                    "subComponentType": "decode",
-                    "replicas": 1,
-                    "resources": {
-                        "limits": {"gpu": str(decode_gpus)},
-                    },
-                },
-            }
-        },
-    }
-
-    # Build header comments with parallelization and model info
+    prefill_ttft_ms: float | None,
+    prefill_thpt_per_gpu: float | None,
+    decode_itl_ms: float | None,
+    decode_thpt_per_gpu: float | None,
+    args,
+) -> list[str]:
    header_lines = [
-        "# DynamoGraphDeployment Configuration Preview",
-        f"# Prefill: {prefill_gpus} GPU(s), {prefill_mapping.label()}",
-        f"# Decode: {decode_gpus} GPU(s), {decode_mapping.label()}",
+        "# DynamoGraphDeployment Services Config Preview",
+        f"# Prefill service: {prefill_service_name} ({prefill_mapping.get_num_gpus()} GPU(s), {prefill_mapping.label()})",
+        f"# Decode service: {decode_service_name} ({decode_mapping.get_num_gpus()} GPU(s), {decode_mapping.label()})",
    ]
-    if model:
-        header_lines.append(f"# Model: {model}")
-    if backend:
-        header_lines.append(f"# Backend: {backend}")
-    header_lines.append("#")
-    header_lines.append("# Full config with planner saved to: config_with_planner.yaml")
-
-    header = "\n".join(header_lines)
-    body = yaml.dump(config, default_flow_style=False, sort_keys=False)
-
-    return f"{header}\n{body}"
+    if prefill_ttft_ms is not None:
+        header_lines.append(f"# Profiled TTFT: {round(prefill_ttft_ms, 2)} ms")
+    if decode_itl_ms is not None:
+        header_lines.append(f"# Profiled ITL: {round(decode_itl_ms, 2)} ms")
+    if prefill_thpt_per_gpu is not None:
+        header_lines.append(
+            f"# Profiled Prefill Throughput: {round(prefill_thpt_per_gpu, 2)} tokens/s/GPU"
+        )
+    if decode_thpt_per_gpu is not None:
+        header_lines.append(
+            f"# Profiled Decode Throughput: {round(decode_thpt_per_gpu, 2)} tokens/s/GPU"
+        )
+    _maybe_add_model_backend_header_lines(header_lines, args)
+    header_lines.append(
+        "# Note: This is a services-only preview. Final config includes planner."
+    )
+    return header_lines


 class PlotType(str, Enum):
@@ -326,15 +269,22 @@ def populate_prefill_data(data, prefill_data, args):
            prefill_data.parallel_mappings,
        )
    ):
-        # Generate DGD worker config YAML for display
-        config_yaml = generate_dgd_worker_config_yaml(
-            parallel_mapping=mapping,
+        config_obj = generate_prefill_service_config_preview(
+            config_path=args.config,
+            args=args,
+            best_prefill_mapping=mapping,
+            num_gpus_per_node=getattr(args, "num_gpus_per_node", 8),
+        )
+        service_name = next(iter(config_obj.keys()))
+        header_lines = build_single_service_preview_header_lines(
+            service_name=service_name,
            engine_type="prefill",
-            model=getattr(args, "model", None),
-            backend=getattr(args, "backend", None),
-            ttft_or_itl=ttft,
+            mapping=mapping,
+            ttft_or_itl_ms=ttft,
            thpt_per_gpu=thpt,
+            args=args,
        )
+        config_yaml = dump_yaml_with_header(header_lines, config_obj)
        table_data.append([gpu, round(ttft, 2), round(thpt, 2), config_yaml])
    data[PlotType.PREFILL]["table"]["data"] = table_data

@@ -383,15 +333,22 @@ def populate_decode_data(data, decode_data, args):
            decode_data.parallel_mappings,
        )
    ):
-        # Generate DGD worker config YAML for display
-        config_yaml = generate_dgd_worker_config_yaml(
-            parallel_mapping=mapping,
+        config_obj = generate_decode_service_config_preview(
+            config_path=args.config,
+            args=args,
+            best_decode_mapping=mapping,
+            num_gpus_per_node=getattr(args, "num_gpus_per_node", 8),
+        )
+        service_name = next(iter(config_obj.keys()))
+        header_lines = build_single_service_preview_header_lines(
+            service_name=service_name,
            engine_type="decode",
-            model=getattr(args, "model", None),
-            backend=getattr(args, "backend", None),
-            ttft_or_itl=itl,
+            mapping=mapping,
+            ttft_or_itl_ms=itl,
            thpt_per_gpu=thpt,
+            args=args,
        )
+        config_yaml = dump_yaml_with_header(header_lines, config_obj)
        table_data.append([gpu, round(itl, 2), round(thpt, 2), config_yaml])
    data[PlotType.DECODE]["table"]["data"] = table_data

@@ -468,13 +425,32 @@ def populate_cost_data(
            # Store mapping from cost table row to original indices
            cost_index_mapping[table_idx] = (orig_prefill_idx, orig_decode_idx)

-            # Generate DGD config YAML for display
-            config_yaml = generate_dgd_config_yaml_for_display(
+            services_obj = generate_prefill_decode_services_config_preview(
+                config_path=args.config,
+                args=args,
+                best_prefill_mapping=prefill_mapping,
+                best_decode_mapping=decode_mapping,
+                num_gpus_per_node=getattr(args, "num_gpus_per_node", 8),
+            )
+            # Determine service names (backend-dependent)
+            service_names = list(services_obj.keys())
+            # Prefer stable names by picking based on subComponentType if present; fallback to insertion order.
+            prefill_service_name = service_names[0]
+            decode_service_name = (
+                service_names[1] if len(service_names) > 1 else service_names[0]
+            )
+            header_lines = build_two_service_preview_header_lines(
+                prefill_service_name=prefill_service_name,
+                decode_service_name=decode_service_name,
                prefill_mapping=prefill_mapping,
                decode_mapping=decode_mapping,
-                model=getattr(args, "model", None),
-                backend=getattr(args, "backend", None),
+                prefill_ttft_ms=float(_p_ttft),
+                prefill_thpt_per_gpu=float(_p_thpt),
+                decode_itl_ms=float(_d_itl),
+                decode_thpt_per_gpu=float(_d_thpt),
+                args=args,
            )
+            config_yaml = dump_yaml_with_header(header_lines, services_obj)

            # Add to table data (GPU hours, not cost - frontend handles cost conversion)
            table_data.append(