feat: add webUI to profiler (#4544)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>

feat: add webUI to profiler (#4544)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com> Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com> Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
ca0cd3b1 · Hongkuan Zhou · GitHub · 0173d5e6 · ca0cd3b1 · ca0cd3b1
Unverified Commit ca0cd3b1 authored Dec 10, 2025 by Hongkuan Zhou Committed by GitHub Dec 10, 2025
12 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -50,6 +50,7 @@ from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill_aiconfigurator,
 )
 from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
+from benchmarks.profiler.webui.select_config import pick_config_with_webui
 from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
@@ -476,7 +477,15 @@ async def run_profile(args):
            # Safety guards: no results → exit early with a clear message
            if not prefill_data.num_gpus:
                logger.error("No prefill results produced; skipping recommendations.")
+                return

+            if args.pick_with_webui:
+                # select best P/D config in webUI
+                selected_prefill_idx, selected_decode_idx = pick_config_with_webui(
+                    prefill_data, decode_data, args
+                )
+            else:
+                # automatically select P/D config within SLA with the highest throughput/GPU
                # select best parallel mapping for prefill
                if min(prefill_data.ttft) > args.ttft:
                    logger.warning(
@@ -485,7 +494,9 @@ async def run_profile(args):
                    selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
                else:
                    valid_indices = [
-                    i for i, ttft in enumerate(prefill_data.ttft) if ttft <= args.ttft
+                        i
+                        for i, ttft in enumerate(prefill_data.ttft)
+                        if ttft <= args.ttft
                    ]
                    # Among valid TP sizes, select the one with highest throughput per GPU
                    valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
@@ -497,7 +508,9 @@ async def run_profile(args):

                # select best parallel mapping for decode
                if not decode_data.num_gpus:
-                logger.error("No decode results produced; skipping recommendations.")
+                    logger.error(
+                        "No decode results produced; skipping recommendations."
+                    )
                    return
                if min(decode_data.itl) > args.itl:
                    logger.warning(

--- a/benchmarks/profiler/utils/defaults.py
+++ b/benchmarks/profiler/utils/defaults.py
@@ -30,6 +30,10 @@ AIPERF_WARMUP_REQUEST_PER_DP_RANK = 3
 AIPERF_PREFILL_BENCHMARK_OSL = 5
 AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4

+# Cost calculation defaults
+# TODO: allow user to configure this in GUI
+GPU_COST_PER_HOUR = 3.0  # Cost per GPU per hour in dollars
+

 class EngineType(str, Enum):
    PREFILL = "prefill"

--- a/benchmarks/profiler/utils/pareto.py
+++ b/benchmarks/profiler/utils/pareto.py
@@ -4,33 +4,39 @@

 def compute_pareto(x, y):
    """
-    compute the pareto front (top-left is better) for the given x and y values
-    return sorted lists of the x and y values for the pareto front
+    Compute the pareto front (top-left is better) for the given x and y values.
+
+    Returns:
+        tuple: (xs, ys, indices) where:
+            - xs: list of x values on the pareto front
+            - ys: list of y values on the pareto front
+            - indices: list of original indices corresponding to the pareto points
    """
    # Validate inputs
    if x is None or y is None:
-        return [], []
+        return [], [], []

    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    if len(x) == 0:
-        return [], []
+        return [], [], []

-    # Build point list and sort by x asc, then y desc so we prefer smaller x and larger y.
-    points = list(zip(x, y))
+    # Build point list with original indices and sort by x asc, then y desc
+    points = [(x[i], y[i], i) for i in range(len(x))]
    points.sort(key=lambda p: (p[0], -p[1]))

-    # Single pass to keep only non-dominated points (minimize x, maximize y).
+    # Single pass to keep only non-dominated points (minimize x, maximize y)
    pareto = []
    max_y = float("-inf")
-    for px, py in points:
+    for px, py, idx in points:
        if py > max_y:
-            pareto.append((px, py))
+            pareto.append((px, py, idx))
            max_y = py

    # Return sorted by x ascending for convenience
    pareto.sort(key=lambda p: (p[0], p[1]))
-    xs = [px for px, _ in pareto]
-    ys = [py for _, py in pareto]
-    return xs, ys
+    xs = [px for px, _, _ in pareto]
+    ys = [py for _, py, _ in pareto]
+    indices = [idx for _, _, idx in pareto]
+    return xs, ys, indices
--- a/benchmarks/profiler/utils/plot.py
+++ b/benchmarks/profiler/utils/plot.py
@@ -21,6 +21,7 @@ import numpy as np
 from matplotlib import cm
 from scipy.interpolate import griddata

+from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
 from benchmarks.profiler.utils.pareto import compute_pareto

 logger = logging.getLogger(__name__)
@@ -297,13 +298,11 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
        decode_data: DecodeProfileData instance containing profiling results
        output_dir: directory to save the plot
    """
-    GPU_COST_PER_HOUR = 3.0  # $3/hour
-
    # compute pareto front for prefill
-    p_ttft, p_thpt = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
+    p_ttft, p_thpt, _ = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)

    # compute pareto front for decode
-    d_itl, d_thpt = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
+    d_itl, d_thpt, _ = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)

    # convert to cost per thousand requests
    p_ttft = np.array(p_ttft)

--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -3,6 +3,7 @@

 import argparse
 import ast
+import os
 from typing import Any, Dict

 import yaml
@@ -84,6 +85,8 @@ def create_profiler_parser() -> argparse.Namespace:
            aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
            aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
            dry_run: Boolean (dry run the profile job, default: False)
+            pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
+            webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
        sla:
            isl: Int (target input sequence length, default: 3000)
            osl: Int (target output sequence length, default: 500)
@@ -113,6 +116,8 @@ def create_profiler_parser() -> argparse.Namespace:
        help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
        "Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
    )
+
+    # CLI arguments with config-aware defaults (using nested .get() for cleaner code)
    parser.add_argument(
        "--model",
        type=str,
@@ -126,7 +131,6 @@ def create_profiler_parser() -> argparse.Namespace:
        help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
    )

-    # CLI arguments with config-aware defaults (using nested .get() for cleaner code)
    parser.add_argument(
        "--namespace",
        type=str,
@@ -233,6 +237,23 @@ def create_profiler_parser() -> argparse.Namespace:
        default=config.get("hardware", {}).get("enable_gpu_discovery", False),
        help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
    )
+    parser.add_argument(
+        "--pick-with-webui",
+        action="store_true",
+        default=config.get("sweep", {}).get("pick_with_webui", False),
+        help="Pick the best parallelization mapping using webUI",
+    )
+
+    default_webui_port = 8000
+    webui_port_env = os.environ.get("PROFILER_WEBUI_PORT")
+    if webui_port_env:
+        default_webui_port = int(webui_port_env)
+    parser.add_argument(
+        "--webui-port",
+        type=int,
+        default=config.get("sweep", {}).get("webui_port", default_webui_port),
+        help="WebUI port",
+    )

    # Dynamically add all planner arguments from planner_argparse.py
    add_planner_arguments_to_parser(parser, prefix="planner-")

--- a/benchmarks/profiler/webui/data_template.json
+++ b/benchmarks/profiler/webui/data_template.json
+{
+    "settings": {
+        "allow_confirm_datapoint": true,
+        "hide_show_config": true
+    },
+    "prefill": {
+        "chart": {
+            "labels": [],
+            "datasets": [
+                {
+                    "label": "Prefill Performance",
+                    "data": [],
+                    "backgroundColor": "#1f77b4",
+                    "borderColor": "#1f77b4"
+                }
+            ],
+            "target_line": {
+                "value": 0.0,
+                "label": "Target TTFT: ? ms"
+            },
+            "axes": {
+                "x": {
+                    "title": "Time to First Token (ms)",
+                    "min": 0
+                },
+                "y": {
+                    "title": "Prefill Throughput per GPU (tokens/s/GPU)",
+                    "min": 0
+                }
+            }
+        },
+        "table": {
+            "columns": [
+                "GPUs",
+                "TTFT (ms)",
+                "Throughput (tokens/s/GPU)",
+                "Action"
+            ],
+            "data": []
+        }
+    },
+    "decode": {
+        "chart": {
+            "datasets": [],
+            "target_line": {
+                "value": 0.0,
+                "label": "Target ITL: ? ms"
+            },
+            "axes": {
+                "x": {
+                    "title": "Inter Token Latency (ms)",
+                    "min": 0
+                },
+                "y": {
+                    "title": "Decode Throughput per GPU (tokens/s/GPU)",
+                    "min": 0
+                }
+            }
+        },
+        "table": {
+            "columns": [
+                "GPUs",
+                "ITL (ms)",
+                "Throughput (tokens/s/GPU)",
+                "Action"
+            ],
+            "data": []
+        }
+    },
+    "cost": {
+        "chart": {
+            "datasets": [],
+            "axes": {
+                "x": {
+                    "title": "Tokens per User",
+                    "min": 0
+                },
+                "y": {
+                    "title": "Cost ($)",
+                    "min": 0
+                }
+            },
+            "title": "Cost Per 1000 ? requests"
+        },
+        "table": {
+            "columns": [
+                "TTFT (ms)",
+                "Prefill Thpt (tokens/s/GPU)",
+                "ITL (ms)",
+                "Decode Thpt (tokens/s/GPU)",
+                "Tokens/User",
+                "Cost ($)",
+                "Action"
+            ],
+            "data": []
+        }
+    }
+}
--- a/benchmarks/profiler/webui/select_config.py
+++ b/benchmarks/profiler/webui/select_config.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import logging
+import os
+import queue
+from pathlib import Path
+
+from benchmarks.profiler.webui.utils import (
+    PlotType,
+    create_gradio_interface,
+    create_selection_handler,
+    populate_cost_data,
+    populate_decode_data,
+    populate_prefill_data,
+    wait_for_selection,
+)
+
+# Module-level logger for the WebUI selection flow. It is given its own console
+# handler at INFO level with a timestamped "time - name - level - message" format.
+# NOTE(review): the handler is attached at import time and propagation is left
+# at its default, so if an ancestor logger also has a handler these records may
+# be emitted twice - confirm this is intended.
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.INFO)
+formatter = logging.Formatter(
+    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
+)
+console_handler.setFormatter(formatter)
+logger.addHandler(console_handler)
+
+
+def generate_config_data(prefill_data, decode_data, args):
+    """
+    Generate the JSON data file for the WebUI from profiling results.
+
+    Loads ``data_template.json`` from this module's directory, fills in the SLA
+    target lines and the cost chart title from ``args``, populates the prefill,
+    decode and cost sections, and writes the result to
+    ``<args.output_dir>/webui_data.json``.
+
+    Args:
+        prefill_data: PrefillProfileData instance
+        decode_data: DecodeProfileData instance
+        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir
+
+    Returns:
+        dict: The populated data that was written to the JSON file,
+            see https://github.com/ai-dynamo/aiconfigurator/blob/main/src/aiconfigurator/webapp/components/profiling/standalone/sample_profiling_data.json for more details
+    """
+    # Load template
+    template_path = Path(__file__).parent / "data_template.json"
+    with open(template_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Set SLA targets
+    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
+    prefill_target["value"] = args.ttft
+    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"
+
+    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
+    decode_target["value"] = args.itl
+    decode_target["label"] = f"Target ITL: {args.itl} ms"
+
+    data[PlotType.COST]["chart"][
+        "title"
+    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"
+
+    # Populate data sections
+    populate_prefill_data(data, prefill_data)
+    populate_decode_data(data, decode_data)
+    populate_cost_data(data, prefill_data, decode_data, args)
+
+    # Save JSON file. An empty output_dir means "current directory";
+    # os.makedirs("") raises, so fall back to "." in that case.
+    os.makedirs(args.output_dir or ".", exist_ok=True)
+    output_path = os.path.join(args.output_dir, "webui_data.json")
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4)
+
+    logger.info(f"Generated WebUI config data at {output_path}")
+    return data
+
+
+def pick_config_with_webui(prefill_data, decode_data, args):
+    """
+    Let the user choose prefill/decode configurations through the WebUI.
+
+    The WebUI data file is generated first and then read back from
+    ``args.output_dir``. The raw JSON text is handed to the Gradio page, while
+    its parsed form is given to the selection handler. The result of
+    ``wait_for_selection`` is returned unchanged.
+
+    Args:
+        prefill_data: PrefillProfileData instance
+        decode_data: DecodeProfileData instance
+        args: Arguments containing SLA targets, output_dir and webui_port
+
+    Returns:
+        tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
+    """
+    generate_config_data(prefill_data, decode_data, args)
+
+    # Read back the file that was just written
+    webui_json_path = os.path.join(args.output_dir, "webui_data.json")
+    with open(webui_json_path, "r") as json_file:
+        raw_json = json_file.read()
+        parsed = json.loads(raw_json)
+
+    logger.info(f"Launching WebUI on port {args.webui_port}...")
+
+    # The UI callback hands the chosen (prefill, decode) pair over via this queue
+    picked: queue.Queue[tuple[int | None, int | None]] = queue.Queue()
+
+    # Mutable holders for the per-table selections made so far
+    prefill_state = {"idx": None}
+    decode_state = {"idx": None}
+
+    on_select = create_selection_handler(parsed, picked, prefill_state, decode_state)
+    ui = create_gradio_interface(raw_json, on_select)
+
+    return wait_for_selection(ui, picked, args.webui_port)
--- a/benchmarks/profiler/webui/utils.py
+++ b/benchmarks/profiler/webui/utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import logging
+import queue
+import threading
+from enum import Enum
+
+import gradio as gr
+import numpy as np
+from aiconfigurator.webapp.components.profiling import (
+    create_performance_results_section,
+    create_profiling_ui_components,
+    inject_profiling_assets,
+    load_profiling_javascript,
+)
+
+from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
+from benchmarks.profiler.utils.pareto import compute_pareto
+
+logger = logging.getLogger(__name__)
+
+
+class PlotType(str, Enum):
+    """Enum for the three plot/config types in the WebUI."""
+
+    # Members also subclass ``str``: in this change they are used as keys into
+    # the WebUI data dict and compared against the "plotType" string received
+    # from the UI selection payload.
+    PREFILL = "prefill"
+    DECODE = "decode"
+    COST = "cost"
+
+
+# Color palette for chart datasets
+# Users index this with ``idx % len(CHART_COLORS)``, so colors repeat once
+# there are more datasets than entries.
+# TODO: handle case with more than 8 lines
+CHART_COLORS = [
+    "#1f77b4",  # blue
+    "#ff7f0e",  # orange
+    "#2ca02c",  # green
+    "#d62728",  # red
+    "#9467bd",  # purple
+    "#8c564b",  # brown
+    "#e377c2",  # pink
+    "#7f7f7f",  # gray
+]
+
+# How long to wait for the user's selection, in seconds (passed as the
+# ``timeout`` of ``queue.Queue.get``).
+# TODO: is this too long?
+WEB_UI_SELECTION_TIMEOUT = 3600
+
+
+def populate_prefill_data(data, prefill_data):
+    """Fill the prefill section of ``data`` (chart labels, chart points, table rows).
+
+    Does nothing when ``prefill_data.num_gpus`` is empty. Otherwise ``data`` is
+    modified in place: chart labels list the distinct GPU counts, the first
+    chart dataset receives one point per profiled configuration, and the table
+    receives one row per configuration. ``tableIdx`` of a point equals the
+    position of its row in the table.
+    """
+    gpu_counts = prefill_data.num_gpus
+    if not gpu_counts:
+        return
+
+    section = data[PlotType.PREFILL]
+
+    # One label per distinct GPU count, ascending
+    section["chart"]["labels"] = [f"{n} GPUs" for n in sorted(set(gpu_counts))]
+
+    # Build chart points and table rows together; both are derived from the
+    # same per-configuration values.
+    points = []
+    rows = []
+    configs = zip(
+        gpu_counts,
+        prefill_data.ttft,
+        prefill_data.thpt_per_gpu,
+        prefill_data.parallel_mapping_labels,
+    )
+    for row_idx, (n_gpu, ttft, thpt, mapping) in enumerate(configs):
+        ttft_2dp = round(ttft, 2)
+        thpt_2dp = round(thpt, 2)
+        points.append(
+            {
+                "x": ttft_2dp,
+                "y": thpt_2dp,
+                "gpu": n_gpu,
+                "tableIdx": row_idx,
+                "gpuLabel": f"{n_gpu} GPUs [{mapping}]",
+            }
+        )
+        # TODO: Add actual config YAML data
+        rows.append([n_gpu, ttft_2dp, thpt_2dp, f"prefill_config_{row_idx}.yaml"])
+
+    section["chart"]["datasets"][0]["data"] = points
+    section["table"]["data"] = rows
+
+
+def populate_decode_data(data, decode_data):
+    """Fill the decode section of ``data`` (per-GPU-count chart datasets, table rows).
+
+    Does nothing when ``decode_data.num_gpus`` is empty. Otherwise ``data`` is
+    modified in place: chart points are grouped into one dataset per GPU count
+    (ascending, colored from ``CHART_COLORS``), and the table receives one row
+    per profiled configuration. ``tableIdx`` of a point equals the position of
+    its row in the table.
+    """
+    if not decode_data.num_gpus:
+        return
+
+    points_by_gpu: dict[int, list[dict[str, float | int]]] = {}
+    rows = []
+    configs = zip(
+        decode_data.num_gpus,
+        decode_data.itl,
+        decode_data.thpt_per_gpu,
+        decode_data.parallel_mapping_labels,
+    )
+    for row_idx, (n_gpu, itl, thpt, _mapping) in enumerate(configs):
+        itl_2dp = round(itl, 2)
+        thpt_2dp = round(thpt, 2)
+        points_by_gpu.setdefault(n_gpu, []).append(
+            {"x": itl_2dp, "y": thpt_2dp, "tableIdx": row_idx}
+        )
+        rows.append([n_gpu, itl_2dp, thpt_2dp, f"decode_config_{row_idx}.yaml"])
+
+    # One dataset per GPU count, each with its own color from the palette
+    series = []
+    for color_idx, (n_gpu, points) in enumerate(sorted(points_by_gpu.items())):
+        shade = CHART_COLORS[color_idx % len(CHART_COLORS)]
+        series.append(
+            {
+                "label": f"{n_gpu} GPUs",
+                "data": points,
+                "backgroundColor": shade,
+                "borderColor": shade,
+            }
+        )
+
+    section = data[PlotType.DECODE]
+    section["chart"]["datasets"] = series
+    section["table"]["data"] = rows
+
+
+def populate_cost_data(data, prefill_data, decode_data, args):
+    """Fill the cost section of ``data`` from the prefill and decode pareto fronts.
+
+    Does nothing when either side has no results. Otherwise every pareto-front
+    prefill config is paired with every pareto-front decode config. Each
+    prefill config becomes one chart line, each pair becomes one table row,
+    and ``index_mapping`` records, per table row (as a string key), the
+    original ``[prefill_idx, decode_idx]`` of the pair.
+
+    Costs use ``args.isl`` / ``args.osl`` and ``GPU_COST_PER_HOUR``.
+    """
+    if not (prefill_data.num_gpus and decode_data.num_gpus):
+        return
+
+    # Pareto front for prefill (minimize TTFT, maximize throughput)
+    front_ttft, front_p_thpt, front_p_idx = compute_pareto(
+        prefill_data.ttft, prefill_data.thpt_per_gpu
+    )
+    # Pareto front for decode (minimize ITL, maximize throughput)
+    front_itl, front_d_thpt, front_d_idx = compute_pareto(
+        decode_data.itl, decode_data.thpt_per_gpu
+    )
+
+    # Convert to numpy arrays
+    ttft_arr = np.array(front_ttft)
+    p_thpt_arr = np.array(front_p_thpt)
+    itl_arr = np.array(front_itl)
+    d_thpt_arr = np.array(front_d_thpt)
+
+    lines = []
+    rows = []
+    row_to_orig = {}  # cost table row (as str) -> [prefill_idx, decode_idx]
+
+    for line_no, (ttft, p_thpt) in enumerate(zip(ttft_arr, p_thpt_arr)):
+        orig_p = front_p_idx[line_no]
+        # Prefill cost is the same for every point on this line
+        prefill_cost = args.isl * 1000 / p_thpt * GPU_COST_PER_HOUR / 3600
+
+        line_points = []
+        for dec_no, (itl, d_thpt) in enumerate(zip(itl_arr, d_thpt_arr)):
+            decode_cost = args.osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600
+            total_cost = prefill_cost + decode_cost
+
+            # X-axis: tokens per user (based on ITL)
+            tokens_per_user = 1000 / itl
+
+            # The next table row index is the number of rows emitted so far
+            row_no = len(rows)
+            line_points.append(
+                {
+                    "x": round(tokens_per_user, 2),
+                    "y": round(total_cost, 2),
+                    "tableIdx": row_no,
+                }
+            )
+            row_to_orig[str(row_no)] = [orig_p, front_d_idx[dec_no]]
+            rows.append(
+                [
+                    round(ttft, 2),
+                    round(p_thpt, 2),
+                    round(itl, 2),
+                    round(d_thpt, 2),
+                    round(tokens_per_user, 2),
+                    round(total_cost, 2),
+                    f"cost_config_{row_no}.yaml",  # TODO: Add actual config
+                ]
+            )
+
+        shade = CHART_COLORS[line_no % len(CHART_COLORS)]
+        lines.append(
+            {
+                "label": f"TTFT: {ttft:.2f}ms",
+                "data": line_points,
+                "backgroundColor": shade,
+                "borderColor": shade,
+            }
+        )
+
+    section = data[PlotType.COST]
+    section["chart"]["datasets"] = lines
+    section["table"]["data"] = rows
+
+    # Store the index mapping in the JSON for reference
+    section["index_mapping"] = row_to_orig
+
+
+def create_selection_handler(
+    data_dict, selection_queue, prefill_selection, decode_selection
+):
+    """Create a selection handler closure for the WebUI.
+
+    The returned handler parses the selection payload (a JSON string with
+    ``plotType`` and ``rowIndex``) and:
+
+    - for a cost row, looks up the original (prefill, decode) indices in
+      ``data_dict``'s cost ``index_mapping`` and enqueues them immediately;
+    - for a prefill or decode row, remembers the index and enqueues the pair
+      once both a prefill and a decode row have been selected.
+
+    Payloads that are empty, lack a ``rowIndex``, name an unknown plot type or
+    an unmapped cost row are logged and ignored; nothing is enqueued for them.
+    Any exception raised while handling a payload is logged and swallowed so
+    that it does not propagate into the UI callback.
+
+    Args:
+        data_dict: Parsed JSON data containing cost index mapping
+        selection_queue: Queue to communicate selections to main thread
+        prefill_selection: Dict tracking prefill selection state
+        decode_selection: Dict tracking decode selection state
+
+    Returns:
+        Callable: Selection handler function for Gradio
+    """
+
+    def handle_selection(selection_json):
+        """Handle datapoint selection from table."""
+        if not selection_json or selection_json.strip() == "":
+            return
+
+        try:
+            selection = json.loads(selection_json)
+            plot_type = selection.get("plotType")
+            row_idx = selection.get("rowIndex")
+
+            logger.info(f"Selection received: {plot_type}, row {row_idx}")
+
+            # Without a row index there is nothing to select; storing or
+            # enqueueing None would hand an invalid index to the caller.
+            if row_idx is None:
+                logger.warning("Ignoring selection without a rowIndex")
+                return
+
+            if plot_type == PlotType.COST:
+                # Cost selection - use index mapping to get original indices
+                cost_index_mapping = data_dict[PlotType.COST].get("index_mapping", {})
+                mapping_entry = cost_index_mapping.get(str(row_idx))
+
+                if not mapping_entry:
+                    logger.warning(
+                        f"No index mapping for cost row {row_idx}; ignoring selection"
+                    )
+                    return
+
+                prefill_idx, decode_idx = mapping_entry
+                if prefill_idx is not None and decode_idx is not None:
+                    logger.info(
+                        f"Cost selection determines: Prefill={prefill_idx}, Decode={decode_idx}"
+                    )
+                    # Auto-submit for cost selection
+                    selection_queue.put((prefill_idx, decode_idx))
+            elif plot_type == PlotType.PREFILL:
+                prefill_selection["idx"] = row_idx
+                logger.info(f"Prefill selected: {row_idx}")
+                # Check if we have both selections
+                if decode_selection["idx"] is not None:
+                    logger.info(
+                        f"Both selections complete: Prefill={row_idx}, Decode={decode_selection['idx']}"
+                    )
+                    selection_queue.put((row_idx, decode_selection["idx"]))
+                else:
+                    logger.info("Waiting for decode selection...")
+            elif plot_type == PlotType.DECODE:
+                decode_selection["idx"] = row_idx
+                logger.info(f"Decode selected: {row_idx}")
+                # Check if we have both selections
+                if prefill_selection["idx"] is not None:
+                    logger.info(
+                        f"Both selections complete: Prefill={prefill_selection['idx']}, Decode={row_idx}"
+                    )
+                    selection_queue.put((prefill_selection["idx"], row_idx))
+                else:
+                    logger.info("Waiting for prefill selection...")
+            else:
+                logger.warning(f"Ignoring selection with unknown plot type: {plot_type}")
+
+        except Exception as e:
+            # Boundary of a UI callback: log with traceback instead of raising
+            logger.exception(f"Error handling selection: {e}")
+
+    return handle_selection
+
+
+def create_gradio_interface(json_data_str, handle_selection):
+    """Create the Gradio interface for configuration selection.
+
+    Builds a ``gr.Blocks`` page from the profiling UI components of the
+    aiconfigurator webapp: the hidden selection input/button and JSON data
+    holder, the injected assets, an instructions header, and the performance
+    results section. ``handle_selection`` is wired to the selection button,
+    and ``json_data_str`` is loaded into the JSON data component on page load,
+    which in turn triggers the client-side visualizations.
+
+    Args:
+        json_data_str: JSON string containing profiling data
+        handle_selection: Selection handler function
+
+    Returns:
+        gr.Blocks: Configured Gradio demo
+    """
+    with gr.Blocks(title="Configuration Selection") as demo:
+        # Create hidden UI components (reused from AIC profiling module)
+        ui_components = create_profiling_ui_components()
+        selection_input = ui_components["selection_input"]
+        selection_button = ui_components["selection_button"]
+        json_data = ui_components["json_data"]
+
+        # Inject CSS and modal (reused from AIC profiling module)
+        inject_profiling_assets()
+
+        gr.Markdown("# 📊 Profiling Results - Select Configuration")
+        # The correction-factors link below is written as a regular Markdown
+        # link "[text](url)" so that it renders as a hyperlink.
+        gr.Markdown(
+            """
+            **Two ways to select prefill and decode configs:**
+            1. **Cost Analysis** (recommended): Click any row in the Cost Analysis table - automatically determines both prefill and decode
+            2. **Individual**: Click one row in the Prefill table AND one row in the Decode table
+            The selection will be processed automatically once complete.
+
+            > 📝 **Note:** The dotted red line in the prefill and decode charts are default TTFT and ITL SLAs if not specified.
+
+            > ⚠️ **Warning:** The TTFT values here represent the ideal case when requests arrive uniformly, minimizing queueing. Real-world TTFT may be higher than profiling results. To mitigate the issue, planner uses [correction factors](https://github.com/ai-dynamo/dynamo/blob/main/docs/planner/sla_planner.md#2-correction-factor-calculation) to adjust dynamically at runtime.
+            """
+        )
+
+        # Performance Results Section (reused from AIC profiling module)
+        create_performance_results_section()
+
+        # Handle selection button
+        selection_button.click(
+            fn=handle_selection,
+            inputs=[selection_input],
+            outputs=[],
+        )
+
+        # Trigger visualization when JSON data changes (client-side only)
+        json_data.change(
+            fn=None,
+            inputs=[json_data],
+            outputs=[],
+            js=(
+                "(data) => { if (data && data.trim() && window.initializeVisualizations) "
+                "window.initializeVisualizations(data); }"
+            ),
+        )
+
+        # Load JavaScript and data automatically on page load
+        def load_data():
+            """Load profiling data."""
+            return json_data_str
+
+        demo.load(
+            fn=load_data, inputs=[], outputs=[json_data], js=load_profiling_javascript()
+        )
+
+    return demo
+
+
+def wait_for_selection(demo, selection_queue, port):
+    """Launch the demo and wait for user selection.
+
+    The demo is launched from a daemon thread, then this function blocks on
+    ``selection_queue`` for up to ``WEB_UI_SELECTION_TIMEOUT`` seconds. The
+    demo is closed before returning, whether a selection arrived, the wait
+    timed out, or the wait was interrupted by another exception.
+
+    Args:
+        demo: Gradio demo instance
+        selection_queue: Queue to receive selection from UI
+        port: Port number for the WebUI
+
+    Returns:
+        tuple[int, int]: (selected_prefill_idx, selected_decode_idx);
+            ``(0, 0)`` if no selection was made before the timeout.
+    """
+
+    # Launch the interface in a separate thread
+    def launch_thread():
+        demo.launch(
+            server_name="0.0.0.0",
+            server_port=port,
+            share=False,
+            prevent_thread_lock=True,
+        )
+
+    thread = threading.Thread(target=launch_thread, daemon=True)
+    thread.start()
+
+    logger.info(f"WebUI launched. Waiting for user selection on http://0.0.0.0:{port}")
+    logger.info("Please select a row from the Cost Analysis table")
+
+    # Block and wait for selection
+    try:
+        selected_prefill_idx, selected_decode_idx = selection_queue.get(
+            timeout=WEB_UI_SELECTION_TIMEOUT
+        )
+    except queue.Empty:
+        # Report the configured timeout rather than a hard-coded duration
+        logger.error(
+            f"Selection timeout - no selection made within {WEB_UI_SELECTION_TIMEOUT} seconds"
+        )
+        # Return default
+        return 0, 0
+    else:
+        logger.info(
+            f"User selected: Prefill={selected_prefill_idx}, Decode={selected_decode_idx}"
+        )
+        return selected_prefill_idx, selected_decode_idx
+    finally:
+        # Always shut the server down, including on unexpected exceptions
+        demo.close()
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
 ]

 dependencies = [
-    "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@release/0.4.0",
+    "aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@bdc142609b97c23a298115f09a9f88ae143f48d8",
    "networkx",
    "pandas",
    "pydantic>=2",

--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -11,9 +11,9 @@
 #   maximum versions available on different platforms (x86_64 vs aarch64, different CUDA versions)

 # For Multimodal EPD (required for device_map="auto" in vision model loading)
-accelerate==1.12.0
-aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
-aiofiles==24.1.0
+accelerate
+aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@bdc142609b97c23a298115f09a9f88ae143f48d8
+aiofiles
 aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
 av==15.0.0
 fastapi==0.120.1

--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -66,6 +66,7 @@ class TestProfileSlaAiconfigurator:
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,

--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -73,6 +73,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                # Provide minimal model_info to avoid HF queries
                self.model_info = ModelInfo(
                    model_size=16384.0,
@@ -116,6 +117,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
@@ -180,6 +182,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.model_info = ModelInfo(
                    model_size=16384.0,
                    architecture="TestArchitecture",
@@ -233,6 +236,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 8
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.model_info = ModelInfo(
                    model_size=65536.0,
                    architecture="TestMoEArchitecture",
@@ -309,6 +313,7 @@ class TestProfileSLADryRun:
                # Set to 0 to trigger auto-generation path
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.enable_gpu_discovery = True

        return Args()
@@ -376,6 +381,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.enable_gpu_discovery = True

        return Args()
@@ -443,6 +449,7 @@ class TestProfileSLADryRun:
                self.aic_backend_version = None
                self.num_gpus_per_node = 0
                self.deploy_after_profile = False
+                self.pick_with_webui = False
                self.enable_gpu_discovery = True

        return Args()