feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes,...

feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes, preview configs (#4968)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes,...
feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes, preview configs (#4968) Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
6b5842ee · hhzhang16 · GitHub · 74fcd4a9 · 6b5842ee · 6b5842ee
Unverified Commit 6b5842ee authored Dec 16, 2025 by hhzhang16 Committed by GitHub Dec 16, 2025
6 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import (
    profile_prefill_aiconfigurator,
 )
 from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
-from benchmarks.profiler.webui.select_config import pick_config_with_webui
+from benchmarks.profiler.webui.select_config import (
+    add_profiling_error,
+    clear_profiling_errors,
+    pick_config_with_webui,
+)
 from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
@@ -131,6 +135,9 @@ async def run_profile(args):
    # List to track all created deployment clients for cleanup in case of failure
    deployment_clients = []

+    # Clear any errors from previous profiling runs
+    clear_profiling_errors()
+
    # Inherit aic_backend from backend if not explicitly set
    if not args.aic_backend:
        args.aic_backend = args.backend
@@ -476,7 +483,9 @@ async def run_profile(args):
            logger.info("Analyzing results and generate recommendations...")
            # Safety guards: no results → exit early with a clear message
            if not prefill_data.num_gpus:
-                logger.error("No prefill results produced; skipping recommendations.")
+                error_msg = "No prefill results produced; skipping recommendations."
+                logger.error(error_msg)
+                add_profiling_error(error_msg)
                return

            if args.pick_with_webui:
@@ -488,9 +497,9 @@ async def run_profile(args):
                # automatically select P/D config within SLA with the highest throughput/GPU
                # select best parallel mapping for prefill
                if min(prefill_data.ttft) > args.ttft:
-                    logger.warning(
-                        "No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
-                    )
+                    warning_msg = "No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
+                    logger.warning(warning_msg)
+                    add_profiling_error(warning_msg)
                    selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
                else:
                    valid_indices = [
@@ -508,14 +517,14 @@ async def run_profile(args):

                # select best parallel mapping for decode
                if not decode_data.num_gpus:
-                    logger.error(
-                        "No decode results produced; skipping recommendations."
-                    )
+                    error_msg = "No decode results produced; skipping recommendations."
+                    logger.error(error_msg)
+                    add_profiling_error(error_msg)
                    return
                if min(decode_data.itl) > args.itl:
-                    logger.warning(
-                        "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
-                    )
+                    warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
+                    logger.warning(warning_msg)
+                    add_profiling_error(warning_msg)
                    selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
                else:
                    valid_indices = [

--- a/benchmarks/profiler/webui/data_template.json
+++ b/benchmarks/profiler/webui/data_template.json
@@ -76,11 +76,11 @@
                    "min": 0
                },
                "y": {
-                    "title": "Cost ($)",
+                    "title": "GPU Hours",
                    "min": 0
                }
            },
-            "title": "Cost Per 1000 ? requests"
+            "title": "GPU Hours Per 1000 ? requests"
        },
        "table": {
            "columns": [
@@ -89,7 +89,7 @@
                "ITL (ms)",
                "Decode Thpt (tokens/s/GPU)",
                "Tokens/User",
-                "Cost ($)",
+                "GPU Hours",
                "Action"
            ],
            "data": []

--- a/benchmarks/profiler/webui/select_config.py
+++ b/benchmarks/profiler/webui/select_config.py
@@ -5,15 +5,18 @@ import json
 import logging
 import queue

-from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
 from benchmarks.profiler.webui.utils import (
-    create_gpu_cost_update_handler,
+    add_profiling_error,
+    clear_profiling_errors,
    create_gradio_interface,
    create_selection_handler,
    generate_config_data,
    wait_for_selection,
 )

+# Re-export for use by profiler modules
+__all__ = ["pick_config_with_webui", "add_profiling_error", "clear_profiling_errors"]
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 console_handler = logging.StreamHandler()
@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
    Returns:
        tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
    """
-    # Generate JSON data (also writes default JSON file for convenience)
+    # Note: Don't clear profiling errors here - they should be accumulated
+    # during the profiling run and displayed in the WebUI.
+    # clear_profiling_errors() should be called at the start of a new profiling run.
+
+    # Generate JSON data with GPU hours (frontend handles cost conversion)
    data_dict = generate_config_data(
        prefill_data,
        decode_data,
        args,
-        gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
        write_to_disk=True,
    )
    json_data_str = json.dumps(data_dict)
@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args):
    handle_selection = create_selection_handler(
        data_dict_ref, selection_queue, prefill_selection, decode_selection
    )
-    update_gpu_cost_per_hour = create_gpu_cost_update_handler(
-        prefill_data=prefill_data,
-        decode_data=decode_data,
-        args=args,
-        data_dict_ref=data_dict_ref,
-        default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
-    )

+    # Note: GPU hours -> Cost conversion is handled by frontend JavaScript (gpu_cost_toggle.js)
    demo = create_gradio_interface(
        json_data_str,
        handle_selection,
-        update_json_data_fn=update_gpu_cost_per_hour,
-        default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
    )

    return wait_for_selection(demo, selection_queue, args.webui_port)
--- a/benchmarks/profiler/webui/utils.py
+++ b/benchmarks/profiler/webui/utils.py
--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -12,7 +12,7 @@

 # For Multimodal EPD (required for device_map="auto" in vision model loading)
 accelerate
-aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7f7ad5e248f3eaa4a0b74a069095828a4f356e60
+aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@0c8f38d354e9138f2cc00efcde66245b3801df1d
 aiofiles
 aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
 av==15.0.0

--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
 Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
 ```

+#### Interactive Configuration Selection WebUI
+
+When you run the profiler with `--pick-with-webui`, it launches an interactive web interface that lets you visually explore profiling results and manually select configurations.
+
+**Features:**
+- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
+- **Pareto-Optimal Analysis**: The GPU Hours table shows Pareto-optimal configurations that balance latency and throughput
+- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
+- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
+- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
+
+**Selection Methods:**
+1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once, based on the Pareto-optimal combination
+2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each
+
+**Example DGD Config Output:**
+
+When you click "Show Config", you'll see a DynamoGraphDeployment configuration like:
+
+```yaml
+# DynamoGraphDeployment Configuration
+# Prefill: 1 GPU(s), TP=1
+# Decode: 4 GPU(s), TP=4
+# Model: Qwen/Qwen3-32B-FP8
+# Backend: trtllm
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+spec:
+  services:
+    PrefillWorker:
+      subComponentType: prefill
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          args:
+          - --tensor-parallel-size=1
+    DecodeWorker:
+      subComponentType: decode
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          args:
+          - --tensor-parallel-size=4
+```
+
+**Usage:**
+```bash
+python -m benchmarks.profiler.profile_sla \
+  --backend trtllm \
+  --config path/to/disagg.yaml \
+  --pick-with-webui \
+  --use-ai-configurator \
+  --model Qwen/Qwen3-32B-FP8 \
+  --aic-system h200_sxm \
+  --ttft 200 --itl 15
+```
+
+Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`.
+
+The WebUI launches on port 8000 by default (configurable with `--webui-port`).
+
 #### Output Performance Plots

 The profiler will generate the following plots to better visualize the performance data: