feat: allow user to input gpu cost in profiler webui (#4935)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: allow user to input gpu cost in profiler webui (#4935)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
b24ccd29 · Hongkuan Zhou · GitHub · c2a29f80 · b24ccd29 · b24ccd29
Unverified Commit b24ccd29 authored Dec 12, 2025 by Hongkuan Zhou Committed by GitHub Dec 12, 2025
4 changed files
--- a/benchmarks/profiler/utils/defaults.py
+++ b/benchmarks/profiler/utils/defaults.py
@@ -32,7 +32,6 @@ AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4

 # Cost calculation defaults
-# TODO: allow user to configure this in GUI
-GPU_COST_PER_HOUR = 3.0  # Cost per GPU per hour in dollars
+DEFAULT_GPU_COST_PER_HOUR = 3.0  # Default $/GPU/hour; overridable in the WebUI


 class EngineType(str, Enum):

--- a/benchmarks/profiler/utils/plot.py
+++ b/benchmarks/profiler/utils/plot.py
@@ -21,7 +21,7 @@ import numpy as np
 from matplotlib import cm
 from scipy.interpolate import griddata

-from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
+from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
 from benchmarks.profiler.utils.pareto import compute_pareto

 logger = logging.getLogger(__name__)
@@ -315,14 +315,16 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
    ttft = []
    for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
        ttft.append(_p_ttft)
-        prefill_cost = isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
+        prefill_cost = isl * 1000 / _p_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600
        tokens_per_user.append(1000 / d_itl)
-        cost.append(osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600 + prefill_cost)
+        cost.append(
+            osl * 1000 / d_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600 + prefill_cost
+        )

    # plot
    plt.figure(figsize=(12, 10))
    plt.title(
-        f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA"
+        f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${DEFAULT_GPU_COST_PER_HOUR}) Under Different SLA"
    )
    for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft):
        line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0]

--- a/benchmarks/profiler/webui/select_config.py
+++ b/benchmarks/profiler/webui/select_config.py
@@ -3,17 +3,14 @@

 import json
 import logging
-import os
 import queue
-from pathlib import Path

+from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
 from benchmarks.profiler.webui.utils import (
-    PlotType,
+    create_gpu_cost_update_handler,
    create_gradio_interface,
    create_selection_handler,
-    populate_cost_data,
-    populate_decode_data,
-    populate_prefill_data,
+    generate_config_data,
    wait_for_selection,
 )

@@ -28,55 +25,6 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)


-def generate_config_data(prefill_data, decode_data, args):
-    """
-    Generate JSON data file for WebUI from profiling results.
-
-    Args:
-        prefill_data: PrefillProfileData instance
-        decode_data: DecodeProfileData instance
-        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir
-
-    Returns a JSON data file for WebUI consumption,
-        see https://github.com/ai-dynamo/aiconfigurator/blob/main/src/aiconfigurator/webapp/components/profiling/standalone/sample_profiling_data.json for more details
-    """
-    # Load template
-    template_path = Path(__file__).parent / "data_template.json"
-    with open(template_path, "r") as f:
-        data = json.load(f)
-
-    # Construct output path
-    output_path = os.path.join(args.output_dir, "webui_data.json")
-
-    # Set SLA targets
-    data[PlotType.PREFILL]["chart"]["target_line"]["value"] = args.ttft
-    data[PlotType.PREFILL]["chart"]["target_line"][
-        "label"
-    ] = f"Target TTFT: {args.ttft} ms"
-
-    data[PlotType.DECODE]["chart"]["target_line"]["value"] = args.itl
-    data[PlotType.DECODE]["chart"]["target_line"][
-        "label"
-    ] = f"Target ITL: {args.itl} ms"
-
-    data[PlotType.COST]["chart"][
-        "title"
-    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"
-
-    # Populate data sections
-    populate_prefill_data(data, prefill_data)
-    populate_decode_data(data, decode_data)
-    populate_cost_data(data, prefill_data, decode_data, args)
-
-    # Save JSON file
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        json.dump(data, f, indent=4)
-
-    logger.info(f"Generated WebUI config data at {output_path}")
-    return data
-
-
 def pick_config_with_webui(prefill_data, decode_data, args):
    """
    Launch WebUI for user to pick configurations.
@@ -89,13 +37,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
    Returns:
        tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
    """
-    # Generate JSON data file and load it
-    generate_config_data(prefill_data, decode_data, args)
-
-    output_path = os.path.join(args.output_dir, "webui_data.json")
-    with open(output_path, "r") as f:
-        json_data_str = f.read()
-        data_dict = json.loads(json_data_str)
+    # Generate JSON data (also writes default JSON file for convenience)
+    data_dict = generate_config_data(
+        prefill_data,
+        decode_data,
+        args,
+        gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
+        write_to_disk=True,
+    )
+    json_data_str = json.dumps(data_dict)

    logger.info(f"Launching WebUI on port {args.webui_port}...")

@@ -107,9 +57,23 @@ def pick_config_with_webui(prefill_data, decode_data, args):
    decode_selection = {"idx": None}

    # Create selection handler and Gradio interface
+    data_dict_ref = {"data": data_dict}
    handle_selection = create_selection_handler(
-        data_dict, selection_queue, prefill_selection, decode_selection
+        data_dict_ref, selection_queue, prefill_selection, decode_selection
+    )
+    update_gpu_cost_per_hour = create_gpu_cost_update_handler(
+        prefill_data=prefill_data,
+        decode_data=decode_data,
+        args=args,
+        data_dict_ref=data_dict_ref,
+        default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
+    )
+
+    demo = create_gradio_interface(
+        json_data_str,
+        handle_selection,
+        update_json_data_fn=update_gpu_cost_per_hour,
+        default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
    )
-    demo = create_gradio_interface(json_data_str, handle_selection)

    return wait_for_selection(demo, selection_queue, args.webui_port)
--- a/benchmarks/profiler/webui/utils.py
+++ b/benchmarks/profiler/webui/utils.py
@@ -3,9 +3,11 @@

 import json
 import logging
+import os
 import queue
 import threading
 from enum import Enum
+from pathlib import Path

 import gradio as gr
 import numpy as np
@@ -16,7 +18,7 @@ from aiconfigurator.webapp.components.profiling import (
    load_profiling_javascript,
 )

-from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
+from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
 from benchmarks.profiler.utils.pareto import compute_pareto

 logger = logging.getLogger(__name__)
@@ -47,6 +49,95 @@ CHART_COLORS = [
 WEB_UI_SELECTION_TIMEOUT = 3600


+def generate_config_data(
+    prefill_data,
+    decode_data,
+    args,
+    gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
+    write_to_disk: bool = True,
+):
+    """
+    Build the WebUI data dict from profiling results, optionally saving it as JSON.
+
+    A fresh dict is loaded from data_template.json (next to this module) on every call.
+
+    Args:
+        prefill_data: PrefillProfileData instance
+        decode_data: DecodeProfileData instance
+        args: Arguments containing SLA targets (ttft, itl, isl, osl); output_dir is
+            only read when write_to_disk is True
+        gpu_cost_per_hour: GPU cost in $/GPU/hour used for cost plot/table
+        write_to_disk: Whether to write the generated JSON to args.output_dir/webui_data.json
+
+    Returns:
+        dict: Data dict for WebUI consumption.
+    """
+    # Load template (JSON is UTF-8 by specification, so do not rely on the locale)
+    template_path = Path(__file__).parent / "data_template.json"
+    with open(template_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    # Set SLA targets
+    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
+    prefill_target["value"] = args.ttft
+    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"
+
+    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
+    decode_target["value"] = args.itl
+    decode_target["label"] = f"Target ITL: {args.itl} ms"
+
+    cost_chart = data[PlotType.COST]["chart"]
+    cost_chart["title"] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"
+
+    # Populate data sections
+    populate_prefill_data(data, prefill_data)
+    populate_decode_data(data, decode_data)
+    populate_cost_data(
+        data, prefill_data, decode_data, args, gpu_cost_per_hour=gpu_cost_per_hour
+    )
+
+    # Save JSON file (optional)
+    if write_to_disk:
+        # Only the disk path needs output_dir; in-memory callers never touch it.
+        output_path = os.path.join(args.output_dir, "webui_data.json")
+        # dirname is "" for an empty output_dir, and os.makedirs("") raises.
+        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=4)
+        logger.info(f"Generated WebUI config data at {output_path}")
+
+    return data
+
+
+def create_gpu_cost_update_handler(
+    *,
+    prefill_data,
+    decode_data,
+    args,
+    data_dict_ref,
+    default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
+):
+    """Create a Gradio change-handler that regenerates profiling JSON when GPU cost changes.
+
+    The handler updates data_dict_ref["data"]; invalid input falls back to the default cost.
+    """
+
+    def update_gpu_cost_per_hour(gpu_cost_per_hour):
+        try:
+            cost = float(gpu_cost_per_hour)
+        except (TypeError, ValueError, OverflowError):  # None or non-numeric input
+            cost = default_gpu_cost_per_hour
+        if not 0 <= cost < float("inf"):  # negative, NaN or inf: json.dumps emits NaN
+            cost = default_gpu_cost_per_hour
+        new_data = generate_config_data(
+            prefill_data, decode_data, args, gpu_cost_per_hour=cost, write_to_disk=False
+        )
+        data_dict_ref["data"] = new_data
+        return json.dumps(new_data)
+
+    return update_gpu_cost_per_hour
+
+
 def populate_prefill_data(data, prefill_data):
    """Populate prefill chart and table data."""
    if not prefill_data.num_gpus:
@@ -141,7 +232,13 @@ def populate_decode_data(data, decode_data):
    data[PlotType.DECODE]["table"]["data"] = table_data


-def populate_cost_data(data, prefill_data, decode_data, args):
+def populate_cost_data(
+    data,
+    prefill_data,
+    decode_data,
+    args,
+    gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
+):
    """Populate cost chart and table data with pareto-optimal configurations."""
    if not prefill_data.num_gpus or not decode_data.num_gpus:
        return
@@ -170,13 +267,13 @@ def populate_cost_data(data, prefill_data, decode_data, args):

    for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
        # Calculate prefill cost (fixed for this line)
-        prefill_cost = args.isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
+        prefill_cost = args.isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600

        # For each decode config, calculate total cost
        line_data = []
        for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)):
            # Calculate decode cost
-            decode_cost = args.osl * 1000 / _d_thpt * GPU_COST_PER_HOUR / 3600
+            decode_cost = args.osl * 1000 / _d_thpt * gpu_cost_per_hour / 3600
            total_cost = prefill_cost + decode_cost

            # X-axis: tokens per user (based on ITL)
@@ -230,12 +327,12 @@ def populate_cost_data(data, prefill_data, decode_data, args):


 def create_selection_handler(
-    data_dict, selection_queue, prefill_selection, decode_selection
+    data_dict_ref, selection_queue, prefill_selection, decode_selection
 ):
    """Create a selection handler closure for the WebUI.

    Args:
-        data_dict: Parsed JSON data containing cost index mapping
+        data_dict_ref: Dict wrapper holding the latest parsed JSON data (mutated when UI inputs change)
        selection_queue: Queue to communicate selections to main thread
        prefill_selection: Dict tracking prefill selection state
        decode_selection: Dict tracking decode selection state
@@ -250,6 +347,7 @@ def create_selection_handler(
            return

        try:
+            data_dict = data_dict_ref["data"]
            selection = json.loads(selection_json)
            plot_type = selection.get("plotType")
            row_idx = selection.get("rowIndex")
@@ -299,12 +397,19 @@ def create_selection_handler(
    return handle_selection


-def create_gradio_interface(json_data_str, handle_selection):
+def create_gradio_interface(
+    json_data_str,
+    handle_selection,
+    update_json_data_fn=None,
+    default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
+):
    """Create the Gradio interface for configuration selection.

    Args:
        json_data_str: JSON string containing profiling data
        handle_selection: Selection handler function
+        update_json_data_fn: Optional function that takes (gpu_cost_per_hour) and returns updated JSON string.
+        default_gpu_cost_per_hour: Default GPU cost per hour used to initialize the input box.

    Returns:
        gr.Blocks: Configured Gradio demo
@@ -320,6 +425,7 @@ def create_gradio_interface(json_data_str, handle_selection):
        inject_profiling_assets()

        gr.Markdown("# 📊 Profiling Results - Select Configuration")
+
        gr.Markdown(
            """
            **Two ways to select prefill and decode configs:**
@@ -333,6 +439,20 @@ def create_gradio_interface(json_data_str, handle_selection):
            """
        )

+        with gr.Row():
+            gpu_cost_per_hour = gr.Number(
+                label="GPU cost per hour ($/GPU/hour)",
+                value=default_gpu_cost_per_hour,
+                minimum=0,
+                precision=4,
+            )
+        if update_json_data_fn is not None:
+            gpu_cost_per_hour.change(
+                fn=update_json_data_fn,
+                inputs=[gpu_cost_per_hour],
+                outputs=[json_data],
+            )
+
        # Performance Results Section (reused from AIC profiling module)
        create_performance_results_section()