Unverified commit b24ccd29, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: allow user to input gpu cost in profiler webui (#4935)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent c2a29f80
......@@ -32,7 +32,7 @@ AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
# Cost calculation defaults
# TODO: allow user to configure this in GUI
GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
DEFAULT_GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
class EngineType(str, Enum):
......
......@@ -21,7 +21,7 @@ import numpy as np
from matplotlib import cm
from scipy.interpolate import griddata
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__)
......@@ -315,14 +315,16 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
ttft = []
for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
ttft.append(_p_ttft)
prefill_cost = isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
prefill_cost = isl * 1000 / _p_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600
tokens_per_user.append(1000 / d_itl)
cost.append(osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600 + prefill_cost)
cost.append(
osl * 1000 / d_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600 + prefill_cost
)
# plot
plt.figure(figsize=(12, 10))
plt.title(
f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA"
f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${DEFAULT_GPU_COST_PER_HOUR}) Under Different SLA"
)
for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft):
line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0]
......
......@@ -3,17 +3,14 @@
import json
import logging
import os
import queue
from pathlib import Path
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.webui.utils import (
PlotType,
create_gpu_cost_update_handler,
create_gradio_interface,
create_selection_handler,
populate_cost_data,
populate_decode_data,
populate_prefill_data,
generate_config_data,
wait_for_selection,
)
......@@ -28,55 +25,6 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def generate_config_data(prefill_data, decode_data, args):
    """
    Generate JSON data file for WebUI from profiling results.

    Loads ``data_template.json`` from this module's directory, fills in the
    SLA target lines and the cost chart title from ``args``, populates the
    prefill/decode/cost sections, and writes the result to
    ``<args.output_dir>/webui_data.json``.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir

    Returns:
        dict: The populated data dict that was written to disk. See
        https://github.com/ai-dynamo/aiconfigurator/blob/main/src/aiconfigurator/webapp/components/profiling/standalone/sample_profiling_data.json
        for an example of the format.
    """
    # Load template (JSON is UTF-8 by specification; do not rely on the locale)
    template_path = Path(__file__).parent / "data_template.json"
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Construct output path
    output_path = os.path.join(args.output_dir, "webui_data.json")

    # Set SLA targets
    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
    prefill_target["value"] = args.ttft
    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"

    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
    decode_target["value"] = args.itl
    decode_target["label"] = f"Target ITL: {args.itl} ms"

    data[PlotType.COST]["chart"][
        "title"
    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"

    # Populate data sections
    populate_prefill_data(data, prefill_data)
    populate_decode_data(data, decode_data)
    populate_cost_data(data, prefill_data, decode_data, args)

    # Save JSON file
    output_dir = os.path.dirname(output_path)
    # An empty output_dir means "current directory"; os.makedirs("") would
    # raise FileNotFoundError, so only create the directory when one is named.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    logger.info(f"Generated WebUI config data at {output_path}")
    return data
def pick_config_with_webui(prefill_data, decode_data, args):
"""
Launch WebUI for user to pick configurations.
......@@ -89,13 +37,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
Returns:
tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
"""
# Generate JSON data file and load it
generate_config_data(prefill_data, decode_data, args)
output_path = os.path.join(args.output_dir, "webui_data.json")
with open(output_path, "r") as f:
json_data_str = f.read()
data_dict = json.loads(json_data_str)
# Generate JSON data (also writes default JSON file for convenience)
data_dict = generate_config_data(
prefill_data,
decode_data,
args,
gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
write_to_disk=True,
)
json_data_str = json.dumps(data_dict)
logger.info(f"Launching WebUI on port {args.webui_port}...")
......@@ -107,9 +57,23 @@ def pick_config_with_webui(prefill_data, decode_data, args):
decode_selection = {"idx": None}
# Create selection handler and Gradio interface
data_dict_ref = {"data": data_dict}
handle_selection = create_selection_handler(
data_dict, selection_queue, prefill_selection, decode_selection
data_dict_ref, selection_queue, prefill_selection, decode_selection
)
update_gpu_cost_per_hour = create_gpu_cost_update_handler(
prefill_data=prefill_data,
decode_data=decode_data,
args=args,
data_dict_ref=data_dict_ref,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
demo = create_gradio_interface(
json_data_str,
handle_selection,
update_json_data_fn=update_gpu_cost_per_hour,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
demo = create_gradio_interface(json_data_str, handle_selection)
return wait_for_selection(demo, selection_queue, args.webui_port)
......@@ -3,9 +3,11 @@
import json
import logging
import os
import queue
import threading
from enum import Enum
from pathlib import Path
import gradio as gr
import numpy as np
......@@ -16,7 +18,7 @@ from aiconfigurator.webapp.components.profiling import (
load_profiling_javascript,
)
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__)
......@@ -47,6 +49,95 @@ CHART_COLORS = [
WEB_UI_SELECTION_TIMEOUT = 3600
def generate_config_data(
    prefill_data,
    decode_data,
    args,
    gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
    write_to_disk: bool = True,
):
    """
    Generate the WebUI data dict from profiling results.

    Loads ``data_template.json`` from this module's directory, fills in the
    SLA target lines and the cost chart title from ``args``, populates the
    prefill/decode/cost sections, and optionally writes the result to disk.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir.
            ``output_dir`` is only read when ``write_to_disk`` is true.
        gpu_cost_per_hour: GPU cost in $/GPU/hour used for cost plot/table
        write_to_disk: Whether to write the generated JSON to args.output_dir/webui_data.json

    Returns:
        dict: Data dict for WebUI consumption.
    """
    # Load template (JSON is UTF-8 by specification; do not rely on the locale)
    template_path = Path(__file__).parent / "data_template.json"
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Set SLA targets
    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
    prefill_target["value"] = args.ttft
    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"

    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
    decode_target["value"] = args.itl
    decode_target["label"] = f"Target ITL: {args.itl} ms"

    data[PlotType.COST]["chart"][
        "title"
    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"

    # Populate data sections
    populate_prefill_data(data, prefill_data)
    populate_decode_data(data, decode_data)
    populate_cost_data(
        data, prefill_data, decode_data, args, gpu_cost_per_hour=gpu_cost_per_hour
    )

    # Save JSON file (optional)
    if write_to_disk:
        # Build the path only when it is needed, so in-memory regeneration
        # does not depend on args.output_dir at all.
        output_path = os.path.join(args.output_dir, "webui_data.json")
        output_dir = os.path.dirname(output_path)
        # An empty output_dir means "current directory"; os.makedirs("") would
        # raise FileNotFoundError, so only create the directory when one is named.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        logger.info(f"Generated WebUI config data at {output_path}")

    return data
def create_gpu_cost_update_handler(
    *,
    prefill_data,
    decode_data,
    args,
    data_dict_ref,
    default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
    """Create a Gradio change-handler that regenerates profiling JSON when GPU cost changes.

    Args:
        prefill_data: PrefillProfileData instance forwarded to generate_config_data
        decode_data: DecodeProfileData instance forwarded to generate_config_data
        args: Arguments forwarded to generate_config_data
        data_dict_ref: Dict wrapper whose "data" entry is replaced with the
            regenerated data dict on every update
        default_gpu_cost_per_hour: Fallback cost used when the submitted value
            is not a usable number

    Returns:
        Callable taking the submitted GPU cost and returning the regenerated
        data serialized as a JSON string.
    """
    import math  # local import: only needed for the validation below

    def update_gpu_cost_per_hour(gpu_cost_per_hour):
        # Best-effort parsing: an empty/cleared input arrives as a value that
        # float() rejects, in which case the default cost is used instead.
        try:
            gpu_cost = float(gpu_cost_per_hour)
        except (TypeError, ValueError, OverflowError):
            gpu_cost = default_gpu_cost_per_hour
        else:
            # NaN/inf would be serialized by json.dumps as NaN/Infinity, which
            # is not valid JSON; negative costs contradict the input's
            # minimum of 0. Fall back to the default in both cases.
            if not math.isfinite(gpu_cost) or gpu_cost < 0:
                gpu_cost = default_gpu_cost_per_hour

        new_data = generate_config_data(
            prefill_data,
            decode_data,
            args,
            gpu_cost_per_hour=gpu_cost,
            write_to_disk=False,
        )
        # Publish the regenerated data so other handlers sharing this wrapper
        # read the latest version.
        data_dict_ref["data"] = new_data
        return json.dumps(new_data)

    return update_gpu_cost_per_hour
def populate_prefill_data(data, prefill_data):
"""Populate prefill chart and table data."""
if not prefill_data.num_gpus:
......@@ -141,7 +232,13 @@ def populate_decode_data(data, decode_data):
data[PlotType.DECODE]["table"]["data"] = table_data
def populate_cost_data(data, prefill_data, decode_data, args):
def populate_cost_data(
data,
prefill_data,
decode_data,
args,
gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
"""Populate cost chart and table data with pareto-optimal configurations."""
if not prefill_data.num_gpus or not decode_data.num_gpus:
return
......@@ -170,13 +267,13 @@ def populate_cost_data(data, prefill_data, decode_data, args):
for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
# Calculate prefill cost (fixed for this line)
prefill_cost = args.isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600
prefill_cost = args.isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600
# For each decode config, calculate total cost
line_data = []
for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)):
# Calculate decode cost
decode_cost = args.osl * 1000 / _d_thpt * GPU_COST_PER_HOUR / 3600
decode_cost = args.osl * 1000 / _d_thpt * gpu_cost_per_hour / 3600
total_cost = prefill_cost + decode_cost
# X-axis: tokens per user (based on ITL)
......@@ -230,12 +327,12 @@ def populate_cost_data(data, prefill_data, decode_data, args):
def create_selection_handler(
data_dict, selection_queue, prefill_selection, decode_selection
data_dict_ref, selection_queue, prefill_selection, decode_selection
):
"""Create a selection handler closure for the WebUI.
Args:
data_dict: Parsed JSON data containing cost index mapping
data_dict_ref: Dict wrapper holding the latest parsed JSON data (mutated when UI inputs change)
selection_queue: Queue to communicate selections to main thread
prefill_selection: Dict tracking prefill selection state
decode_selection: Dict tracking decode selection state
......@@ -250,6 +347,7 @@ def create_selection_handler(
return
try:
data_dict = data_dict_ref["data"]
selection = json.loads(selection_json)
plot_type = selection.get("plotType")
row_idx = selection.get("rowIndex")
......@@ -299,12 +397,19 @@ def create_selection_handler(
return handle_selection
def create_gradio_interface(json_data_str, handle_selection):
def create_gradio_interface(
json_data_str,
handle_selection,
update_json_data_fn=None,
default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
"""Create the Gradio interface for configuration selection.
Args:
json_data_str: JSON string containing profiling data
handle_selection: Selection handler function
update_json_data_fn: Optional function that takes (gpu_cost_per_hour) and returns updated JSON string.
default_gpu_cost_per_hour: Default GPU cost per hour used to initialize the input box.
Returns:
gr.Blocks: Configured Gradio demo
......@@ -320,6 +425,7 @@ def create_gradio_interface(json_data_str, handle_selection):
inject_profiling_assets()
gr.Markdown("# 📊 Profiling Results - Select Configuration")
gr.Markdown(
"""
**Two ways to select prefill and decode configs:**
......@@ -333,6 +439,20 @@ def create_gradio_interface(json_data_str, handle_selection):
"""
)
with gr.Row():
gpu_cost_per_hour = gr.Number(
label="GPU cost per hour ($/GPU/hour)",
value=default_gpu_cost_per_hour,
minimum=0,
precision=4,
)
if update_json_data_fn is not None:
gpu_cost_per_hour.change(
fn=update_json_data_fn,
inputs=[gpu_cost_per_hour],
outputs=[json_data],
)
# Performance Results Section (reused from AIC profiling module)
create_performance_results_section()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment