"ssh:/git@developer.sourcefind.cn:2222/OpenDAS/dynamo.git" did not exist on "71f9e7a9de9f45c1a44a1753c1ce45a8115452cb"
Unverified commit b24ccd29, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: allow user to input gpu cost in profiler webui (#4935)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent c2a29f80
...@@ -32,7 +32,7 @@ AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4 ...@@ -32,7 +32,7 @@ AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
# Cost calculation defaults # Cost calculation defaults
# TODO: allow user to configure this in GUI # TODO: allow user to configure this in GUI
GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars DEFAULT_GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
class EngineType(str, Enum): class EngineType(str, Enum):
......
...@@ -21,7 +21,7 @@ import numpy as np ...@@ -21,7 +21,7 @@ import numpy as np
from matplotlib import cm from matplotlib import cm
from scipy.interpolate import griddata from scipy.interpolate import griddata
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -315,14 +315,16 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir): ...@@ -315,14 +315,16 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
ttft = [] ttft = []
for _p_ttft, _p_thpt in zip(p_ttft, p_thpt): for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
ttft.append(_p_ttft) ttft.append(_p_ttft)
prefill_cost = isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600 prefill_cost = isl * 1000 / _p_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600
tokens_per_user.append(1000 / d_itl) tokens_per_user.append(1000 / d_itl)
cost.append(osl * 1000 / d_thpt * GPU_COST_PER_HOUR / 3600 + prefill_cost) cost.append(
osl * 1000 / d_thpt * DEFAULT_GPU_COST_PER_HOUR / 3600 + prefill_cost
)
# plot # plot
plt.figure(figsize=(12, 10)) plt.figure(figsize=(12, 10))
plt.title( plt.title(
f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${GPU_COST_PER_HOUR}) Under Different SLA" f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${DEFAULT_GPU_COST_PER_HOUR}) Under Different SLA"
) )
for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft): for _tokens_per_user, _cost, _ttft in zip(tokens_per_user, cost, ttft):
line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0] line = plt.plot(_tokens_per_user, _cost, label=f"TTFT: {_ttft:.2f}ms")[0]
......
...@@ -3,17 +3,14 @@ ...@@ -3,17 +3,14 @@
import json import json
import logging import logging
import os
import queue import queue
from pathlib import Path
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.webui.utils import ( from benchmarks.profiler.webui.utils import (
PlotType, create_gpu_cost_update_handler,
create_gradio_interface, create_gradio_interface,
create_selection_handler, create_selection_handler,
populate_cost_data, generate_config_data,
populate_decode_data,
populate_prefill_data,
wait_for_selection, wait_for_selection,
) )
...@@ -28,55 +25,6 @@ console_handler.setFormatter(formatter) ...@@ -28,55 +25,6 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler) logger.addHandler(console_handler)
def generate_config_data(prefill_data, decode_data, args):
    """
    Generate the JSON data file for the WebUI from profiling results.

    Loads ``data_template.json`` from this module's directory, fills in the
    SLA target lines and the cost chart title from ``args``, populates the
    prefill, decode and cost sections, and writes the result to
    ``<args.output_dir>/webui_data.json``.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir

    Returns:
        dict: The populated data that was written to disk. For the expected layout see
        https://github.com/ai-dynamo/aiconfigurator/blob/main/src/aiconfigurator/webapp/components/profiling/standalone/sample_profiling_data.json
    """
    # Load template
    template_path = Path(__file__).parent / "data_template.json"
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Set SLA targets
    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
    prefill_target["value"] = args.ttft
    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"

    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
    decode_target["value"] = args.itl
    decode_target["label"] = f"Target ITL: {args.itl} ms"

    data[PlotType.COST]["chart"][
        "title"
    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"

    # Populate data sections
    populate_prefill_data(data, prefill_data)
    populate_decode_data(data, decode_data)
    populate_cost_data(data, prefill_data, decode_data, args)

    # Save JSON file
    output_path = os.path.join(args.output_dir, "webui_data.json")
    output_dir = os.path.dirname(output_path)
    # os.makedirs("") raises FileNotFoundError, so skip directory creation when
    # the output lands in the current working directory.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    logger.info(f"Generated WebUI config data at {output_path}")
    return data
def pick_config_with_webui(prefill_data, decode_data, args): def pick_config_with_webui(prefill_data, decode_data, args):
""" """
Launch WebUI for user to pick configurations. Launch WebUI for user to pick configurations.
...@@ -89,13 +37,15 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -89,13 +37,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
Returns: Returns:
tuple[int, int]: (selected_prefill_idx, selected_decode_idx) tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
""" """
# Generate JSON data file and load it # Generate JSON data (also writes default JSON file for convenience)
generate_config_data(prefill_data, decode_data, args) data_dict = generate_config_data(
prefill_data,
output_path = os.path.join(args.output_dir, "webui_data.json") decode_data,
with open(output_path, "r") as f: args,
json_data_str = f.read() gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
data_dict = json.loads(json_data_str) write_to_disk=True,
)
json_data_str = json.dumps(data_dict)
logger.info(f"Launching WebUI on port {args.webui_port}...") logger.info(f"Launching WebUI on port {args.webui_port}...")
...@@ -107,9 +57,23 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -107,9 +57,23 @@ def pick_config_with_webui(prefill_data, decode_data, args):
decode_selection = {"idx": None} decode_selection = {"idx": None}
# Create selection handler and Gradio interface # Create selection handler and Gradio interface
data_dict_ref = {"data": data_dict}
handle_selection = create_selection_handler( handle_selection = create_selection_handler(
data_dict, selection_queue, prefill_selection, decode_selection data_dict_ref, selection_queue, prefill_selection, decode_selection
)
update_gpu_cost_per_hour = create_gpu_cost_update_handler(
prefill_data=prefill_data,
decode_data=decode_data,
args=args,
data_dict_ref=data_dict_ref,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
demo = create_gradio_interface(
json_data_str,
handle_selection,
update_json_data_fn=update_gpu_cost_per_hour,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
) )
demo = create_gradio_interface(json_data_str, handle_selection)
return wait_for_selection(demo, selection_queue, args.webui_port) return wait_for_selection(demo, selection_queue, args.webui_port)
...@@ -3,9 +3,11 @@ ...@@ -3,9 +3,11 @@
import json import json
import logging import logging
import os
import queue import queue
import threading import threading
from enum import Enum from enum import Enum
from pathlib import Path
import gradio as gr import gradio as gr
import numpy as np import numpy as np
...@@ -16,7 +18,7 @@ from aiconfigurator.webapp.components.profiling import ( ...@@ -16,7 +18,7 @@ from aiconfigurator.webapp.components.profiling import (
load_profiling_javascript, load_profiling_javascript,
) )
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -47,6 +49,95 @@ CHART_COLORS = [ ...@@ -47,6 +49,95 @@ CHART_COLORS = [
WEB_UI_SELECTION_TIMEOUT = 3600 WEB_UI_SELECTION_TIMEOUT = 3600
def generate_config_data(
    prefill_data,
    decode_data,
    args,
    gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
    write_to_disk: bool = True,
):
    """
    Generate the WebUI data dict from profiling results.

    Loads ``data_template.json`` from this module's directory, fills in the
    SLA target lines and the cost chart title from ``args``, then populates
    the prefill, decode and cost sections. Optionally writes the result to
    disk as JSON.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir
        gpu_cost_per_hour: GPU cost in $/GPU/hour used for cost plot/table
        write_to_disk: Whether to write the generated JSON to args.output_dir/webui_data.json

    Returns:
        dict: Data dict for WebUI consumption.
    """
    # Load template
    template_path = Path(__file__).parent / "data_template.json"
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Set SLA targets
    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
    prefill_target["value"] = args.ttft
    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"

    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
    decode_target["value"] = args.itl
    decode_target["label"] = f"Target ITL: {args.itl} ms"

    data[PlotType.COST]["chart"][
        "title"
    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"

    # Populate data sections
    populate_prefill_data(data, prefill_data)
    populate_decode_data(data, decode_data)
    populate_cost_data(
        data, prefill_data, decode_data, args, gpu_cost_per_hour=gpu_cost_per_hour
    )

    # Save JSON file (optional)
    if write_to_disk:
        # The output path is only needed on the write path.
        output_path = os.path.join(args.output_dir, "webui_data.json")
        output_dir = os.path.dirname(output_path)
        # os.makedirs("") raises FileNotFoundError, so skip directory creation
        # when the output lands in the current working directory.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4)
        logger.info(f"Generated WebUI config data at {output_path}")

    return data
def create_gpu_cost_update_handler(
    *,
    prefill_data,
    decode_data,
    args,
    data_dict_ref,
    default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
    """Create a Gradio change-handler that regenerates profiling JSON when GPU cost changes.

    Args:
        prefill_data: Prefill profiling data forwarded to ``generate_config_data``.
        decode_data: Decode profiling data forwarded to ``generate_config_data``.
        args: Arguments forwarded to ``generate_config_data``.
        data_dict_ref: Dict whose ``"data"`` entry is replaced with the freshly
            generated data each time the handler runs.
        default_gpu_cost_per_hour: Cost used when the submitted value is not a
            usable number (unparsable, non-finite, or negative).

    Returns:
        Callable taking the submitted GPU cost and returning the regenerated
        data serialized as a JSON string.
    """
    # Function-scope import keeps this block self-contained.
    import math

    def update_gpu_cost_per_hour(gpu_cost_per_hour):
        try:
            gpu_cost = float(gpu_cost_per_hour)
        except (TypeError, ValueError, OverflowError):
            # float() raises TypeError for None, ValueError for unparsable
            # text, and OverflowError for integers too large for a float.
            gpu_cost = None
        else:
            # NaN/inf would presumably propagate into the generated cost
            # values, and json.dumps would then emit non-standard NaN/Infinity
            # tokens; a negative cost is rejected as meaningless.
            if not math.isfinite(gpu_cost) or gpu_cost < 0:
                gpu_cost = None

        if gpu_cost is None:
            logger.warning(
                "Invalid GPU cost per hour %r; falling back to default %s",
                gpu_cost_per_hour,
                default_gpu_cost_per_hour,
            )
            gpu_cost = default_gpu_cost_per_hour

        new_data = generate_config_data(
            prefill_data,
            decode_data,
            args,
            gpu_cost_per_hour=gpu_cost,
            write_to_disk=False,
        )
        # Publish the new data so code holding data_dict_ref sees the latest values.
        data_dict_ref["data"] = new_data
        return json.dumps(new_data)

    return update_gpu_cost_per_hour
def populate_prefill_data(data, prefill_data): def populate_prefill_data(data, prefill_data):
"""Populate prefill chart and table data.""" """Populate prefill chart and table data."""
if not prefill_data.num_gpus: if not prefill_data.num_gpus:
...@@ -141,7 +232,13 @@ def populate_decode_data(data, decode_data): ...@@ -141,7 +232,13 @@ def populate_decode_data(data, decode_data):
data[PlotType.DECODE]["table"]["data"] = table_data data[PlotType.DECODE]["table"]["data"] = table_data
def populate_cost_data(data, prefill_data, decode_data, args): def populate_cost_data(
data,
prefill_data,
decode_data,
args,
gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
"""Populate cost chart and table data with pareto-optimal configurations.""" """Populate cost chart and table data with pareto-optimal configurations."""
if not prefill_data.num_gpus or not decode_data.num_gpus: if not prefill_data.num_gpus or not decode_data.num_gpus:
return return
...@@ -170,13 +267,13 @@ def populate_cost_data(data, prefill_data, decode_data, args): ...@@ -170,13 +267,13 @@ def populate_cost_data(data, prefill_data, decode_data, args):
for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)): for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
# Calculate prefill cost (fixed for this line) # Calculate prefill cost (fixed for this line)
prefill_cost = args.isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600 prefill_cost = args.isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600
# For each decode config, calculate total cost # For each decode config, calculate total cost
line_data = [] line_data = []
for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)): for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)):
# Calculate decode cost # Calculate decode cost
decode_cost = args.osl * 1000 / _d_thpt * GPU_COST_PER_HOUR / 3600 decode_cost = args.osl * 1000 / _d_thpt * gpu_cost_per_hour / 3600
total_cost = prefill_cost + decode_cost total_cost = prefill_cost + decode_cost
# X-axis: tokens per user (based on ITL) # X-axis: tokens per user (based on ITL)
...@@ -230,12 +327,12 @@ def populate_cost_data(data, prefill_data, decode_data, args): ...@@ -230,12 +327,12 @@ def populate_cost_data(data, prefill_data, decode_data, args):
def create_selection_handler( def create_selection_handler(
data_dict, selection_queue, prefill_selection, decode_selection data_dict_ref, selection_queue, prefill_selection, decode_selection
): ):
"""Create a selection handler closure for the WebUI. """Create a selection handler closure for the WebUI.
Args: Args:
data_dict: Parsed JSON data containing cost index mapping data_dict_ref: Dict wrapper holding the latest parsed JSON data (mutated when UI inputs change)
selection_queue: Queue to communicate selections to main thread selection_queue: Queue to communicate selections to main thread
prefill_selection: Dict tracking prefill selection state prefill_selection: Dict tracking prefill selection state
decode_selection: Dict tracking decode selection state decode_selection: Dict tracking decode selection state
...@@ -250,6 +347,7 @@ def create_selection_handler( ...@@ -250,6 +347,7 @@ def create_selection_handler(
return return
try: try:
data_dict = data_dict_ref["data"]
selection = json.loads(selection_json) selection = json.loads(selection_json)
plot_type = selection.get("plotType") plot_type = selection.get("plotType")
row_idx = selection.get("rowIndex") row_idx = selection.get("rowIndex")
...@@ -299,12 +397,19 @@ def create_selection_handler( ...@@ -299,12 +397,19 @@ def create_selection_handler(
return handle_selection return handle_selection
def create_gradio_interface(json_data_str, handle_selection): def create_gradio_interface(
json_data_str,
handle_selection,
update_json_data_fn=None,
default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
"""Create the Gradio interface for configuration selection. """Create the Gradio interface for configuration selection.
Args: Args:
json_data_str: JSON string containing profiling data json_data_str: JSON string containing profiling data
handle_selection: Selection handler function handle_selection: Selection handler function
update_json_data_fn: Optional function that takes (gpu_cost_per_hour) and returns updated JSON string.
default_gpu_cost_per_hour: Default GPU cost per hour used to initialize the input box.
Returns: Returns:
gr.Blocks: Configured Gradio demo gr.Blocks: Configured Gradio demo
...@@ -320,6 +425,7 @@ def create_gradio_interface(json_data_str, handle_selection): ...@@ -320,6 +425,7 @@ def create_gradio_interface(json_data_str, handle_selection):
inject_profiling_assets() inject_profiling_assets()
gr.Markdown("# 📊 Profiling Results - Select Configuration") gr.Markdown("# 📊 Profiling Results - Select Configuration")
gr.Markdown( gr.Markdown(
""" """
**Two ways to select prefill and decode configs:** **Two ways to select prefill and decode configs:**
...@@ -333,6 +439,20 @@ def create_gradio_interface(json_data_str, handle_selection): ...@@ -333,6 +439,20 @@ def create_gradio_interface(json_data_str, handle_selection):
""" """
) )
with gr.Row():
gpu_cost_per_hour = gr.Number(
label="GPU cost per hour ($/GPU/hour)",
value=default_gpu_cost_per_hour,
minimum=0,
precision=4,
)
if update_json_data_fn is not None:
gpu_cost_per_hour.change(
fn=update_json_data_fn,
inputs=[gpu_cost_per_hour],
outputs=[json_data],
)
# Performance Results Section (reused from AIC profiling module) # Performance Results Section (reused from AIC profiling module)
create_performance_results_section() create_performance_results_section()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment