Unverified Commit 6b5842ee authored by hhzhang16's avatar hhzhang16 Committed by GitHub
Browse files

feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes,...


feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes, preview configs (#4968)
Signed-off-by: default avatarHannah Zhang <hannahz@nvidia.com>
parent 74fcd4a9
...@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import ( ...@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator, profile_prefill_aiconfigurator,
) )
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui from benchmarks.profiler.webui.select_config import (
add_profiling_error,
clear_profiling_errors,
pick_config_with_webui,
)
from deploy.utils.dynamo_deployment import ( from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient, DynamoDeploymentClient,
cleanup_remaining_deployments, cleanup_remaining_deployments,
...@@ -131,6 +135,9 @@ async def run_profile(args): ...@@ -131,6 +135,9 @@ async def run_profile(args):
# List to track all created deployment clients for cleanup in case of failure # List to track all created deployment clients for cleanup in case of failure
deployment_clients = [] deployment_clients = []
# Clear any errors from previous profiling runs
clear_profiling_errors()
# Inherit aic_backend from backend if not explicitly set # Inherit aic_backend from backend if not explicitly set
if not args.aic_backend: if not args.aic_backend:
args.aic_backend = args.backend args.aic_backend = args.backend
...@@ -476,7 +483,9 @@ async def run_profile(args): ...@@ -476,7 +483,9 @@ async def run_profile(args):
logger.info("Analyzing results and generate recommendations...") logger.info("Analyzing results and generate recommendations...")
# Safety guards: no results → exit early with a clear message # Safety guards: no results → exit early with a clear message
if not prefill_data.num_gpus: if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.") error_msg = "No prefill results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
return return
if args.pick_with_webui: if args.pick_with_webui:
...@@ -488,9 +497,9 @@ async def run_profile(args): ...@@ -488,9 +497,9 @@ async def run_profile(args):
# automatically select P/D config within SLA with the highest throughput/GPU # automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill # select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft: if min(prefill_data.ttft) > args.ttft:
logger.warning( warning_msg = "No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware" logger.warning(warning_msg)
) add_profiling_error(warning_msg)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft))) selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else: else:
valid_indices = [ valid_indices = [
...@@ -508,14 +517,14 @@ async def run_profile(args): ...@@ -508,14 +517,14 @@ async def run_profile(args):
# select best parallel mapping for decode # select best parallel mapping for decode
if not decode_data.num_gpus: if not decode_data.num_gpus:
logger.error( error_msg = "No decode results produced; skipping recommendations."
"No decode results produced; skipping recommendations." logger.error(error_msg)
) add_profiling_error(error_msg)
return return
if min(decode_data.itl) > args.itl: if min(decode_data.itl) > args.itl:
logger.warning( warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware" logger.warning(warning_msg)
) add_profiling_error(warning_msg)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl))) selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else: else:
valid_indices = [ valid_indices = [
......
...@@ -76,11 +76,11 @@ ...@@ -76,11 +76,11 @@
"min": 0 "min": 0
}, },
"y": { "y": {
"title": "Cost ($)", "title": "GPU Hours",
"min": 0 "min": 0
} }
}, },
"title": "Cost Per 1000 ? requests" "title": "GPU Hours Per 1000 ? requests"
}, },
"table": { "table": {
"columns": [ "columns": [
...@@ -89,7 +89,7 @@ ...@@ -89,7 +89,7 @@
"ITL (ms)", "ITL (ms)",
"Decode Thpt (tokens/s/GPU)", "Decode Thpt (tokens/s/GPU)",
"Tokens/User", "Tokens/User",
"Cost ($)", "GPU Hours",
"Action" "Action"
], ],
"data": [] "data": []
......
...@@ -5,15 +5,18 @@ import json ...@@ -5,15 +5,18 @@ import json
import logging import logging
import queue import queue
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.webui.utils import ( from benchmarks.profiler.webui.utils import (
create_gpu_cost_update_handler, add_profiling_error,
clear_profiling_errors,
create_gradio_interface, create_gradio_interface,
create_selection_handler, create_selection_handler,
generate_config_data, generate_config_data,
wait_for_selection, wait_for_selection,
) )
# Re-export for use by profiler modules
__all__ = ["pick_config_with_webui", "add_profiling_error", "clear_profiling_errors"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
Returns: Returns:
tuple[int, int]: (selected_prefill_idx, selected_decode_idx) tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
""" """
# Generate JSON data (also writes default JSON file for convenience) # Note: Don't clear profiling errors here - they should be accumulated
# during the profiling run and displayed in the WebUI.
# clear_profiling_errors() should be called at the start of a new profiling run.
# Generate JSON data with GPU hours (frontend handles cost conversion)
data_dict = generate_config_data( data_dict = generate_config_data(
prefill_data, prefill_data,
decode_data, decode_data,
args, args,
gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
write_to_disk=True, write_to_disk=True,
) )
json_data_str = json.dumps(data_dict) json_data_str = json.dumps(data_dict)
...@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args):
handle_selection = create_selection_handler( handle_selection = create_selection_handler(
data_dict_ref, selection_queue, prefill_selection, decode_selection data_dict_ref, selection_queue, prefill_selection, decode_selection
) )
update_gpu_cost_per_hour = create_gpu_cost_update_handler(
prefill_data=prefill_data,
decode_data=decode_data,
args=args,
data_dict_ref=data_dict_ref,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
# Note: GPU hours -> Cost conversion is handled by frontend JavaScript (gpu_cost_toggle.js)
demo = create_gradio_interface( demo = create_gradio_interface(
json_data_str, json_data_str,
handle_selection, handle_selection,
update_json_data_fn=update_gpu_cost_per_hour,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
) )
return wait_for_selection(demo, selection_queue, args.webui_port) return wait_for_selection(demo, selection_queue, args.webui_port)
...@@ -6,11 +6,13 @@ import logging ...@@ -6,11 +6,13 @@ import logging
import os import os
import queue import queue
import threading import threading
import time
from enum import Enum from enum import Enum
from pathlib import Path from pathlib import Path
import gradio as gr import gradio as gr
import numpy as np import numpy as np
import yaml
from aiconfigurator.webapp.components.profiling import ( from aiconfigurator.webapp.components.profiling import (
create_performance_results_section, create_performance_results_section,
create_profiling_ui_components, create_profiling_ui_components,
...@@ -18,12 +20,187 @@ from aiconfigurator.webapp.components.profiling import ( ...@@ -18,12 +20,187 @@ from aiconfigurator.webapp.components.profiling import (
load_profiling_javascript, load_profiling_javascript,
) )
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Global variable to track selection completion for graceful shutdown
_selection_complete = threading.Event()
# Global error state for propagating profiling errors to WebUI
_profiling_errors: list[str] = []
def add_profiling_error(error_message: str) -> None:
    """Record a profiling error and log it.

    The message is appended to the module-level ``_profiling_errors`` list
    (read back through ``get_profiling_errors``) and is also emitted through
    this module's logger at ERROR level.

    Args:
        error_message: Human-readable description of the problem.
    """
    # Store first so the message is retained even if logging is misconfigured.
    _profiling_errors.append(error_message)

    log_line = f"Profiling error: {error_message}"
    logger.error(log_line)
def get_profiling_errors() -> list[str]:
    """Return a snapshot of the recorded profiling errors.

    Returns:
        A new list holding the messages currently stored in the module-level
        ``_profiling_errors`` list. Mutating the returned list does not affect
        the stored errors.
    """
    # Shallow copy: callers get their own list object.
    return list(_profiling_errors)
def clear_profiling_errors() -> None:
    """Remove every recorded profiling error.

    The module-level ``_profiling_errors`` list is emptied in place, so the
    same list object keeps being used by the other accessors.
    """
    del _profiling_errors[:]
def generate_dgd_worker_config_yaml(
    parallel_mapping,
    engine_type: str,
    model: str | None = None,
    backend: str | None = None,
    ttft_or_itl: float | None = None,
    thpt_per_gpu: float | None = None,
) -> str:
    """Render a single-worker DGD service snippet as commented YAML text.

    The output is a block of ``#`` comment lines carrying profiling metadata,
    followed by a YAML mapping with one service entry named
    ``"<Engine_type capitalized>Worker"``. It is a display preview only: the
    header itself notes that the final config adds backend-specific args and
    planner configuration.

    Args:
        parallel_mapping: Object providing ``get_num_gpus()`` and ``label()``
            (a ParallelizationMapping instance).
        engine_type: "prefill" or "decode". Selects whether ``ttft_or_itl`` is
            reported as TTFT or ITL; any other value omits that line.
        model: Model name/path; added to the header when truthy.
        backend: Backend name (sglang, vllm, trtllm); added to the header when
            truthy.
        ttft_or_itl: TTFT (prefill) or ITL (decode) in ms, rounded to 2
            decimals in the header.
        thpt_per_gpu: Throughput in tokens/s/GPU, rounded to 2 decimals in the
            header.

    Returns:
        The header comment lines and the YAML body joined by newlines.
    """
    gpu_count = parallel_mapping.get_num_gpus()

    # Header: fixed metadata first, then optional profiling figures.
    comments = [
        "# DynamoGraphDeployment Worker Config",
        f"# Engine: {engine_type}",
        f"# Num GPUs: {gpu_count}",
        f"# Parallelization: {parallel_mapping.label()}",
    ]

    if ttft_or_itl is not None:
        # The same argument carries TTFT for prefill and ITL for decode.
        if engine_type == "prefill":
            comments.append(f"# Profiled TTFT: {round(ttft_or_itl, 2)} ms")
        elif engine_type == "decode":
            comments.append(f"# Profiled ITL: {round(ttft_or_itl, 2)} ms")

    if thpt_per_gpu is not None:
        comments.append(
            f"# Profiled Throughput: {round(thpt_per_gpu, 2)} tokens/s/GPU"
        )

    if model:
        comments.append(f"# Model: {model}")
    if backend:
        comments.append(f"# Backend: {backend}")

    comments.extend(
        [
            "#",
            "# Note: Final config generated after selection includes",
            "# backend-specific args and planner configuration.",
        ]
    )

    # Body: one DGD-style service entry. Actual args vary by backend, so only
    # the structural fields are shown; the GPU limit is emitted as a string.
    service = {
        f"{engine_type.capitalize()}Worker": {
            "componentType": "worker",
            "subComponentType": engine_type,
            "replicas": 1,
            "resources": {
                "limits": {
                    "gpu": str(gpu_count),
                }
            },
        }
    }
    # sort_keys=False keeps the field order written above.
    rendered = yaml.dump(service, default_flow_style=False, sort_keys=False)

    return "\n".join(comments) + "\n" + rendered
def generate_dgd_config_yaml_for_display(
    prefill_mapping,
    decode_mapping,
    model: str | None = None,
    backend: str | None = None,
) -> str:
    """Render a combined prefill + decode DGD preview as commented YAML text.

    The output is a block of ``#`` comment lines (GPU counts, parallelization
    labels, optional model/backend) followed by a YAML document of kind
    ``DynamoGraphDeployment`` with a ``PrefillWorker`` and a ``DecodeWorker``
    service, each with one replica.

    Args:
        prefill_mapping: Object providing ``get_num_gpus()`` and ``label()``
            for the prefill worker (a ParallelizationMapping).
        decode_mapping: Same, for the decode worker.
        model: Model name/path; added to the header when truthy.
        backend: Backend name; added to the header when truthy.

    Returns:
        The header comment lines and the YAML body joined by newlines.
    """

    def _worker_service(role: str, gpu_count) -> dict:
        # One entry of spec.services; the GPU limit is emitted as a string.
        return {
            "componentType": "worker",
            "subComponentType": role,
            "replicas": 1,
            "resources": {
                "limits": {"gpu": str(gpu_count)},
            },
        }

    n_prefill = prefill_mapping.get_num_gpus()
    n_decode = decode_mapping.get_num_gpus()

    # Header comments with parallelization and model info.
    comments = [
        "# DynamoGraphDeployment Configuration Preview",
        f"# Prefill: {n_prefill} GPU(s), {prefill_mapping.label()}",
        f"# Decode: {n_decode} GPU(s), {decode_mapping.label()}",
    ]
    if model:
        comments.append(f"# Model: {model}")
    if backend:
        comments.append(f"# Backend: {backend}")
    comments.extend(
        [
            "#",
            "# Full config with planner saved to: config_with_planner.yaml",
        ]
    )

    # DGD-style document showing only the service structure.
    manifest = {
        "apiVersion": "nvidia.com/v1alpha1",
        "kind": "DynamoGraphDeployment",
        "spec": {
            "services": {
                "PrefillWorker": _worker_service("prefill", n_prefill),
                "DecodeWorker": _worker_service("decode", n_decode),
            }
        },
    }
    # sort_keys=False keeps the field order written above.
    rendered = yaml.dump(manifest, default_flow_style=False, sort_keys=False)

    return "\n".join(comments) + "\n" + rendered
class PlotType(str, Enum): class PlotType(str, Enum):
"""Enum for the three plot/config types in the WebUI.""" """Enum for the three plot/config types in the WebUI."""
...@@ -53,17 +230,18 @@ def generate_config_data( ...@@ -53,17 +230,18 @@ def generate_config_data(
prefill_data, prefill_data,
decode_data, decode_data,
args, args,
gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
write_to_disk: bool = True, write_to_disk: bool = True,
): ):
""" """
Generate JSON data file for WebUI from profiling results. Generate JSON data file for WebUI from profiling results.
Note: This function computes GPU hours (not cost). The frontend handles
cost calculation when the user provides a GPU cost per hour value.
Args: Args:
prefill_data: PrefillProfileData instance prefill_data: PrefillProfileData instance
decode_data: DecodeProfileData instance decode_data: DecodeProfileData instance
args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir
gpu_cost_per_hour: GPU cost in $/GPU/hour used for cost plot/table
write_to_disk: Whether to write the generated JSON to args.output_dir/webui_data.json write_to_disk: Whether to write the generated JSON to args.output_dir/webui_data.json
Returns: Returns:
...@@ -90,14 +268,12 @@ def generate_config_data( ...@@ -90,14 +268,12 @@ def generate_config_data(
data[PlotType.COST]["chart"][ data[PlotType.COST]["chart"][
"title" "title"
] = f"Cost Per 1000 i{args.isl}o{args.osl} requests" ] = f"GPU Hours Per 1000 i{args.isl}o{args.osl} requests"
# Populate data sections # Populate data sections
populate_prefill_data(data, prefill_data) populate_prefill_data(data, prefill_data, args)
populate_decode_data(data, decode_data) populate_decode_data(data, decode_data, args)
populate_cost_data( populate_cost_data(data, prefill_data, decode_data, args)
data, prefill_data, decode_data, args, gpu_cost_per_hour=gpu_cost_per_hour
)
# Save JSON file (optional) # Save JSON file (optional)
if write_to_disk: if write_to_disk:
...@@ -109,36 +285,7 @@ def generate_config_data( ...@@ -109,36 +285,7 @@ def generate_config_data(
return data return data
def create_gpu_cost_update_handler( def populate_prefill_data(data, prefill_data, args):
*,
prefill_data,
decode_data,
args,
data_dict_ref,
default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
):
"""Create a Gradio change-handler that regenerates profiling JSON when GPU cost changes."""
def update_gpu_cost_per_hour(gpu_cost_per_hour):
try:
gpu_cost = float(gpu_cost_per_hour)
except Exception:
gpu_cost = default_gpu_cost_per_hour
new_data = generate_config_data(
prefill_data,
decode_data,
args,
gpu_cost_per_hour=gpu_cost,
write_to_disk=False,
)
data_dict_ref["data"] = new_data
return json.dumps(new_data)
return update_gpu_cost_per_hour
def populate_prefill_data(data, prefill_data):
"""Populate prefill chart and table data.""" """Populate prefill chart and table data."""
if not prefill_data.num_gpus: if not prefill_data.num_gpus:
return return
...@@ -170,21 +317,29 @@ def populate_prefill_data(data, prefill_data): ...@@ -170,21 +317,29 @@ def populate_prefill_data(data, prefill_data):
# Populate table data # Populate table data
table_data = [] table_data = []
for i, (gpu, ttft, thpt, label) in enumerate( for i, (gpu, ttft, thpt, label, mapping) in enumerate(
zip( zip(
prefill_data.num_gpus, prefill_data.num_gpus,
prefill_data.ttft, prefill_data.ttft,
prefill_data.thpt_per_gpu, prefill_data.thpt_per_gpu,
prefill_data.parallel_mapping_labels, prefill_data.parallel_mapping_labels,
prefill_data.parallel_mappings,
) )
): ):
# TODO: Add actual config YAML data # Generate DGD worker config YAML for display
config_yaml = f"prefill_config_{i}.yaml" config_yaml = generate_dgd_worker_config_yaml(
parallel_mapping=mapping,
engine_type="prefill",
model=getattr(args, "model", None),
backend=getattr(args, "backend", None),
ttft_or_itl=ttft,
thpt_per_gpu=thpt,
)
table_data.append([gpu, round(ttft, 2), round(thpt, 2), config_yaml]) table_data.append([gpu, round(ttft, 2), round(thpt, 2), config_yaml])
data[PlotType.PREFILL]["table"]["data"] = table_data data[PlotType.PREFILL]["table"]["data"] = table_data
def populate_decode_data(data, decode_data): def populate_decode_data(data, decode_data, args):
"""Populate decode chart and table data.""" """Populate decode chart and table data."""
if not decode_data.num_gpus: if not decode_data.num_gpus:
return return
...@@ -219,15 +374,24 @@ def populate_decode_data(data, decode_data): ...@@ -219,15 +374,24 @@ def populate_decode_data(data, decode_data):
# Populate table data # Populate table data
table_data = [] table_data = []
for i, (gpu, itl, thpt, label) in enumerate( for i, (gpu, itl, thpt, label, mapping) in enumerate(
zip( zip(
decode_data.num_gpus, decode_data.num_gpus,
decode_data.itl, decode_data.itl,
decode_data.thpt_per_gpu, decode_data.thpt_per_gpu,
decode_data.parallel_mapping_labels, decode_data.parallel_mapping_labels,
decode_data.parallel_mappings,
) )
): ):
config_yaml = f"decode_config_{i}.yaml" # Generate DGD worker config YAML for display
config_yaml = generate_dgd_worker_config_yaml(
parallel_mapping=mapping,
engine_type="decode",
model=getattr(args, "model", None),
backend=getattr(args, "backend", None),
ttft_or_itl=itl,
thpt_per_gpu=thpt,
)
table_data.append([gpu, round(itl, 2), round(thpt, 2), config_yaml]) table_data.append([gpu, round(itl, 2), round(thpt, 2), config_yaml])
data[PlotType.DECODE]["table"]["data"] = table_data data[PlotType.DECODE]["table"]["data"] = table_data
...@@ -237,9 +401,12 @@ def populate_cost_data( ...@@ -237,9 +401,12 @@ def populate_cost_data(
prefill_data, prefill_data,
decode_data, decode_data,
args, args,
gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
): ):
"""Populate cost chart and table data with pareto-optimal configurations.""" """Populate cost chart and table data with pareto-optimal configurations.
Note: This function computes GPU hours (not cost). The frontend handles
cost calculation when the user provides a GPU cost per hour value.
"""
if not prefill_data.num_gpus or not decode_data.num_gpus: if not prefill_data.num_gpus or not decode_data.num_gpus:
return return
...@@ -266,15 +433,26 @@ def populate_cost_data( ...@@ -266,15 +433,26 @@ def populate_cost_data(
table_idx = 0 table_idx = 0
for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)): for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
# Calculate prefill cost (fixed for this line) # Get prefill config details for this pareto point
prefill_cost = args.isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600 orig_prefill_idx = prefill_pareto_indices[p_idx]
prefill_mapping = prefill_data.parallel_mappings[orig_prefill_idx]
prefill_num_gpus = prefill_mapping.get_num_gpus()
# For each decode config, calculate total cost # Calculate prefill GPU hours per 1000 requests
# GPU hours = (tokens_per_request * num_requests) / (tokens_per_second_per_gpu * 3600) * num_gpus
prefill_gpu_hours = args.isl * 1000 / _p_thpt / 3600 * prefill_num_gpus
# For each decode config, calculate total GPU hours
line_data = [] line_data = []
for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)): for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)):
# Calculate decode cost # Get decode config details for this pareto point
decode_cost = args.osl * 1000 / _d_thpt * gpu_cost_per_hour / 3600 orig_decode_idx = decode_pareto_indices[d_idx]
total_cost = prefill_cost + decode_cost decode_mapping = decode_data.parallel_mappings[orig_decode_idx]
decode_num_gpus = decode_mapping.get_num_gpus()
# Calculate decode GPU hours per 1000 requests (scaled by num_gpus)
decode_gpu_hours = args.osl * 1000 / _d_thpt / 3600 * decode_num_gpus
total_gpu_hours = prefill_gpu_hours + decode_gpu_hours
# X-axis: tokens per user (based on ITL) # X-axis: tokens per user (based on ITL)
tokens_per_user = 1000 / _d_itl tokens_per_user = 1000 / _d_itl
...@@ -282,17 +460,23 @@ def populate_cost_data( ...@@ -282,17 +460,23 @@ def populate_cost_data(
line_data.append( line_data.append(
{ {
"x": round(tokens_per_user, 2), "x": round(tokens_per_user, 2),
"y": round(total_cost, 2), "y": round(total_gpu_hours, 4),
"tableIdx": table_idx, "tableIdx": table_idx,
} }
) )
# Store mapping from cost table row to original indices # Store mapping from cost table row to original indices
orig_prefill_idx = prefill_pareto_indices[p_idx]
orig_decode_idx = decode_pareto_indices[d_idx]
cost_index_mapping[table_idx] = (orig_prefill_idx, orig_decode_idx) cost_index_mapping[table_idx] = (orig_prefill_idx, orig_decode_idx)
# Add to table data # Generate DGD config YAML for display
config_yaml = generate_dgd_config_yaml_for_display(
prefill_mapping=prefill_mapping,
decode_mapping=decode_mapping,
model=getattr(args, "model", None),
backend=getattr(args, "backend", None),
)
# Add to table data (GPU hours, not cost - frontend handles cost conversion)
table_data.append( table_data.append(
[ [
round(_p_ttft, 2), round(_p_ttft, 2),
...@@ -300,8 +484,8 @@ def populate_cost_data( ...@@ -300,8 +484,8 @@ def populate_cost_data(
round(_d_itl, 2), round(_d_itl, 2),
round(_d_thpt, 2), round(_d_thpt, 2),
round(tokens_per_user, 2), round(tokens_per_user, 2),
round(total_cost, 2), round(total_gpu_hours, 4),
f"cost_config_{table_idx}.yaml", # TODO: Add actual config config_yaml,
] ]
) )
table_idx += 1 table_idx += 1
...@@ -338,13 +522,17 @@ def create_selection_handler( ...@@ -338,13 +522,17 @@ def create_selection_handler(
decode_selection: Dict tracking decode selection state decode_selection: Dict tracking decode selection state
Returns: Returns:
Callable: Selection handler function for Gradio Callable: Selection handler function for Gradio that returns a status message
""" """
def handle_selection(selection_json): def handle_selection(selection_json):
"""Handle datapoint selection from table.""" """Handle datapoint selection from table.
Returns:
str: Status message to display in the UI
"""
if not selection_json or selection_json.strip() == "": if not selection_json or selection_json.strip() == "":
return return ""
try: try:
data_dict = data_dict_ref["data"] data_dict = data_dict_ref["data"]
...@@ -366,8 +554,10 @@ def create_selection_handler( ...@@ -366,8 +554,10 @@ def create_selection_handler(
logger.info( logger.info(
f"Cost selection determines: Prefill={prefill_idx}, Decode={decode_idx}" f"Cost selection determines: Prefill={prefill_idx}, Decode={decode_idx}"
) )
# Auto-submit for cost selection # Signal selection complete and put in queue
_selection_complete.set()
selection_queue.put((prefill_idx, decode_idx)) selection_queue.put((prefill_idx, decode_idx))
return f"✅ Configuration selected! Prefill config #{prefill_idx}, Decode config #{decode_idx}. Processing..."
elif plot_type == PlotType.PREFILL: elif plot_type == PlotType.PREFILL:
prefill_selection["idx"] = row_idx prefill_selection["idx"] = row_idx
logger.info(f"Prefill selected: {row_idx}") logger.info(f"Prefill selected: {row_idx}")
...@@ -376,9 +566,11 @@ def create_selection_handler( ...@@ -376,9 +566,11 @@ def create_selection_handler(
logger.info( logger.info(
f"Both selections complete: Prefill={row_idx}, Decode={decode_selection['idx']}" f"Both selections complete: Prefill={row_idx}, Decode={decode_selection['idx']}"
) )
_selection_complete.set()
selection_queue.put((row_idx, decode_selection["idx"])) selection_queue.put((row_idx, decode_selection["idx"]))
return f"✅ Configuration selected! Prefill config #{row_idx}, Decode config #{decode_selection['idx']}. Processing..."
else: else:
logger.info("Waiting for decode selection...") return f"ℹ️ Prefill config #{row_idx} selected. Please select a Decode configuration."
elif plot_type == PlotType.DECODE: elif plot_type == PlotType.DECODE:
decode_selection["idx"] = row_idx decode_selection["idx"] = row_idx
logger.info(f"Decode selected: {row_idx}") logger.info(f"Decode selected: {row_idx}")
...@@ -387,12 +579,17 @@ def create_selection_handler( ...@@ -387,12 +579,17 @@ def create_selection_handler(
logger.info( logger.info(
f"Both selections complete: Prefill={prefill_selection['idx']}, Decode={row_idx}" f"Both selections complete: Prefill={prefill_selection['idx']}, Decode={row_idx}"
) )
_selection_complete.set()
selection_queue.put((prefill_selection["idx"], row_idx)) selection_queue.put((prefill_selection["idx"], row_idx))
return f"✅ Configuration selected! Prefill config #{prefill_selection['idx']}, Decode config #{row_idx}. Processing..."
else: else:
logger.info("Waiting for prefill selection...") return f"ℹ️ Decode config #{row_idx} selected. Please select a Prefill configuration."
return ""
except Exception as e: except Exception as e:
logger.error(f"Error handling selection: {e}") logger.error(f"Error handling selection: {e}")
return f"❌ Error: {str(e)}"
return handle_selection return handle_selection
...@@ -400,16 +597,12 @@ def create_selection_handler( ...@@ -400,16 +597,12 @@ def create_selection_handler(
def create_gradio_interface( def create_gradio_interface(
json_data_str, json_data_str,
handle_selection, handle_selection,
update_json_data_fn=None,
default_gpu_cost_per_hour: float = DEFAULT_GPU_COST_PER_HOUR,
): ):
"""Create the Gradio interface for configuration selection. """Create the Gradio interface for configuration selection.
Args: Args:
json_data_str: JSON string containing profiling data json_data_str: JSON string containing profiling data
handle_selection: Selection handler function handle_selection: Selection handler function
update_json_data_fn: Optional function that takes (gpu_cost_per_hour) and returns updated JSON string.
default_gpu_cost_per_hour: Default GPU cost per hour used to initialize the input box.
Returns: Returns:
gr.Blocks: Configured Gradio demo gr.Blocks: Configured Gradio demo
...@@ -426,41 +619,49 @@ def create_gradio_interface( ...@@ -426,41 +619,49 @@ def create_gradio_interface(
gr.Markdown("# 📊 Profiling Results - Select Configuration") gr.Markdown("# 📊 Profiling Results - Select Configuration")
# Display any profiling errors/warnings at the top
profiling_errors = get_profiling_errors()
if profiling_errors:
error_text = "\n".join(f"- {err}" for err in profiling_errors)
gr.Markdown(
f"""
<div style="background-color: #fff3cd; border: 1px solid #ffc107; border-radius: 4px; padding: 10px; margin-bottom: 10px;">
<strong>⚠️ Profiling Warnings/Errors:</strong>
{error_text}
</div>
"""
)
gr.Markdown( gr.Markdown(
""" """
**Two ways to select prefill and decode configs:** **Two ways to select prefill and decode configs:**
1. **Cost Analysis** (recommended): Click any row in the Cost Analysis table - automatically determines both prefill and decode 1. **GPU Hours Analysis** (recommended): Select any row in the GPU Hours table - automatically determines both prefill and decode
2. **Individual**: Click one row in the Prefill table AND one row in the Decode table 2. **Individual**: Select one row in the Prefill table AND one row in the Decode table
The selection will be processed automatically once complete. The selection will be processed automatically once complete.
> 📝 **Note:** The dotted red line in the prefill and decode charts are default TTFT and ITL SLAs if not specified. > 📝 **Note:** The dotted red line in the prefill and decode charts are default TTFT and ITL SLAs if not specified.
> ⚠️ **Warning:** The TTFT values here represent the ideal case when requests arrive uniformly, minimizing queueing. Real-world TTFT may be higher than profiling results. To mitigate the issue, planner uses ][correction factors](https://github.com/ai-dynamo/dynamo/blob/main/docs/planner/sla_planner.md#2-correction-factor-calculation) to adjust dynamically at runtime. > ⚠️ **Warning:** The TTFT values here represent the ideal case when requests arrive uniformly, minimizing queueing. Real-world TTFT may be higher than profiling results. To mitigate the issue, planner uses [correction factors](https://github.com/ai-dynamo/dynamo/blob/main/docs/planner/sla_planner.md#2-correction-factor-calculation) to adjust dynamically at runtime.
> 💡 **Tip:** Use the GPU cost checkbox and input in the charts section to convert GPU hours to cost.
""" """
) )
with gr.Row(): # Status message display for selection feedback
gpu_cost_per_hour = gr.Number( selection_status = gr.Markdown(
label="GPU cost per hour ($/GPU/hour)", value="",
value=default_gpu_cost_per_hour, elem_id="selection_status",
minimum=0, )
precision=4,
)
if update_json_data_fn is not None:
gpu_cost_per_hour.change(
fn=update_json_data_fn,
inputs=[gpu_cost_per_hour],
outputs=[json_data],
)
# Performance Results Section (reused from AIC profiling module) # Performance Results Section (reused from AIC profiling module)
create_performance_results_section() create_performance_results_section()
# Handle selection button # Handle selection button - now returns status message
selection_button.click( selection_button.click(
fn=handle_selection, fn=handle_selection,
inputs=[selection_input], inputs=[selection_input],
outputs=[], outputs=[selection_status],
) )
# Trigger visualization when JSON data changes # Trigger visualization when JSON data changes
...@@ -513,6 +714,9 @@ def wait_for_selection(demo, selection_queue, port): ...@@ -513,6 +714,9 @@ def wait_for_selection(demo, selection_queue, port):
logger.info(f"WebUI launched. Waiting for user selection on http://0.0.0.0:{port}") logger.info(f"WebUI launched. Waiting for user selection on http://0.0.0.0:{port}")
logger.info("Please select a row from the Cost Analysis table") logger.info("Please select a row from the Cost Analysis table")
# Reset the selection complete event
_selection_complete.clear()
# Block and wait for selection # Block and wait for selection
try: try:
selected_prefill_idx, selected_decode_idx = selection_queue.get( selected_prefill_idx, selected_decode_idx = selection_queue.get(
...@@ -522,7 +726,12 @@ def wait_for_selection(demo, selection_queue, port): ...@@ -522,7 +726,12 @@ def wait_for_selection(demo, selection_queue, port):
f"User selected: Prefill={selected_prefill_idx}, Decode={selected_decode_idx}" f"User selected: Prefill={selected_prefill_idx}, Decode={selected_decode_idx}"
) )
# Close the demo # Wait for the selection handler to complete and give UI time to show success message
if _selection_complete.wait(timeout=2.0):
# Give extra time for the UI to display the success message
time.sleep(1.0)
# Close the demo gracefully
demo.close() demo.close()
return selected_prefill_idx, selected_decode_idx return selected_prefill_idx, selected_decode_idx
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# For Multimodal EPD (required for device_map="auto" in vision model loading) # For Multimodal EPD (required for device_map="auto" in vision model loading)
accelerate accelerate
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7f7ad5e248f3eaa4a0b74a069095828a4f356e60 aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@0c8f38d354e9138f2cc00efcde66245b3801df1d
aiofiles aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
av==15.0.0 av==15.0.0
......
...@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU) ...@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU) Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
``` ```
#### Interactive Configuration Selection WebUI
When running the profiler with `--pick-with-webui`, an interactive web interface is launched that allows you to visually explore profiling results and manually select configurations.
**Features:**
- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
- **Pareto-Optimal Analysis**: The GPU Hours table shows pareto-optimal configurations balancing latency and throughput
- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
**Selection Methods:**
1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once based on the pareto-optimal combination
2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each
**Example DGD Config Output:**
When you click "Show Config", you'll see a DynamoGraphDeployment configuration like:
```yaml
# DynamoGraphDeployment Configuration Preview
# Prefill: 1 GPU(s), TP=1
# Decode: 4 GPU(s), TP=4
# Model: Qwen/Qwen3-32B-FP8
# Backend: trtllm
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
spec:
services:
PrefillWorker:
subComponentType: prefill
replicas: 1
extraPodSpec:
mainContainer:
args:
- --tensor-parallel-size=1
DecodeWorker:
subComponentType: decode
replicas: 1
extraPodSpec:
mainContainer:
args:
- --tensor-parallel-size=4
```
**Usage:**
```bash
python -m benchmarks.profiler.profile_sla \
--backend trtllm \
--config path/to/disagg.yaml \
--pick-with-webui \
--use-ai-configurator \
--model Qwen/Qwen3-32B-FP8 \
--aic-system h200_sxm \
--ttft 200 --itl 15
```
Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`.
The WebUI launches on port 8000 by default (configurable with `--webui-port`).
#### Output Performance Plots #### Output Performance Plots
The profiler will generate the following plots to better visualize the performance data: The profiler will generate the following plots to better visualize the performance data:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment