Unverified Commit ca0cd3b1 authored by Hongkuan Zhou's avatar Hongkuan Zhou Committed by GitHub
Browse files

feat: add webUI to profiler (#4544)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
parent 0173d5e6
......@@ -50,6 +50,7 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator,
)
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui
from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient,
cleanup_remaining_deployments,
......@@ -476,7 +477,15 @@ async def run_profile(args):
# Safety guards: no results → exit early with a clear message
if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.")
return
if args.pick_with_webui:
# select best P/D config in webUI
selected_prefill_idx, selected_decode_idx = pick_config_with_webui(
prefill_data, decode_data, args
)
else:
# automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
......@@ -485,7 +494,9 @@ async def run_profile(args):
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
i for i, ttft in enumerate(prefill_data.ttft) if ttft <= args.ttft
i
for i, ttft in enumerate(prefill_data.ttft)
if ttft <= args.ttft
]
# Among valid TP sizes, select the one with highest throughput per GPU
valid_thpts = [prefill_data.thpt_per_gpu[i] for i in valid_indices]
......@@ -497,7 +508,9 @@ async def run_profile(args):
# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error("No decode results produced; skipping recommendations.")
logger.error(
"No decode results produced; skipping recommendations."
)
return
if min(decode_data.itl) > args.itl:
logger.warning(
......
......@@ -30,6 +30,10 @@ AIPERF_WARMUP_REQUEST_PER_DP_RANK = 3
AIPERF_PREFILL_BENCHMARK_OSL = 5
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO = 4
# Cost calculation defaults
# TODO: allow user to configure this in GUI
GPU_COST_PER_HOUR = 3.0 # Cost per GPU per hour in dollars
class EngineType(str, Enum):
PREFILL = "prefill"
......
......@@ -4,33 +4,39 @@
def compute_pareto(x, y):
    """
    Compute the pareto front (top-left is better) for the given x and y values.

    Points are visited in order of ascending x (ties broken by descending y)
    and a point is kept only when its y is strictly greater than the y of
    every point visited before it.

    Args:
        x: sequence of x values (smaller is better), or None
        y: sequence of y values (larger is better), or None

    Returns:
        tuple: (xs, ys, indices) where:
            - xs: list of x values on the pareto front, in ascending order
            - ys: list of y values on the pareto front
            - indices: list of original indices corresponding to the pareto points
        Three empty lists are returned when either input is None or empty.

    Raises:
        ValueError: if x and y have different lengths
    """
    # Validate inputs
    if x is None or y is None:
        return [], [], []
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if len(x) == 0:
        return [], [], []

    # Tag every point with its original index, then sort by x asc, y desc so
    # that for equal x the best (largest y) candidate is seen first.
    points = sorted(
        ((px, py, idx) for idx, (px, py) in enumerate(zip(x, y))),
        key=lambda p: (p[0], -p[1]),
    )

    # Single pass to keep only non-dominated points (minimize x, maximize y).
    # Kept points have strictly increasing x and y, so the result is already
    # sorted by x and needs no further sorting.
    xs, ys, indices = [], [], []
    max_y = float("-inf")
    for px, py, idx in points:
        if py > max_y:
            xs.append(px)
            ys.append(py)
            indices.append(idx)
            max_y = py

    return xs, ys, indices
......@@ -21,6 +21,7 @@ import numpy as np
from matplotlib import cm
from scipy.interpolate import griddata
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__)
......@@ -297,13 +298,11 @@ def plot_pd_joint_results(isl, osl, prefill_data, decode_data, output_dir):
decode_data: DecodeProfileData instance containing profiling results
output_dir: directory to save the plot
"""
GPU_COST_PER_HOUR = 3.0 # $3/hour
# compute pareto front for prefill
p_ttft, p_thpt = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
p_ttft, p_thpt, _ = compute_pareto(prefill_data.ttft, prefill_data.thpt_per_gpu)
# compute pareto front for decode
d_itl, d_thpt = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
d_itl, d_thpt, _ = compute_pareto(decode_data.itl, decode_data.thpt_per_gpu)
# convert to cost per thousand requests
p_ttft = np.array(p_ttft)
......
......@@ -3,6 +3,7 @@
import argparse
import ast
import os
from typing import Any, Dict
import yaml
......@@ -84,6 +85,8 @@ def create_profiler_parser() -> argparse.Namespace:
aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dry_run: Boolean (dry run the profile job, default: False)
pick_with_webui: Boolean (pick the best parallelization mapping using webUI, default: False)
webui_port: Int (webUI port, default: $PROFILER_WEBUI_PORT or 8000)
sla:
isl: Int (target input sequence length, default: 3000)
osl: Int (target output sequence length, default: 500)
......@@ -113,6 +116,8 @@ def create_profiler_parser() -> argparse.Namespace:
help="Configuration as Python dict literal, YAML, or JSON string. CLI args override config values. "
"Example: \"{'engine': {'backend': 'vllm', 'config': '/path'}, 'sla': {'isl': 3000}}\"",
)
# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--model",
type=str,
......@@ -126,7 +131,6 @@ def create_profiler_parser() -> argparse.Namespace:
help="Container image to use for DGD components (frontend, planner, workers). Overrides images in config file.",
)
# CLI arguments with config-aware defaults (using nested .get() for cleaner code)
parser.add_argument(
"--namespace",
type=str,
......@@ -233,6 +237,23 @@ def create_profiler_parser() -> argparse.Namespace:
default=config.get("hardware", {}).get("enable_gpu_discovery", False),
help="Enable automatic GPU discovery from Kubernetes cluster nodes. When enabled, overrides any manually specified hardware configuration. Requires cluster-wide node access permissions.",
)
parser.add_argument(
"--pick-with-webui",
action="store_true",
default=config.get("sweep", {}).get("pick_with_webui", False),
help="Pick the best parallelization mapping using webUI",
)
default_webui_port = 8000
webui_port_env = os.environ.get("PROFILER_WEBUI_PORT")
if webui_port_env:
default_webui_port = int(webui_port_env)
parser.add_argument(
"--webui-port",
type=int,
default=config.get("sweep", {}).get("webui_port", default_webui_port),
help="WebUI port",
)
# Dynamically add all planner arguments from planner_argparse.py
add_planner_arguments_to_parser(parser, prefix="planner-")
......
{
"settings": {
"allow_confirm_datapoint": true,
"hide_show_config": true
},
"prefill": {
"chart": {
"labels": [],
"datasets": [
{
"label": "Prefill Performance",
"data": [],
"backgroundColor": "#1f77b4",
"borderColor": "#1f77b4"
}
],
"target_line": {
"value": 0.0,
"label": "Target TTFT: ? ms"
},
"axes": {
"x": {
"title": "Time to First Token (ms)",
"min": 0
},
"y": {
"title": "Prefill Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"TTFT (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"decode": {
"chart": {
"datasets": [],
"target_line": {
"value": 0.0,
"label": "Target ITL: ? ms"
},
"axes": {
"x": {
"title": "Inter Token Latency (ms)",
"min": 0
},
"y": {
"title": "Decode Throughput per GPU (tokens/s/GPU)",
"min": 0
}
}
},
"table": {
"columns": [
"GPUs",
"ITL (ms)",
"Throughput (tokens/s/GPU)",
"Action"
],
"data": []
}
},
"cost": {
"chart": {
"datasets": [],
"axes": {
"x": {
"title": "Tokens per User",
"min": 0
},
"y": {
"title": "Cost ($)",
"min": 0
}
},
"title": "Cost Per 1000 ? requests"
},
"table": {
"columns": [
"TTFT (ms)",
"Prefill Thpt (tokens/s/GPU)",
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
"Action"
],
"data": []
}
}
}
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import os
import queue
from pathlib import Path
from benchmarks.profiler.webui.utils import (
PlotType,
create_gradio_interface,
create_selection_handler,
populate_cost_data,
populate_decode_data,
populate_prefill_data,
wait_for_selection,
)
# Module-level logger for the WebUI configuration picker, forced to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# A dedicated stream handler is attached at import time so this module's
# INFO records are printed with a timestamped format.
# NOTE(review): if the application also configures a root handler, records may
# be emitted twice unless propagation is disabled - confirm.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
# Format: "<timestamp> - <logger name> - <level> - <message>", second precision.
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
def generate_config_data(prefill_data, decode_data, args):
    """
    Generate JSON data file for WebUI from profiling results.

    The template ``data_template.json`` located next to this module is loaded,
    filled with the SLA targets and the prefill/decode/cost sections, and
    written to ``<args.output_dir>/webui_data.json``.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets (ttft, itl, isl, osl) and output_dir

    Returns:
        dict: the populated data that was written to the JSON file,
        see https://github.com/ai-dynamo/aiconfigurator/blob/main/src/aiconfigurator/webapp/components/profiling/standalone/sample_profiling_data.json for more details
    """
    # Load template
    template_path = Path(__file__).parent / "data_template.json"
    with open(template_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Construct output path
    output_path = os.path.join(args.output_dir, "webui_data.json")

    # Set SLA targets
    prefill_target = data[PlotType.PREFILL]["chart"]["target_line"]
    prefill_target["value"] = args.ttft
    prefill_target["label"] = f"Target TTFT: {args.ttft} ms"

    decode_target = data[PlotType.DECODE]["chart"]["target_line"]
    decode_target["value"] = args.itl
    decode_target["label"] = f"Target ITL: {args.itl} ms"

    data[PlotType.COST]["chart"][
        "title"
    ] = f"Cost Per 1000 i{args.isl}o{args.osl} requests"

    # Populate data sections
    populate_prefill_data(data, prefill_data)
    populate_decode_data(data, decode_data)
    populate_cost_data(data, prefill_data, decode_data, args)

    # Save JSON file. An empty output_dir means "current directory";
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    logger.info(f"Generated WebUI config data at {output_path}")
    return data
def pick_config_with_webui(prefill_data, decode_data, args):
    """
    Let the user choose the prefill/decode configuration through the WebUI.

    The WebUI data file is generated first, read back as text, and handed to
    a Gradio interface; the call then blocks until a selection is reported.

    Args:
        prefill_data: PrefillProfileData instance
        decode_data: DecodeProfileData instance
        args: Arguments containing SLA targets, output_dir and webui_port

    Returns:
        tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
    """
    # Write the data file, then read the exact text that was persisted so the
    # UI and the selection handler work from the same content.
    generate_config_data(prefill_data, decode_data, args)
    webui_json_path = os.path.join(args.output_dir, "webui_data.json")
    with open(webui_json_path, "r") as fh:
        raw_json = fh.read()
    parsed = json.loads(raw_json)

    logger.info(f"Launching WebUI on port {args.webui_port}...")

    # Channel used by the UI callback to hand the final pick to this thread
    picks: queue.Queue[tuple[int | None, int | None]] = queue.Queue()

    # Mutable holders for the separate prefill and decode choices
    chosen_prefill = {"idx": None}
    chosen_decode = {"idx": None}

    # Wire the callback into the interface and block until a pick arrives
    on_select = create_selection_handler(
        parsed, picks, chosen_prefill, chosen_decode
    )
    ui = create_gradio_interface(raw_json, on_select)
    return wait_for_selection(ui, picks, args.webui_port)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import queue
import threading
from enum import Enum
import gradio as gr
import numpy as np
from aiconfigurator.webapp.components.profiling import (
create_performance_results_section,
create_profiling_ui_components,
inject_profiling_assets,
load_profiling_javascript,
)
from benchmarks.profiler.utils.defaults import GPU_COST_PER_HOUR
from benchmarks.profiler.utils.pareto import compute_pareto
logger = logging.getLogger(__name__)
class PlotType(str, Enum):
    """Names the three sections (plots and tables) shown in the WebUI.

    Every member is also a ``str``, so it compares equal to its plain string
    value and can be used to index dictionaries keyed by those strings.
    """

    # Prefill section
    PREFILL = "prefill"
    # Decode section
    DECODE = "decode"
    # Combined prefill + decode cost section
    COST = "cost"
# Color palette for chart datasets. Users of this list index it modulo its
# length, so colors repeat once there are more datasets than entries.
# TODO: handle case with more than 8 lines
CHART_COLORS = [
"#1f77b4", # blue
"#ff7f0e", # orange
"#2ca02c", # green
"#d62728", # red
"#9467bd", # purple
"#8c564b", # brown
"#e377c2", # pink
"#7f7f7f", # gray
]
# Maximum time, in seconds, that wait_for_selection blocks on the selection
# queue before giving up (3600 s = 1 hour).
# TODO: is this too long?
WEB_UI_SELECTION_TIMEOUT = 3600
def populate_prefill_data(data, prefill_data):
    """Fill the prefill section of ``data`` (chart labels, points and table rows).

    ``data`` is modified in place. Nothing is changed when ``prefill_data``
    holds no results.
    """
    if not prefill_data.num_gpus:
        return

    def _rows():
        # One (row index, (gpu, ttft, thpt, label)) entry per profiled config
        return enumerate(
            zip(
                prefill_data.num_gpus,
                prefill_data.ttft,
                prefill_data.thpt_per_gpu,
                prefill_data.parallel_mapping_labels,
            )
        )

    # Chart labels: one per distinct GPU count, ascending
    gpu_counts = sorted(set(prefill_data.num_gpus))
    data[PlotType.PREFILL]["chart"]["labels"] = [f"{gpu} GPUs" for gpu in gpu_counts]

    # Chart points: each one carries the index of its table row
    data[PlotType.PREFILL]["chart"]["datasets"][0]["data"] = [
        {
            "x": round(ttft, 2),
            "y": round(thpt, 2),
            "gpu": gpu,
            "tableIdx": row,
            "gpuLabel": f"{gpu} GPUs [{label}]",
        }
        for row, (gpu, ttft, thpt, label) in _rows()
    ]

    # Table rows, in the same order as the chart points
    # TODO: Add actual config YAML data
    data[PlotType.PREFILL]["table"]["data"] = [
        [gpu, round(ttft, 2), round(thpt, 2), f"prefill_config_{row}.yaml"]
        for row, (gpu, ttft, thpt, _) in _rows()
    ]
def populate_decode_data(data, decode_data):
    """Fill the decode section of ``data`` (one chart dataset per GPU count, plus table rows).

    ``data`` is modified in place. Nothing is changed when ``decode_data``
    holds no results.
    """
    if not decode_data.num_gpus:
        return

    def _rows():
        # One (row index, (gpu, itl, thpt, label)) entry per profiled config
        return enumerate(
            zip(
                decode_data.num_gpus,
                decode_data.itl,
                decode_data.thpt_per_gpu,
                decode_data.parallel_mapping_labels,
            )
        )

    # Bucket the chart points by GPU count so each count becomes its own dataset
    points_by_gpu: dict[int, list[dict[str, float | int]]] = {}
    for row, (gpu, itl, thpt, _) in _rows():
        points_by_gpu.setdefault(gpu, []).append(
            {"x": round(itl, 2), "y": round(thpt, 2), "tableIdx": row}
        )

    # One dataset per GPU count (ascending), cycling through the palette
    palette_size = len(CHART_COLORS)
    data[PlotType.DECODE]["chart"]["datasets"] = [
        {
            "label": f"{gpu} GPUs",
            "data": points,
            "backgroundColor": CHART_COLORS[pos % palette_size],
            "borderColor": CHART_COLORS[pos % palette_size],
        }
        for pos, (gpu, points) in enumerate(sorted(points_by_gpu.items()))
    ]

    # Table rows, in the original result order
    data[PlotType.DECODE]["table"]["data"] = [
        [gpu, round(itl, 2), round(thpt, 2), f"decode_config_{row}.yaml"]
        for row, (gpu, itl, thpt, _) in _rows()
    ]
def populate_cost_data(data, prefill_data, decode_data, args):
    """Populate cost chart and table data with pareto-optimal configurations.

    Every pareto-optimal prefill config is paired with every pareto-optimal
    decode config. Each prefill config becomes one chart line; each pairing
    becomes one chart point and one table row. The mapping from table row to
    the original (prefill, decode) result indices is stored under
    ``data["cost"]["index_mapping"]`` with string keys.

    ``data`` is modified in place. Nothing is changed when either input holds
    no results.

    Args:
        data: WebUI data dict to fill
        prefill_data: profiling results providing num_gpus, ttft, thpt_per_gpu
        decode_data: profiling results providing num_gpus, itl, thpt_per_gpu
        args: Arguments providing isl and osl
    """
    if not prefill_data.num_gpus or not decode_data.num_gpus:
        return

    # Compute pareto front for prefill (minimize TTFT, maximize throughput)
    p_ttft, p_thpt, prefill_pareto_indices = compute_pareto(
        prefill_data.ttft, prefill_data.thpt_per_gpu
    )

    # Compute pareto front for decode (minimize ITL, maximize throughput)
    d_itl, d_thpt, decode_pareto_indices = compute_pareto(
        decode_data.itl, decode_data.thpt_per_gpu
    )

    # Convert to float arrays. Forcing the dtype matters: an all-integer input
    # would otherwise yield numpy integer scalars, which round() leaves as
    # numpy integers and which the json module cannot serialize. numpy floats
    # subclass Python float and serialize fine.
    p_ttft = np.asarray(p_ttft, dtype=float)
    p_thpt = np.asarray(p_thpt, dtype=float)
    d_itl = np.asarray(d_itl, dtype=float)
    d_thpt = np.asarray(d_thpt, dtype=float)

    # Generate cost datasets - one line per prefill config
    cost_datasets = []
    table_data = []
    cost_index_mapping = {}  # Map cost table row idx -> (prefill_idx, decode_idx)
    table_idx = 0

    for p_idx, (_p_ttft, _p_thpt) in enumerate(zip(p_ttft, p_thpt)):
        # Prefill cost of 1000 requests (fixed for this line):
        # tokens / (tokens/s/GPU) = GPU-seconds, priced at GPU_COST_PER_HOUR
        prefill_cost = args.isl * 1000 / _p_thpt * GPU_COST_PER_HOUR / 3600

        # For each decode config, calculate total cost
        line_data = []
        for d_idx, (_d_itl, _d_thpt) in enumerate(zip(d_itl, d_thpt)):
            # Decode cost of 1000 requests, same pricing as above
            decode_cost = args.osl * 1000 / _d_thpt * GPU_COST_PER_HOUR / 3600
            total_cost = prefill_cost + decode_cost

            # X-axis: tokens per second per user (ITL is in milliseconds)
            tokens_per_user = 1000 / _d_itl

            line_data.append(
                {
                    "x": round(tokens_per_user, 2),
                    "y": round(total_cost, 2),
                    "tableIdx": table_idx,
                }
            )

            # Store mapping from cost table row to original indices
            orig_prefill_idx = prefill_pareto_indices[p_idx]
            orig_decode_idx = decode_pareto_indices[d_idx]
            cost_index_mapping[table_idx] = (orig_prefill_idx, orig_decode_idx)

            # Add to table data
            table_data.append(
                [
                    round(_p_ttft, 2),
                    round(_p_thpt, 2),
                    round(_d_itl, 2),
                    round(_d_thpt, 2),
                    round(tokens_per_user, 2),
                    round(total_cost, 2),
                    f"cost_config_{table_idx}.yaml",  # TODO: Add actual config
                ]
            )
            table_idx += 1

        # Create dataset for this prefill config (palette repeats when exhausted)
        color = CHART_COLORS[p_idx % len(CHART_COLORS)]
        cost_datasets.append(
            {
                "label": f"TTFT: {_p_ttft:.2f}ms",
                "data": line_data,
                "backgroundColor": color,
                "borderColor": color,
            }
        )

    data[PlotType.COST]["chart"]["datasets"] = cost_datasets
    data[PlotType.COST]["table"]["data"] = table_data

    # Store the index mapping in the JSON for reference; keys are strings
    # because JSON object keys must be strings.
    data[PlotType.COST]["index_mapping"] = {
        str(k): list(v) for k, v in cost_index_mapping.items()
    }
def create_selection_handler(
    data_dict, selection_queue, prefill_selection, decode_selection
):
    """Build the callback that processes row selections coming from the WebUI.

    Args:
        data_dict: Parsed JSON data containing cost index mapping
        selection_queue: Queue to communicate selections to main thread
        prefill_selection: Dict tracking prefill selection state (key "idx")
        decode_selection: Dict tracking decode selection state (key "idx")

    Returns:
        Callable: Selection handler function for Gradio
    """

    def handle_selection(selection_json):
        """Process one selection event.

        A cost-row selection is submitted immediately as a (prefill, decode)
        pair. Prefill and decode selections are remembered and submitted once
        both are known. Any error is logged rather than raised.
        """
        # Nothing to do for a missing or blank payload
        if not selection_json or selection_json.strip() == "":
            return

        try:
            payload = json.loads(selection_json)
            plot_type = payload.get("plotType")
            row_idx = payload.get("rowIndex")
            logger.info(f"Selection received: {plot_type}, row {row_idx}")

            if plot_type == PlotType.COST:
                # A cost row maps back to original prefill/decode indices
                mapping = data_dict[PlotType.COST].get("index_mapping", {})
                entry = mapping.get(str(row_idx))
                if not entry:
                    return
                prefill_idx, decode_idx = entry
                if prefill_idx is None or decode_idx is None:
                    return
                logger.info(
                    f"Cost selection determines: Prefill={prefill_idx}, Decode={decode_idx}"
                )
                # Auto-submit for cost selection
                selection_queue.put((prefill_idx, decode_idx))
                return

            if plot_type == PlotType.PREFILL:
                prefill_selection["idx"] = row_idx
                logger.info(f"Prefill selected: {row_idx}")
                # Submit only when the decode side has been chosen as well
                decode_idx = decode_selection["idx"]
                if decode_idx is None:
                    logger.info("Waiting for decode selection...")
                    return
                logger.info(
                    f"Both selections complete: Prefill={row_idx}, Decode={decode_idx}"
                )
                selection_queue.put((row_idx, decode_idx))
                return

            if plot_type == PlotType.DECODE:
                decode_selection["idx"] = row_idx
                logger.info(f"Decode selected: {row_idx}")
                # Submit only when the prefill side has been chosen as well
                prefill_idx = prefill_selection["idx"]
                if prefill_idx is None:
                    logger.info("Waiting for prefill selection...")
                    return
                logger.info(
                    f"Both selections complete: Prefill={prefill_idx}, Decode={row_idx}"
                )
                selection_queue.put((prefill_idx, row_idx))
        except Exception as e:
            logger.error(f"Error handling selection: {e}")

    return handle_selection
def create_gradio_interface(json_data_str, handle_selection):
    """Create the Gradio interface for configuration selection.

    Args:
        json_data_str: JSON string containing profiling data
        handle_selection: Selection handler function, called with the content
            of the selection input when the selection button is clicked

    Returns:
        gr.Blocks: Configured Gradio demo
    """
    with gr.Blocks(title="Configuration Selection") as demo:
        # Create hidden UI components (reused from AIC profiling module)
        ui_components = create_profiling_ui_components()
        selection_input = ui_components["selection_input"]
        selection_button = ui_components["selection_button"]
        json_data = ui_components["json_data"]

        # Inject CSS and modal (reused from AIC profiling module)
        inject_profiling_assets()

        gr.Markdown("# 📊 Profiling Results - Select Configuration")
        # The help text is spelled out line by line. The "correction factors"
        # link is written as [text](url); a stray "]" in front of it would
        # stop it from rendering as a link.
        gr.Markdown(
            "\n"
            "**Two ways to select prefill and decode configs:**\n"
            "1. **Cost Analysis** (recommended): Click any row in the Cost Analysis table - automatically determines both prefill and decode\n"
            "2. **Individual**: Click one row in the Prefill table AND one row in the Decode table\n"
            "The selection will be processed automatically once complete.\n"
            "> 📝 **Note:** The dotted red line in the prefill and decode charts are default TTFT and ITL SLAs if not specified.\n"
            "> ⚠️ **Warning:** The TTFT values here represent the ideal case when requests arrive uniformly, minimizing queueing. Real-world TTFT may be higher than profiling results. To mitigate the issue, planner uses [correction factors](https://github.com/ai-dynamo/dynamo/blob/main/docs/planner/sla_planner.md#2-correction-factor-calculation) to adjust dynamically at runtime.\n"
        )

        # Performance Results Section (reused from AIC profiling module)
        create_performance_results_section()

        # Handle selection button
        selection_button.click(
            fn=handle_selection,
            inputs=[selection_input],
            outputs=[],
        )

        # Trigger visualization when JSON data changes (browser-side only:
        # no Python callback, just the JavaScript hook)
        json_data.change(
            fn=None,
            inputs=[json_data],
            outputs=[],
            js=(
                "(data) => { if (data && data.trim() && window.initializeVisualizations) "
                "window.initializeVisualizations(data); }"
            ),
        )

        # Load JavaScript and data automatically on page load
        def load_data():
            """Load profiling data."""
            return json_data_str

        demo.load(
            fn=load_data, inputs=[], outputs=[json_data], js=load_profiling_javascript()
        )

    return demo
def wait_for_selection(demo, selection_queue, port):
    """Launch the demo and wait for user selection.

    The demo is launched from a daemon thread and this function blocks on
    ``selection_queue`` for at most WEB_UI_SELECTION_TIMEOUT seconds. The demo
    is closed before returning, whether a selection arrived, the wait timed
    out, or the wait was interrupted by another exception.

    Args:
        demo: Gradio demo instance
        selection_queue: Queue to receive selection from UI
        port: Port number for the WebUI

    Returns:
        tuple[int, int]: (selected_prefill_idx, selected_decode_idx);
        (0, 0) when no selection arrives before the timeout
    """

    # Launch the interface in a separate thread
    def launch_thread():
        demo.launch(
            server_name="0.0.0.0",
            server_port=port,
            share=False,
            prevent_thread_lock=True,
        )

    thread = threading.Thread(target=launch_thread, daemon=True)
    thread.start()

    logger.info(f"WebUI launched. Waiting for user selection on http://0.0.0.0:{port}")
    logger.info("Please select a row from the Cost Analysis table")

    # Block and wait for selection. The finally clause guarantees the demo is
    # closed even if the wait is interrupted (e.g. by KeyboardInterrupt).
    try:
        selected_prefill_idx, selected_decode_idx = selection_queue.get(
            timeout=WEB_UI_SELECTION_TIMEOUT
        )
    except queue.Empty:
        # The message wording assumes WEB_UI_SELECTION_TIMEOUT is 3600 seconds
        logger.error("Selection timeout - no selection made within 1 hour")
        # Return default
        return 0, 0
    else:
        logger.info(
            f"User selected: Prefill={selected_prefill_idx}, Decode={selected_decode_idx}"
        )
        return selected_prefill_idx, selected_decode_idx
    finally:
        # Close the demo
        demo.close()
......@@ -40,7 +40,7 @@ classifiers = [
]
dependencies = [
"aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@release/0.4.0",
"aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@bdc142609b97c23a298115f09a9f88ae143f48d8",
"networkx",
"pandas",
"pydantic>=2",
......
......@@ -11,9 +11,9 @@
# maximum versions available on different platforms (x86_64 vs aarch64, different CUDA versions)
# For Multimodal EPD (required for device_map="auto" in vision model loading)
accelerate==1.12.0
aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
aiofiles==24.1.0
accelerate
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@bdc142609b97c23a298115f09a9f88ae143f48d8
aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
av==15.0.0
fastapi==0.120.1
......
......@@ -66,6 +66,7 @@ class TestProfileSlaAiconfigurator:
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
# Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo(
model_size=16384.0,
......
......@@ -73,6 +73,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
# Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo(
model_size=16384.0,
......@@ -116,6 +117,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
......@@ -180,6 +182,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
......@@ -233,6 +236,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 8
self.deploy_after_profile = False
self.pick_with_webui = False
self.model_info = ModelInfo(
model_size=65536.0,
architecture="TestMoEArchitecture",
......@@ -309,6 +313,7 @@ class TestProfileSLADryRun:
# Set to 0 to trigger auto-generation path
self.num_gpus_per_node = 0
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
return Args()
......@@ -376,6 +381,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 0
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
return Args()
......@@ -443,6 +449,7 @@ class TestProfileSLADryRun:
self.aic_backend_version = None
self.num_gpus_per_node = 0
self.deploy_after_profile = False
self.pick_with_webui = False
self.enable_gpu_discovery = True
return Args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment