Unverified commit 6b5842ee, authored by hhzhang16, committed by GitHub
Browse files

feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes,...


feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes, preview configs (#4968)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 74fcd4a9
......@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator,
)
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui
from benchmarks.profiler.webui.select_config import (
add_profiling_error,
clear_profiling_errors,
pick_config_with_webui,
)
from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient,
cleanup_remaining_deployments,
......@@ -131,6 +135,9 @@ async def run_profile(args):
# List to track all created deployment clients for cleanup in case of failure
deployment_clients = []
# Clear any errors from previous profiling runs
clear_profiling_errors()
# Inherit aic_backend from backend if not explicitly set
if not args.aic_backend:
args.aic_backend = args.backend
......@@ -476,7 +483,9 @@ async def run_profile(args):
logger.info("Analyzing results and generate recommendations...")
# Safety guards: no results → exit early with a clear message
if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.")
error_msg = "No prefill results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
return
if args.pick_with_webui:
......@@ -488,9 +497,9 @@ async def run_profile(args):
# automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft:
logger.warning(
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
)
warning_msg = "No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
logger.warning(warning_msg)
add_profiling_error(warning_msg)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else:
valid_indices = [
......@@ -508,14 +517,14 @@ async def run_profile(args):
# select best parallel mapping for decode
if not decode_data.num_gpus:
logger.error(
"No decode results produced; skipping recommendations."
)
error_msg = "No decode results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
return
if min(decode_data.itl) > args.itl:
logger.warning(
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
)
warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
logger.warning(warning_msg)
add_profiling_error(warning_msg)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else:
valid_indices = [
......
......@@ -76,11 +76,11 @@
"min": 0
},
"y": {
"title": "Cost ($)",
"title": "GPU Hours",
"min": 0
}
},
"title": "Cost Per 1000 ? requests"
"title": "GPU Hours Per 1000 ? requests"
},
"table": {
"columns": [
......@@ -89,7 +89,7 @@
"ITL (ms)",
"Decode Thpt (tokens/s/GPU)",
"Tokens/User",
"Cost ($)",
"GPU Hours",
"Action"
],
"data": []
......
......@@ -5,15 +5,18 @@ import json
import logging
import queue
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.webui.utils import (
create_gpu_cost_update_handler,
add_profiling_error,
clear_profiling_errors,
create_gradio_interface,
create_selection_handler,
generate_config_data,
wait_for_selection,
)
# Re-export for use by profiler modules
__all__ = ["pick_config_with_webui", "add_profiling_error", "clear_profiling_errors"]
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
......@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
Returns:
tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
"""
# Generate JSON data (also writes default JSON file for convenience)
# Note: Don't clear profiling errors here - they should be accumulated
# during the profiling run and displayed in the WebUI.
# clear_profiling_errors() should be called at the start of a new profiling run.
# Generate JSON data with GPU hours (frontend handles cost conversion)
data_dict = generate_config_data(
prefill_data,
decode_data,
args,
gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
write_to_disk=True,
)
json_data_str = json.dumps(data_dict)
......@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args):
handle_selection = create_selection_handler(
data_dict_ref, selection_queue, prefill_selection, decode_selection
)
update_gpu_cost_per_hour = create_gpu_cost_update_handler(
prefill_data=prefill_data,
decode_data=decode_data,
args=args,
data_dict_ref=data_dict_ref,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
# Note: GPU hours -> Cost conversion is handled by frontend JavaScript (gpu_cost_toggle.js)
demo = create_gradio_interface(
json_data_str,
handle_selection,
update_json_data_fn=update_gpu_cost_per_hour,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
return wait_for_selection(demo, selection_queue, args.webui_port)
This diff is collapsed.
......@@ -12,7 +12,7 @@
# For Multimodal EPD (required for device_map="auto" in vision model loading)
accelerate
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7f7ad5e248f3eaa4a0b74a069095828a4f356e60
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@0c8f38d354e9138f2cc00efcde66245b3801df1d
aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
av==15.0.0
......
......@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
```
#### Interactive Configuration Selection WebUI
When you run the profiler with `--pick-with-webui`, it launches an interactive web interface in which you can visually explore the profiling results and manually select configurations.
**Features:**
- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
- **Pareto-Optimal Analysis**: The GPU Hours table shows the Pareto-optimal configurations that balance latency and throughput
- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
**Selection Methods:**
1. **GPU Hours Table** (recommended): Click any row to select the prefill and decode configurations together, using that row's Pareto-optimal combination
2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to choose each configuration manually
**Example DGD Config Output:**
When you click "Show Config", you'll see a DynamoGraphDeployment configuration like the following:
```yaml
# DynamoGraphDeployment Configuration
# Prefill: 1 GPU(s), TP=1
# Decode: 4 GPU(s), TP=4
# Model: Qwen/Qwen3-32B-FP8
# Backend: trtllm
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
spec:
services:
PrefillWorker:
subComponentType: prefill
replicas: 1
extraPodSpec:
mainContainer:
args:
- --tensor-parallel-size=1
DecodeWorker:
subComponentType: decode
replicas: 1
extraPodSpec:
mainContainer:
args:
- --tensor-parallel-size=4
```
**Usage:**
```bash
python -m benchmarks.profiler.profile_sla \
--backend trtllm \
--config path/to/disagg.yaml \
--pick-with-webui \
--use-ai-configurator \
--model Qwen/Qwen3-32B-FP8 \
--aic-system h200_sxm \
--ttft 200 --itl 15
```
Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`.
The WebUI launches on port 8000 by default (configurable with `--webui-port`).
#### Output Performance Plots
The profiler will generate the following plots to better visualize the performance data:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment