Unverified commit 6b5842ee, authored by hhzhang16, committed by GitHub
Browse files

feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes,...


feat: Profiler WebUI improvements -- error handling, GPU hours, style fixes, preview configs (#4968)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 74fcd4a9
...@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import ( ...@@ -50,7 +50,11 @@ from benchmarks.profiler.utils.profile_prefill import (
profile_prefill_aiconfigurator, profile_prefill_aiconfigurator,
) )
from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser from benchmarks.profiler.utils.profiler_argparse import create_profiler_parser
from benchmarks.profiler.webui.select_config import pick_config_with_webui from benchmarks.profiler.webui.select_config import (
add_profiling_error,
clear_profiling_errors,
pick_config_with_webui,
)
from deploy.utils.dynamo_deployment import ( from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient, DynamoDeploymentClient,
cleanup_remaining_deployments, cleanup_remaining_deployments,
...@@ -131,6 +135,9 @@ async def run_profile(args): ...@@ -131,6 +135,9 @@ async def run_profile(args):
# List to track all created deployment clients for cleanup in case of failure # List to track all created deployment clients for cleanup in case of failure
deployment_clients = [] deployment_clients = []
# Clear any errors from previous profiling runs
clear_profiling_errors()
# Inherit aic_backend from backend if not explicitly set # Inherit aic_backend from backend if not explicitly set
if not args.aic_backend: if not args.aic_backend:
args.aic_backend = args.backend args.aic_backend = args.backend
...@@ -476,7 +483,9 @@ async def run_profile(args): ...@@ -476,7 +483,9 @@ async def run_profile(args):
logger.info("Analyzing results and generate recommendations...") logger.info("Analyzing results and generate recommendations...")
# Safety guards: no results → exit early with a clear message # Safety guards: no results → exit early with a clear message
if not prefill_data.num_gpus: if not prefill_data.num_gpus:
logger.error("No prefill results produced; skipping recommendations.") error_msg = "No prefill results produced; skipping recommendations."
logger.error(error_msg)
add_profiling_error(error_msg)
return return
if args.pick_with_webui: if args.pick_with_webui:
...@@ -488,9 +497,9 @@ async def run_profile(args): ...@@ -488,9 +497,9 @@ async def run_profile(args):
# automatically select P/D config within SLA with the highest throughput/GPU # automatically select P/D config within SLA with the highest throughput/GPU
# select best parallel mapping for prefill # select best parallel mapping for prefill
if min(prefill_data.ttft) > args.ttft: if min(prefill_data.ttft) > args.ttft:
logger.warning( warning_msg = "No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware"
"No engine configuration satisfies the TTFT requirement, please try a smaller model or more powerful hardware" logger.warning(warning_msg)
) add_profiling_error(warning_msg)
selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft))) selected_prefill_idx = int(np.argmin(np.array(prefill_data.ttft)))
else: else:
valid_indices = [ valid_indices = [
...@@ -508,14 +517,14 @@ async def run_profile(args): ...@@ -508,14 +517,14 @@ async def run_profile(args):
# select best parallel mapping for decode # select best parallel mapping for decode
if not decode_data.num_gpus: if not decode_data.num_gpus:
logger.error( error_msg = "No decode results produced; skipping recommendations."
"No decode results produced; skipping recommendations." logger.error(error_msg)
) add_profiling_error(error_msg)
return return
if min(decode_data.itl) > args.itl: if min(decode_data.itl) > args.itl:
logger.warning( warning_msg = "No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware"
"No engine configuration satisfies the ITL requirement, please try a smaller model or more powerful hardware" logger.warning(warning_msg)
) add_profiling_error(warning_msg)
selected_decode_idx = int(np.argmin(np.array(decode_data.itl))) selected_decode_idx = int(np.argmin(np.array(decode_data.itl)))
else: else:
valid_indices = [ valid_indices = [
......
...@@ -76,11 +76,11 @@ ...@@ -76,11 +76,11 @@
"min": 0 "min": 0
}, },
"y": { "y": {
"title": "Cost ($)", "title": "GPU Hours",
"min": 0 "min": 0
} }
}, },
"title": "Cost Per 1000 ? requests" "title": "GPU Hours Per 1000 ? requests"
}, },
"table": { "table": {
"columns": [ "columns": [
...@@ -89,7 +89,7 @@ ...@@ -89,7 +89,7 @@
"ITL (ms)", "ITL (ms)",
"Decode Thpt (tokens/s/GPU)", "Decode Thpt (tokens/s/GPU)",
"Tokens/User", "Tokens/User",
"Cost ($)", "GPU Hours",
"Action" "Action"
], ],
"data": [] "data": []
......
...@@ -5,15 +5,18 @@ import json ...@@ -5,15 +5,18 @@ import json
import logging import logging
import queue import queue
from benchmarks.profiler.utils.defaults import DEFAULT_GPU_COST_PER_HOUR
from benchmarks.profiler.webui.utils import ( from benchmarks.profiler.webui.utils import (
create_gpu_cost_update_handler, add_profiling_error,
clear_profiling_errors,
create_gradio_interface, create_gradio_interface,
create_selection_handler, create_selection_handler,
generate_config_data, generate_config_data,
wait_for_selection, wait_for_selection,
) )
# Re-export for use by profiler modules
__all__ = ["pick_config_with_webui", "add_profiling_error", "clear_profiling_errors"]
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler() console_handler = logging.StreamHandler()
...@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -37,12 +40,15 @@ def pick_config_with_webui(prefill_data, decode_data, args):
Returns: Returns:
tuple[int, int]: (selected_prefill_idx, selected_decode_idx) tuple[int, int]: (selected_prefill_idx, selected_decode_idx)
""" """
# Generate JSON data (also writes default JSON file for convenience) # Note: Don't clear profiling errors here - they should be accumulated
# during the profiling run and displayed in the WebUI.
# clear_profiling_errors() should be called at the start of a new profiling run.
# Generate JSON data with GPU hours (frontend handles cost conversion)
data_dict = generate_config_data( data_dict = generate_config_data(
prefill_data, prefill_data,
decode_data, decode_data,
args, args,
gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
write_to_disk=True, write_to_disk=True,
) )
json_data_str = json.dumps(data_dict) json_data_str = json.dumps(data_dict)
...@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args): ...@@ -61,19 +67,11 @@ def pick_config_with_webui(prefill_data, decode_data, args):
handle_selection = create_selection_handler( handle_selection = create_selection_handler(
data_dict_ref, selection_queue, prefill_selection, decode_selection data_dict_ref, selection_queue, prefill_selection, decode_selection
) )
update_gpu_cost_per_hour = create_gpu_cost_update_handler(
prefill_data=prefill_data,
decode_data=decode_data,
args=args,
data_dict_ref=data_dict_ref,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
)
# Note: GPU hours -> Cost conversion is handled by frontend JavaScript (gpu_cost_toggle.js)
demo = create_gradio_interface( demo = create_gradio_interface(
json_data_str, json_data_str,
handle_selection, handle_selection,
update_json_data_fn=update_gpu_cost_per_hour,
default_gpu_cost_per_hour=DEFAULT_GPU_COST_PER_HOUR,
) )
return wait_for_selection(demo, selection_queue, args.webui_port) return wait_for_selection(demo, selection_queue, args.webui_port)
This diff is collapsed.
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# For Multimodal EPD (required for device_map="auto" in vision model loading) # For Multimodal EPD (required for device_map="auto" in vision model loading)
accelerate accelerate
aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@7f7ad5e248f3eaa4a0b74a069095828a4f356e60 aiconfigurator[webapp] @ git+https://github.com/ai-dynamo/aiconfigurator.git@0c8f38d354e9138f2cc00efcde66245b3801df1d
aiofiles aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f aiperf @ git+https://github.com/ai-dynamo/aiperf.git@4d3fa29403c8f75da22a14f1f7b3aeb27db9288f
av==15.0.0 av==15.0.0
......
...@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU) ...@@ -170,6 +170,67 @@ Suggested prefill TP:4 (TTFT 48.37 ms, throughput 15505.23 tokens/s/GPU)
Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU) Suggested decode TP:4 (ITL 4.83 ms, throughput 51.22 tokens/s/GPU)
``` ```
#### Interactive Configuration Selection WebUI
When running the profiler with `--pick-with-webui`, an interactive web interface is launched that allows you to visually explore profiling results and manually select configurations.
**Features:**
- **Interactive Charts**: Visualize prefill TTFT, decode ITL, and GPU hours analysis with hover-to-highlight synchronization between charts and tables
- **Pareto-Optimal Analysis**: The GPU Hours table shows Pareto-optimal configurations balancing latency and throughput
- **DGD Config Preview**: Click "Show Config" on any row to view the corresponding DynamoGraphDeployment YAML
- **GPU Cost Estimation**: Toggle GPU cost display to convert GPU hours to cost ($/1000 requests)
- **SLA Visualization**: Red dashed lines indicate your TTFT and ITL targets
**Selection Methods:**
1. **GPU Hours Table** (recommended): Click any row to select both prefill and decode configurations at once based on the Pareto-optimal combination
2. **Individual Selection**: Click one row in the Prefill table AND one row in the Decode table to manually choose each
**Example DGD Config Output:**
When you click "Show Config", you'll see a DynamoGraphDeployment configuration like:
```yaml
# DynamoGraphDeployment Configuration
# Prefill: 1 GPU(s), TP=1
# Decode: 4 GPU(s), TP=4
# Model: Qwen/Qwen3-32B-FP8
# Backend: trtllm
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
spec:
  services:
    PrefillWorker:
      subComponentType: prefill
      replicas: 1
      extraPodSpec:
        mainContainer:
          args:
            - --tensor-parallel-size=1
    DecodeWorker:
      subComponentType: decode
      replicas: 1
      extraPodSpec:
        mainContainer:
          args:
            - --tensor-parallel-size=4
```
**Usage:**
```bash
python -m benchmarks.profiler.profile_sla \
--backend trtllm \
--config path/to/disagg.yaml \
--pick-with-webui \
--use-ai-configurator \
--model Qwen/Qwen3-32B-FP8 \
--aic-system h200_sxm \
--ttft 200 --itl 15
```
Once you have selected a configuration, the full DynamoGraphDeployment CRD will be saved in your output folder as `config_with_planner.yaml`.
The WebUI launches on port 8000 by default (configurable with `--webui-port`).
#### Output Performance Plots #### Output Performance Plots
The profiler will generate the following plots to better visualize the performance data: The profiler will generate the following plots to better visualize the performance data:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment