Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
0b284b63
Unverified
Commit
0b284b63
authored
Nov 05, 2025
by
Hongkuan Zhou
Committed by
GitHub
Nov 05, 2025
Browse files
feat: add cost plot to profiler (#4003)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
4765d880
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
94 additions
and
13 deletions
+94
-13
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+9
-7
benchmarks/profiler/utils/pareto.py
benchmarks/profiler/utils/pareto.py
+36
-0
benchmarks/profiler/utils/plot.py
benchmarks/profiler/utils/plot.py
+49
-6
No files found.
benchmarks/profiler/profile_sla.py
View file @
0b284b63
...
@@ -27,6 +27,7 @@ from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_pl
...
@@ -27,6 +27,7 @@ from benchmarks.profiler.utils.dgd_generation import generate_dgd_config_with_pl
from
benchmarks.profiler.utils.estimate_perf
import
AIConfiguratorPerfEstimator
from
benchmarks.profiler.utils.estimate_perf
import
AIConfiguratorPerfEstimator
from
benchmarks.profiler.utils.plot
import
(
from
benchmarks.profiler.utils.plot
import
(
plot_decode_performance
,
plot_decode_performance
,
plot_pd_joint_results
,
plot_prefill_performance
,
plot_prefill_performance
,
)
)
from
benchmarks.profiler.utils.profile_cache
import
(
from
benchmarks.profiler.utils.profile_cache
import
(
...
@@ -280,14 +281,10 @@ async def run_profile(args):
...
@@ -280,14 +281,10 @@ async def run_profile(args):
prefill_thpt_per_gpu
.
append
(
args
.
isl
/
ttft
/
num_gpus
*
1000
)
prefill_thpt_per_gpu
.
append
(
args
.
isl
/
ttft
/
num_gpus
*
1000
)
# Plot the results as a 2D scatter plot
# Plot the results as a 2D scatter plot
prefill_results
=
None
if
prefill_num_gpus
and
prefill_ttft
and
prefill_thpt_per_gpu
:
if
prefill_num_gpus
and
prefill_ttft
and
prefill_thpt_per_gpu
:
plot_prefill_performance
(
prefill_results
=
(
prefill_num_gpus
,
prefill_ttft
,
prefill_thpt_per_gpu
)
prefill_num_gpus
,
plot_prefill_performance
(
prefill_results
,
args
.
ttft
,
args
.
output_dir
)
prefill_ttft
,
prefill_thpt_per_gpu
,
args
.
ttft
,
args
.
output_dir
,
)
# then profile decode
# then profile decode
decode_num_gpus
=
[]
decode_num_gpus
=
[]
...
@@ -476,6 +473,11 @@ async def run_profile(args):
...
@@ -476,6 +473,11 @@ async def run_profile(args):
if
decode_results
:
if
decode_results
:
plot_decode_performance
(
decode_results
,
args
.
itl
,
args
.
output_dir
)
plot_decode_performance
(
decode_results
,
args
.
itl
,
args
.
output_dir
)
if
prefill_results
and
decode_results
:
plot_pd_joint_results
(
args
.
isl
,
args
.
osl
,
prefill_results
,
decode_results
,
args
.
output_dir
)
if
args
.
dry_run
:
if
args
.
dry_run
:
logger
.
info
(
"Skipping recommendations in dry run mode"
)
logger
.
info
(
"Skipping recommendations in dry run mode"
)
else
:
else
:
...
...
benchmarks/profiler/utils/pareto.py
0 → 100644
View file @
0b284b63
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
def compute_pareto(x, y):
    """
    Extract the Pareto front from paired samples where a smaller x and a larger y
    are both preferable (the top-left corner of an x/y plot is best).

    Args:
        x: sequence of values to minimize, or None
        y: sequence of values to maximize, or None; must be as long as x

    Returns:
        Two lists (xs, ys) with the coordinates of the non-dominated points,
        ordered by ascending x. Both lists are empty when either input is None
        or when the inputs contain no points.

    Raises:
        ValueError: if x and y differ in length
    """
    # Guard clauses: missing input, mismatched lengths, nothing to process.
    if x is None or y is None:
        return [], []
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    if not len(x):
        return [], []

    # Walk candidates from left to right. Among points sharing the same x the
    # one with the largest y is visited first, so lower duplicates get rejected.
    ordered = sorted(zip(x, y), key=lambda pt: (pt[0], -pt[1]))

    # A point survives only if it is strictly higher than everything to its left.
    front = []
    best_y = float("-inf")
    for candidate in ordered:
        if candidate[1] > best_y:
            front.append(candidate)
            best_y = candidate[1]

    # Hand the front back ordered by x (then y) as two parallel lists.
    front.sort(key=lambda pt: (pt[0], pt[1]))
    return [pt[0] for pt in front], [pt[1] for pt in front]
benchmarks/profiler/utils/plot.py
View file @
0b284b63
...
@@ -20,6 +20,8 @@ import numpy as np
...
@@ -20,6 +20,8 @@ import numpy as np
from
matplotlib
import
cm
from
matplotlib
import
cm
from
scipy.interpolate
import
griddata
from
scipy.interpolate
import
griddata
from
benchmarks.profiler.utils.pareto
import
compute_pareto
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
logger
.
setLevel
(
logging
.
INFO
)
console_handler
=
logging
.
StreamHandler
()
console_handler
=
logging
.
StreamHandler
()
...
@@ -31,19 +33,16 @@ console_handler.setFormatter(formatter)
...
@@ -31,19 +33,16 @@ console_handler.setFormatter(formatter)
logger
.
addHandler
(
console_handler
)
logger
.
addHandler
(
console_handler
)
def
plot_prefill_performance
(
def
plot_prefill_performance
(
prefill_results
,
target_ttft
,
output_dir
):
prefill_num_gpu
,
prefill_ttft
,
prefill_thpt_per_gpu
,
target_ttft
,
output_dir
):
"""
"""
Plot prefill performance as a 2D scatter plot with GPU count annotations.
Plot prefill performance as a 2D scatter plot with GPU count annotations.
Args:
Args:
prefill_num_gpu: list of GPU counts
prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
prefill_ttft: list of time to first token values
prefill_thpt_per_gpu: list of throughput per GPU values
target_ttft: target TTFT value for the vertical line
target_ttft: target TTFT value for the vertical line
output_dir: directory to save the plot
output_dir: directory to save the plot
"""
"""
prefill_num_gpu
,
prefill_ttft
,
prefill_thpt_per_gpu
=
prefill_results
plt
.
figure
(
figsize
=
(
10
,
6
))
plt
.
figure
(
figsize
=
(
10
,
6
))
plt
.
scatter
(
prefill_ttft
,
prefill_thpt_per_gpu
,
s
=
100
)
plt
.
scatter
(
prefill_ttft
,
prefill_thpt_per_gpu
,
s
=
100
)
for
i
,
num_gpu
in
enumerate
(
prefill_num_gpu
):
for
i
,
num_gpu
in
enumerate
(
prefill_num_gpu
):
...
@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
...
@@ -252,3 +251,47 @@ def plot_decode_3d_surface(
logger
.
info
(
f
"Saving throughput surface plot to
{
thpt_plot_path
}
"
)
logger
.
info
(
f
"Saving throughput surface plot to
{
thpt_plot_path
}
"
)
plt
.
savefig
(
thpt_plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
savefig
(
thpt_plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
close
()
plt
.
close
()
def plot_pd_joint_results(
    isl, osl, prefill_results, decode_results, output_dir, gpu_cost_per_hour=3.0
):
    """
    Plot the GPU cost of 1000 requests against per-user decode speed, drawing one
    curve per Pareto-optimal prefill point, and save it as cost_sla.png.

    Args:
        isl: input sequence length, used to scale the prefill cost
        osl: output sequence length, used to scale the decode cost
        prefill_results: tuple of (prefill_num_gpu, prefill_ttft, prefill_thpt_per_gpu)
        decode_results: iterable of per-configuration results; element [1] of each
            entry is read as a list of ITL values and element [2] as the matching
            list of throughput values (presumably per GPU -- confirm against caller)
        output_dir: directory to save the plot
        gpu_cost_per_hour: price of one GPU for one hour in dollars (default 3.0)
    """
    # compute pareto front for prefill (lower TTFT and higher throughput are better)
    p_ttft, p_thpt = compute_pareto(prefill_results[1], prefill_results[2])

    # compute pareto front for decode over the points of all decode configurations
    _d_itl, _d_thpt = [], []
    for _d_result in decode_results:
        _d_itl.extend(_d_result[1])
        _d_thpt.extend(_d_result[2])
    d_itl, d_thpt = compute_pareto(_d_itl, _d_thpt)

    # convert to numpy so the per-point arithmetic below is vectorized
    p_ttft = np.array(p_ttft)
    p_thpt = np.array(p_thpt)
    d_itl = np.array(d_itl)
    d_thpt = np.array(d_thpt)

    # The decode side does not depend on the prefill point, so compute it once
    # instead of once per curve.
    # NOTE(review): 1000 / itl assumes ITL is in milliseconds -- confirm.
    tokens_per_user = 1000 / d_itl
    decode_cost = osl * 1000 / d_thpt * gpu_cost_per_hour / 3600

    # plot
    plt.figure(figsize=(12, 10))
    plt.title(
        f"Cost Per 1000 i{isl}o{osl} requests (GPU/hour = ${gpu_cost_per_hour}) Under Different SLA"
    )
    for _p_ttft, _p_thpt in zip(p_ttft, p_thpt):
        # cost per thousand requests = decode cost + prefill cost at this point
        prefill_cost = isl * 1000 / _p_thpt * gpu_cost_per_hour / 3600
        _cost = decode_cost + prefill_cost
        line = plt.plot(tokens_per_user, _cost, label=f"TTFT: {_p_ttft:.2f}ms")[0]
        # reuse the line color so the markers visually belong to their curve
        plt.scatter(
            tokens_per_user, _cost, marker="x", s=100, color=line.get_color()
        )
    plt.xlabel("Tokens per User")
    plt.ylabel("Cost ($)")
    plt.grid(True)
    plt.legend()
    plt.savefig(f"{output_dir}/cost_sla.png", dpi=300)
    plt.close()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment