Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d83ab662
Unverified
Commit
d83ab662
authored
Jan 16, 2026
by
Yan Ru Pei
Committed by
GitHub
Jan 16, 2026
Browse files
chore: make precentile plots in router benchmark (#5476)
Signed-off-by:
PeaBrane
<
yanrpei@gmail.com
>
parent
34c4882d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
85 additions
and
33 deletions
+85
-33
benchmarks/router/prefix_ratio_benchmark.py
benchmarks/router/prefix_ratio_benchmark.py
+85
-33
No files found.
benchmarks/router/prefix_ratio_benchmark.py
View file @
d83ab662
...
...
@@ -84,6 +84,8 @@ def get_aiperf_cmd(
str
(
num_prefix_prompts
),
"--artifact-dir"
,
artifact_dir
,
"--dataset-sampling-strategy"
,
"shuffle"
,
"-H"
,
"Authorization: Bearer NOT USED"
,
"-H"
,
...
...
@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
if
not
results
:
return
None
# For TTFT, we take the average across all URLs
# For throughput, we sum across all URLs (total system throughput)
ttft_values
=
[
r
[
"time_to_first_token"
][
"avg"
]
for
r
in
results
if
r
is
not
None
]
throughput_values
=
[
r
[
"output_token_throughput"
][
"avg"
]
for
r
in
results
if
r
is
not
None
]
if
not
ttft_values
or
not
throughput_values
:
valid_results
=
[
r
for
r
in
results
if
r
is
not
None
]
if
not
valid_results
:
return
None
# For TTFT percentiles, average across URLs
ttft_p25_values
=
[
r
[
"time_to_first_token"
][
"p25"
]
for
r
in
valid_results
]
ttft_p50_values
=
[
r
[
"time_to_first_token"
][
"p50"
]
for
r
in
valid_results
]
ttft_p75_values
=
[
r
[
"time_to_first_token"
][
"p75"
]
for
r
in
valid_results
]
# For ITL percentiles, average across URLs
itl_p25_values
=
[
r
[
"inter_token_latency"
][
"p25"
]
for
r
in
valid_results
]
itl_p50_values
=
[
r
[
"inter_token_latency"
][
"p50"
]
for
r
in
valid_results
]
itl_p75_values
=
[
r
[
"inter_token_latency"
][
"p75"
]
for
r
in
valid_results
]
aggregated
=
{
"time_to_first_token"
:
{
"avg"
:
sum
(
ttft_values
)
/
len
(
ttft_values
)},
"output_token_throughput"
:
{
"avg"
:
sum
(
throughput_values
)
# Total throughput across all URLs
"time_to_first_token"
:
{
"p25"
:
sum
(
ttft_p25_values
)
/
len
(
ttft_p25_values
),
"p50"
:
sum
(
ttft_p50_values
)
/
len
(
ttft_p50_values
),
"p75"
:
sum
(
ttft_p75_values
)
/
len
(
ttft_p75_values
),
},
"inter_token_latency"
:
{
"p25"
:
sum
(
itl_p25_values
)
/
len
(
itl_p25_values
),
"p50"
:
sum
(
itl_p50_values
)
/
len
(
itl_p50_values
),
"p75"
:
sum
(
itl_p75_values
)
/
len
(
itl_p75_values
),
},
}
...
...
@@ -328,8 +340,12 @@ def main():
# Store results
prefix_ratios
=
[]
ttft_values
=
[]
throughput_values
=
[]
ttft_p25_values
=
[]
ttft_p50_values
=
[]
ttft_p75_values
=
[]
itl_p25_values
=
[]
itl_p50_values
=
[]
itl_p75_values
=
[]
current_seed
=
args
.
seed
...
...
@@ -350,50 +366,82 @@ def main():
)
if
result
is
not
None
:
ttft
=
result
[
"time_to_first_token"
]
[
"avg"
]
throughput
=
result
[
"
output
_token_
throughput"
][
"avg
"
]
ttft
=
result
[
"time_to_first_token"
]
itl
=
result
[
"
inter
_token_
latency
"
]
prefix_ratios
.
append
(
prefix_ratio
)
ttft_values
.
append
(
ttft
)
throughput_values
.
append
(
throughput
)
ttft_p25_values
.
append
(
ttft
[
"p25"
])
ttft_p50_values
.
append
(
ttft
[
"p50"
])
ttft_p75_values
.
append
(
ttft
[
"p75"
])
itl_p25_values
.
append
(
itl
[
"p25"
])
itl_p50_values
.
append
(
itl
[
"p50"
])
itl_p75_values
.
append
(
itl
[
"p75"
])
logger
.
info
(
f
"Prefix ratio
{
prefix_ratio
}
: TTFT=
{
ttft
:.
2
f
}
ms, Throughput=
{
throughput
:.
2
f
}
tokens/s"
f
"Prefix ratio
{
prefix_ratio
}
: TTFT p50=
{
ttft
[
'p50'
]:.
2
f
}
ms (p25=
{
ttft
[
'p25'
]:.
2
f
}
, p75=
{
ttft
[
'p75'
]:.
2
f
}
), "
f
"ITL p50=
{
itl
[
'p50'
]:.
2
f
}
ms (p25=
{
itl
[
'p25'
]:.
2
f
}
, p75=
{
itl
[
'p75'
]:.
2
f
}
)"
)
current_seed
+=
1
# Create plots
if
prefix_ratios
and
ttft_values
and
throughput_values
:
# Plot TTFT vs Prefix Ratio
if
prefix_ratios
and
ttft_p50_values
and
itl_p50_values
:
plt
.
figure
(
figsize
=
(
12
,
5
))
# Plot TTFT vs Prefix Ratio with shaded p25-p75 region
plt
.
subplot
(
1
,
2
,
1
)
plt
.
plot
(
prefix_ratios
,
ttft_values
,
"bo-"
,
linewidth
=
2
,
markersize
=
8
)
plt
.
fill_between
(
prefix_ratios
,
ttft_p25_values
,
ttft_p75_values
,
alpha
=
0.3
,
color
=
"blue"
,
label
=
"p25-p75"
,
)
plt
.
plot
(
prefix_ratios
,
ttft_p50_values
,
"bo-"
,
linewidth
=
2
,
markersize
=
8
,
label
=
"p50"
,
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
ylabel
(
"Time to First Token (ms)"
)
plt
.
title
(
"TTFT vs Prefix Ratio"
)
plt
.
grid
(
True
,
alpha
=
0.3
)
for
i
,
(
pr
,
ttft
)
in
enumerate
(
zip
(
prefix_ratios
,
ttft_values
)):
plt
.
legend
()
for
i
,
(
pr
,
p50
)
in
enumerate
(
zip
(
prefix_ratios
,
ttft_p50_values
)):
plt
.
annotate
(
f
"
{
ttft
:.
1
f
}
ms"
,
(
pr
,
ttft
),
f
"
{
p50
:.
1
f
}
ms"
,
(
pr
,
p50
),
textcoords
=
"offset points"
,
xytext
=
(
0
,
10
),
ha
=
"center"
,
)
# Plot
Throughput
vs Prefix Ratio
# Plot
ITL
vs Prefix Ratio
with shaded p25-p75 region
plt
.
subplot
(
1
,
2
,
2
)
plt
.
plot
(
prefix_ratios
,
throughput_values
,
"ro-"
,
linewidth
=
2
,
markersize
=
8
)
plt
.
fill_between
(
prefix_ratios
,
itl_p25_values
,
itl_p75_values
,
alpha
=
0.3
,
color
=
"red"
,
label
=
"p25-p75"
,
)
plt
.
plot
(
prefix_ratios
,
itl_p50_values
,
"ro-"
,
linewidth
=
2
,
markersize
=
8
,
label
=
"p50"
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
ylabel
(
"
Output Token Throughput (tokens/
s)"
)
plt
.
title
(
"
Throughput
vs Prefix Ratio"
)
plt
.
ylabel
(
"
Inter-Token Latency (m
s)"
)
plt
.
title
(
"
ITL
vs Prefix Ratio"
)
plt
.
grid
(
True
,
alpha
=
0.3
)
for
i
,
(
pr
,
thpt
)
in
enumerate
(
zip
(
prefix_ratios
,
throughput_values
)):
plt
.
legend
()
for
i
,
(
pr
,
p50
)
in
enumerate
(
zip
(
prefix_ratios
,
itl_p50_values
)):
plt
.
annotate
(
f
"
{
thpt
:.
1
f
}
"
,
(
pr
,
thpt
),
f
"
{
p50
:.
1
f
}
ms
"
,
(
pr
,
p50
),
textcoords
=
"offset points"
,
xytext
=
(
0
,
10
),
ha
=
"center"
,
...
...
@@ -409,8 +457,12 @@ def main():
# Save results to JSON
results_data
=
{
"prefix_ratios"
:
prefix_ratios
,
"ttft_values"
:
ttft_values
,
"throughput_values"
:
throughput_values
,
"ttft_p25_values"
:
ttft_p25_values
,
"ttft_p50_values"
:
ttft_p50_values
,
"ttft_p75_values"
:
ttft_p75_values
,
"itl_p25_values"
:
itl_p25_values
,
"itl_p50_values"
:
itl_p50_values
,
"itl_p75_values"
:
itl_p75_values
,
"config"
:
{
"model"
:
args
.
model
,
"tokenizer"
:
args
.
tokenizer
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment