Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d83ab662
Unverified
Commit
d83ab662
authored
Jan 16, 2026
by
Yan Ru Pei
Committed by
GitHub
Jan 16, 2026
Browse files
chore: make percentile plots in router benchmark (#5476)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent
34c4882d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
85 additions
and
33 deletions
+85
-33
benchmarks/router/prefix_ratio_benchmark.py
benchmarks/router/prefix_ratio_benchmark.py
+85
-33
No files found.
benchmarks/router/prefix_ratio_benchmark.py
View file @
d83ab662
...
@@ -84,6 +84,8 @@ def get_aiperf_cmd(
...
@@ -84,6 +84,8 @@ def get_aiperf_cmd(
str
(
num_prefix_prompts
),
str
(
num_prefix_prompts
),
"--artifact-dir"
,
"--artifact-dir"
,
artifact_dir
,
artifact_dir
,
"--dataset-sampling-strategy"
,
"shuffle"
,
"-H"
,
"-H"
,
"Authorization: Bearer NOT USED"
,
"Authorization: Bearer NOT USED"
,
"-H"
,
"-H"
,
...
@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
...
@@ -157,20 +159,30 @@ def aggregate_results(results: List[Optional[Dict]]) -> Optional[Dict]:
if
not
results
:
if
not
results
:
return
None
return
None
# For TTFT, we take the average across all URLs
valid_results
=
[
r
for
r
in
results
if
r
is
not
None
]
# For throughput, we sum across all URLs (total system throughput)
if
not
valid_results
:
ttft_values
=
[
r
[
"time_to_first_token"
][
"avg"
]
for
r
in
results
if
r
is
not
None
]
throughput_values
=
[
r
[
"output_token_throughput"
][
"avg"
]
for
r
in
results
if
r
is
not
None
]
if
not
ttft_values
or
not
throughput_values
:
return
None
return
None
# For TTFT percentiles, average across URLs
ttft_p25_values
=
[
r
[
"time_to_first_token"
][
"p25"
]
for
r
in
valid_results
]
ttft_p50_values
=
[
r
[
"time_to_first_token"
][
"p50"
]
for
r
in
valid_results
]
ttft_p75_values
=
[
r
[
"time_to_first_token"
][
"p75"
]
for
r
in
valid_results
]
# For ITL percentiles, average across URLs
itl_p25_values
=
[
r
[
"inter_token_latency"
][
"p25"
]
for
r
in
valid_results
]
itl_p50_values
=
[
r
[
"inter_token_latency"
][
"p50"
]
for
r
in
valid_results
]
itl_p75_values
=
[
r
[
"inter_token_latency"
][
"p75"
]
for
r
in
valid_results
]
aggregated
=
{
aggregated
=
{
"time_to_first_token"
:
{
"avg"
:
sum
(
ttft_values
)
/
len
(
ttft_values
)},
"time_to_first_token"
:
{
"output_token_throughput"
:
{
"p25"
:
sum
(
ttft_p25_values
)
/
len
(
ttft_p25_values
),
"avg"
:
sum
(
throughput_values
)
# Total throughput across all URLs
"p50"
:
sum
(
ttft_p50_values
)
/
len
(
ttft_p50_values
),
"p75"
:
sum
(
ttft_p75_values
)
/
len
(
ttft_p75_values
),
},
"inter_token_latency"
:
{
"p25"
:
sum
(
itl_p25_values
)
/
len
(
itl_p25_values
),
"p50"
:
sum
(
itl_p50_values
)
/
len
(
itl_p50_values
),
"p75"
:
sum
(
itl_p75_values
)
/
len
(
itl_p75_values
),
},
},
}
}
...
@@ -328,8 +340,12 @@ def main():
...
@@ -328,8 +340,12 @@ def main():
# Store results
# Store results
prefix_ratios
=
[]
prefix_ratios
=
[]
ttft_values
=
[]
ttft_p25_values
=
[]
throughput_values
=
[]
ttft_p50_values
=
[]
ttft_p75_values
=
[]
itl_p25_values
=
[]
itl_p50_values
=
[]
itl_p75_values
=
[]
current_seed
=
args
.
seed
current_seed
=
args
.
seed
...
@@ -350,50 +366,82 @@ def main():
...
@@ -350,50 +366,82 @@ def main():
)
)
if
result
is
not
None
:
if
result
is
not
None
:
ttft
=
result
[
"time_to_first_token"
]
[
"avg"
]
ttft
=
result
[
"time_to_first_token"
]
throughput
=
result
[
"
output
_token_
throughput"
][
"avg
"
]
itl
=
result
[
"
inter
_token_
latency
"
]
prefix_ratios
.
append
(
prefix_ratio
)
prefix_ratios
.
append
(
prefix_ratio
)
ttft_values
.
append
(
ttft
)
ttft_p25_values
.
append
(
ttft
[
"p25"
])
throughput_values
.
append
(
throughput
)
ttft_p50_values
.
append
(
ttft
[
"p50"
])
ttft_p75_values
.
append
(
ttft
[
"p75"
])
itl_p25_values
.
append
(
itl
[
"p25"
])
itl_p50_values
.
append
(
itl
[
"p50"
])
itl_p75_values
.
append
(
itl
[
"p75"
])
logger
.
info
(
logger
.
info
(
f
"Prefix ratio
{
prefix_ratio
}
: TTFT=
{
ttft
:.
2
f
}
ms, Throughput=
{
throughput
:.
2
f
}
tokens/s"
f
"Prefix ratio
{
prefix_ratio
}
: TTFT p50=
{
ttft
[
'p50'
]:.
2
f
}
ms (p25=
{
ttft
[
'p25'
]:.
2
f
}
, p75=
{
ttft
[
'p75'
]:.
2
f
}
), "
f
"ITL p50=
{
itl
[
'p50'
]:.
2
f
}
ms (p25=
{
itl
[
'p25'
]:.
2
f
}
, p75=
{
itl
[
'p75'
]:.
2
f
}
)"
)
)
current_seed
+=
1
current_seed
+=
1
# Create plots
# Create plots
if
prefix_ratios
and
ttft_values
and
throughput_values
:
if
prefix_ratios
and
ttft_p50_values
and
itl_p50_values
:
# Plot TTFT vs Prefix Ratio
plt
.
figure
(
figsize
=
(
12
,
5
))
plt
.
figure
(
figsize
=
(
12
,
5
))
# Plot TTFT vs Prefix Ratio with shaded p25-p75 region
plt
.
subplot
(
1
,
2
,
1
)
plt
.
subplot
(
1
,
2
,
1
)
plt
.
plot
(
prefix_ratios
,
ttft_values
,
"bo-"
,
linewidth
=
2
,
markersize
=
8
)
plt
.
fill_between
(
prefix_ratios
,
ttft_p25_values
,
ttft_p75_values
,
alpha
=
0.3
,
color
=
"blue"
,
label
=
"p25-p75"
,
)
plt
.
plot
(
prefix_ratios
,
ttft_p50_values
,
"bo-"
,
linewidth
=
2
,
markersize
=
8
,
label
=
"p50"
,
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
ylabel
(
"Time to First Token (ms)"
)
plt
.
ylabel
(
"Time to First Token (ms)"
)
plt
.
title
(
"TTFT vs Prefix Ratio"
)
plt
.
title
(
"TTFT vs Prefix Ratio"
)
plt
.
grid
(
True
,
alpha
=
0.3
)
plt
.
grid
(
True
,
alpha
=
0.3
)
for
i
,
(
pr
,
ttft
)
in
enumerate
(
zip
(
prefix_ratios
,
ttft_values
)):
plt
.
legend
()
for
i
,
(
pr
,
p50
)
in
enumerate
(
zip
(
prefix_ratios
,
ttft_p50_values
)):
plt
.
annotate
(
plt
.
annotate
(
f
"
{
ttft
:.
1
f
}
ms"
,
f
"
{
p50
:.
1
f
}
ms"
,
(
pr
,
ttft
),
(
pr
,
p50
),
textcoords
=
"offset points"
,
textcoords
=
"offset points"
,
xytext
=
(
0
,
10
),
xytext
=
(
0
,
10
),
ha
=
"center"
,
ha
=
"center"
,
)
)
# Plot
Throughput
vs Prefix Ratio
# Plot
ITL
vs Prefix Ratio
with shaded p25-p75 region
plt
.
subplot
(
1
,
2
,
2
)
plt
.
subplot
(
1
,
2
,
2
)
plt
.
plot
(
prefix_ratios
,
throughput_values
,
"ro-"
,
linewidth
=
2
,
markersize
=
8
)
plt
.
fill_between
(
prefix_ratios
,
itl_p25_values
,
itl_p75_values
,
alpha
=
0.3
,
color
=
"red"
,
label
=
"p25-p75"
,
)
plt
.
plot
(
prefix_ratios
,
itl_p50_values
,
"ro-"
,
linewidth
=
2
,
markersize
=
8
,
label
=
"p50"
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
xlabel
(
"Prefix Ratio"
)
plt
.
ylabel
(
"
Output Token Throughput (tokens/
s)"
)
plt
.
ylabel
(
"
Inter-Token Latency (m
s)"
)
plt
.
title
(
"
Throughput
vs Prefix Ratio"
)
plt
.
title
(
"
ITL
vs Prefix Ratio"
)
plt
.
grid
(
True
,
alpha
=
0.3
)
plt
.
grid
(
True
,
alpha
=
0.3
)
for
i
,
(
pr
,
thpt
)
in
enumerate
(
zip
(
prefix_ratios
,
throughput_values
)):
plt
.
legend
()
for
i
,
(
pr
,
p50
)
in
enumerate
(
zip
(
prefix_ratios
,
itl_p50_values
)):
plt
.
annotate
(
plt
.
annotate
(
f
"
{
thpt
:.
1
f
}
"
,
f
"
{
p50
:.
1
f
}
ms
"
,
(
pr
,
thpt
),
(
pr
,
p50
),
textcoords
=
"offset points"
,
textcoords
=
"offset points"
,
xytext
=
(
0
,
10
),
xytext
=
(
0
,
10
),
ha
=
"center"
,
ha
=
"center"
,
...
@@ -409,8 +457,12 @@ def main():
...
@@ -409,8 +457,12 @@ def main():
# Save results to JSON
# Save results to JSON
results_data
=
{
results_data
=
{
"prefix_ratios"
:
prefix_ratios
,
"prefix_ratios"
:
prefix_ratios
,
"ttft_values"
:
ttft_values
,
"ttft_p25_values"
:
ttft_p25_values
,
"throughput_values"
:
throughput_values
,
"ttft_p50_values"
:
ttft_p50_values
,
"ttft_p75_values"
:
ttft_p75_values
,
"itl_p25_values"
:
itl_p25_values
,
"itl_p50_values"
:
itl_p50_values
,
"itl_p75_values"
:
itl_p75_values
,
"config"
:
{
"config"
:
{
"model"
:
args
.
model
,
"model"
:
args
.
model
,
"tokenizer"
:
args
.
tokenizer
,
"tokenizer"
:
args
.
tokenizer
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment