Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
fd358991
"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "675ba75f40f756b113955f77c324f962da1830eb"
Unverified
Commit
fd358991
authored
Aug 11, 2025
by
Hongkuan Zhou
Committed by
GitHub
Aug 11, 2025
Browse files
feat: standalone profiling script for a given endpoint (#2386)
parent
dabd2267
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
326 additions
and
121 deletions
+326
-121
benchmarks/profiler/profile_endpoint.py
benchmarks/profiler/profile_endpoint.py
+100
-0
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+22
-104
benchmarks/profiler/utils/plot.py
benchmarks/profiler/utils/plot.py
+50
-17
benchmarks/profiler/utils/profile_decode.py
benchmarks/profiler/utils/profile_decode.py
+85
-0
benchmarks/profiler/utils/profile_prefill.py
benchmarks/profiler/utils/profile_prefill.py
+69
-0
No files found.
benchmarks/profiler/profile_endpoint.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
logging
import
os
from
utils.profile_prefill
import
profile_prefill
from
benchmarks.profiler.utils.profile_decode
import
profile_decode
# Module-level logger that writes INFO-and-above records to the console
# with a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def main(argv=None):
    """Parse the command line and profile an endpoint in prefill or decode mode.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, including ``--mode decode`` without
            a positive ``--max_kv_tokens`` (argparse exits with status 2).
    """
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["prefill", "decode"],
        help="mode to profile",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="base url of the endpoint",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        required=True,
        help="number of gpus",
    )
    parser.add_argument(
        "--max_kv_tokens",
        type=int,
        required=False,
        default=0,
        help="max kv tokens of the endpoint (only used for decode)",
    )
    parser.add_argument(
        "--work_dir",
        type=str,
        default="endpoint_profiling_results/",
        help="work directory to save the results",
    )
    parser.add_argument(
        "--max_context_length",
        type=int,
        default=16384,
        help="max context length of the endpoint",
    )
    parser.add_argument(
        "--interpolation_granularity",
        type=int,
        default=8,
        help="interpolation granularity for the results",
    )

    args = parser.parse_args(argv)

    # Validate before touching the filesystem. parser.error() is used instead
    # of `assert` because assertions are stripped when Python runs with -O.
    if args.mode == "decode" and args.max_kv_tokens <= 0:
        parser.error("max_kv_tokens must be provided for decode")

    os.makedirs(args.work_dir, exist_ok=True)

    if args.mode == "prefill":
        profile_prefill(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_context_length,
            args.interpolation_granularity,
        )
    elif args.mode == "decode":
        profile_decode(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
        )
    else:
        # Defensive only: argparse `choices` already rejects other values.
        raise ValueError(f"Invalid mode: {args.mode}")


if __name__ == "__main__":
    main()
benchmarks/profiler/profile_sla.py
View file @
fd358991
...
...
@@ -28,18 +28,16 @@ from utils.dynamo_deployment import (
cleanup_remaining_deployments
,
)
from
utils.genai_perf
import
benchmark_decode
,
benchmark_prefill
from
utils.plot
import
(
plot_decode_3d_surface
,
plot_decode_performance
,
plot_prefill_interpolation
,
plot_prefill_performance
,
)
from
utils.plot
import
plot_decode_performance
,
plot_prefill_performance
from
utils.profile_cache
import
(
check_decode_results_exist
,
check_prefill_results_exist
,
load_existing_decode_results
,
load_existing_prefill_results
,
)
from
utils.profile_prefill
import
profile_prefill
from
benchmarks.profiler.utils.profile_decode
import
profile_decode
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
...
...
@@ -373,9 +371,6 @@ async def run_profile(args):
# interpolate ISL - TTFT with best prefill TP
best_prefill_tp
=
prefill_tp_size
[
selected_prefill_idx
]
prefill_isl
=
[]
prefill_ttft
=
[]
prefill_thpt_per_gpu
=
[]
logger
.
info
(
f
"Profiling prefill under best TP
{
best_prefill_tp
}
with different ISL..."
)
...
...
@@ -420,58 +415,22 @@ async def run_profile(args):
)
base_url
=
client
.
get_service_url
()
for
isl
in
range
(
100
,
profile_prefill
(
work_dir
,
model_name
,
base_url
,
best_prefill_tp
,
args
.
max_context_length
,
(
args
.
max_context_length
-
100
)
//
args
.
prefill_interpolation_granularity
,
):
# run genai-perf
genai_perf_artifact_dir
=
f
"
{
work_dir
}
/gap_isl
{
isl
}
"
gap_result
=
benchmark_prefill
(
isl
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
base_url
)
if
gap_result
is
not
None
:
ttft
=
gap_result
[
"time_to_first_token"
][
"avg"
]
prefill_isl
.
append
(
isl
)
prefill_ttft
.
append
(
ttft
)
prefill_thpt_per_gpu
.
append
(
isl
/
ttft
/
best_prefill_tp
*
1000
)
args
.
prefill_interpolation_granularity
,
)
print
(
"Cleaning up deployment..."
)
await
client
.
delete_deployment
()
deployment_clients
.
remove
(
client
)
print
(
"Deployment deleted"
)
# Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
if
len
(
prefill_isl
)
>
2
:
logger
.
info
(
"Interpolating prefill TTFT and throughput vs ISL..."
)
# Convert to numpy arrays for easier manipulation
prefill_isl_np
=
np
.
array
(
prefill_isl
)
prefill_ttft_np
=
np
.
array
(
prefill_ttft
)
prefill_thpt_per_gpu_np
=
np
.
array
(
prefill_thpt_per_gpu
)
save_path
=
f
"
{
work_dir
}
/raw_data.npz"
np
.
savez
(
save_path
,
prefill_isl
=
prefill_isl_np
,
prefill_ttft
=
prefill_ttft_np
,
prefill_thpt_per_gpu
=
prefill_thpt_per_gpu_np
,
)
# Call the plotting function
plot_prefill_interpolation
(
prefill_isl_np
,
prefill_ttft_np
,
prefill_thpt_per_gpu_np
,
work_dir
)
else
:
logger
.
warning
(
"Not enough data points to perform interpolation (need at least 3 points)"
)
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
x_kv_usage
=
[]
y_context_length
=
[]
z_itl
=
[]
z_thpt_per_gpu
=
[]
best_decode_tp
=
decode_tp_size
[
selected_decode_idx
]
logger
.
info
(
f
"Profiling decode with TP size
{
best_decode_tp
}
..."
)
decode_config
=
config_modifier
.
set_config_tp_size
(
...
...
@@ -508,64 +467,23 @@ async def run_profile(args):
f
"
{
work_dir
}
/vllm-v1-agg/vllmdecodeworker/0.log"
)
osl
=
500
# not too large to reduce ITL variance, not too small to have stable measurement
base_url
=
client
.
get_service_url
()
for
isl
in
range
(
100
,
args
.
max_context_length
-
osl
,
(
args
.
max_context_length
-
osl
)
//
args
.
decode_interpolation_granularity
,
):
max_concurrency
=
max_kv_tokens
//
(
isl
+
osl
)
sweep_num_request
=
list
(
range
(
1
,
max_concurrency
,
max_concurrency
//
args
.
decode_interpolation_granularity
,
)
)
for
num_request
in
sweep_num_request
:
genai_perf_artifact_dir
=
(
f
"
{
work_dir
}
/gap_isl
{
isl
}
_osl
{
osl
}
_n
{
num_request
}
"
)
gap_result
=
benchmark_decode
(
isl
,
osl
,
num_request
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
base_url
,
)
if
gap_result
is
not
None
:
itl
=
gap_result
[
"inter_token_latency"
][
"avg"
]
x_kv_usage
.
append
((
isl
+
osl
/
2
)
*
num_request
/
max_kv_tokens
)
y_context_length
.
append
(
isl
+
osl
/
2
)
z_itl
.
append
(
itl
)
z_thpt_per_gpu
.
append
(
gap_result
[
"output_token_throughput"
][
"avg"
]
/
best_decode_tp
)
profile_decode
(
work_dir
,
model_name
,
base_url
,
best_decode_tp
,
max_kv_tokens
,
args
.
max_context_length
,
args
.
decode_interpolation_granularity
,
)
print
(
"Cleaning up deployment..."
)
await
client
.
delete_deployment
()
deployment_clients
.
remove
(
client
)
print
(
"Deployment deleted"
)
# Save the data points to a .npz file
save_path
=
f
"
{
work_dir
}
/raw_data.npz"
np
.
savez
(
save_path
,
x_kv_usage
=
np
.
array
(
x_kv_usage
),
y_context_length
=
np
.
array
(
y_context_length
),
z_itl
=
np
.
array
(
z_itl
),
z_thpt_per_gpu
=
np
.
array
(
z_thpt_per_gpu
),
max_kv_tokens
=
np
.
array
([
max_kv_tokens
]),
)
logger
.
info
(
f
"Saved data points to
{
save_path
}
"
)
# Plot 3D surface
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
best_decode_tp
,
work_dir
)
except
Exception
as
e
:
logger
.
error
(
f
"Profile job failed with error:
{
e
}
"
)
raise
...
...
benchmarks/profiler/utils/plot.py
View file @
fd358991
...
...
@@ -114,16 +114,13 @@ def plot_prefill_interpolation(
"""
# Fit quadratic functions
ttft_coeffs
=
np
.
polyfit
(
prefill_isl_np
,
prefill_ttft_np
,
2
)
thpt_coeffs
=
np
.
polyfit
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
2
)
# Create interpolation functions
ttft_poly
=
np
.
poly1d
(
ttft_coeffs
)
thpt_poly
=
np
.
poly1d
(
thpt_coeffs
)
# Generate points for smooth curves
x_interp
=
np
.
linspace
(
min
(
prefill_isl_np
),
max
(
prefill_isl_np
),
100
)
ttft_interp
=
ttft_poly
(
x_interp
)
thpt_interp
=
thpt_poly
(
x_interp
)
# Plot TTFT vs ISL
plt
.
figure
(
figsize
=
(
10
,
6
))
...
...
@@ -148,14 +145,7 @@ def plot_prefill_interpolation(
# Plot Throughput vs ISL
plt
.
figure
(
figsize
=
(
10
,
6
))
plt
.
scatter
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
s
=
100
,
label
=
"Measured data"
)
plt
.
plot
(
x_interp
,
thpt_interp
,
"g-"
,
label
=
f
"Quadratic fit:
{
thpt_coeffs
[
0
]:.
2
e
}
x² +
{
thpt_coeffs
[
1
]:.
2
e
}
x +
{
thpt_coeffs
[
2
]:.
2
e
}
"
,
)
plt
.
scatter
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
s
=
100
,
label
=
"Throughput/GPU"
)
plt
.
title
(
"Prefill Throughput vs Input Sequence Length"
)
plt
.
xlabel
(
"Input Sequence Length (tokens)"
)
plt
.
ylabel
(
"Prefill throughput per GPU (tokens/s/GPU)"
)
...
...
@@ -170,7 +160,9 @@ def plot_prefill_interpolation(
plt
.
close
()
def
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
tp_size
,
work_dir
):
def
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
z_thpt_per_gpu
,
work_dir
):
"""
Plot 3D surface for decode interpolation with KV usage, context length, and ITL.
...
...
@@ -178,14 +170,18 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
x_kv_usage: list of KV usage percentages
y_context_length: list of context lengths
z_itl: list of ITL values
tp_size: TP size for the plot filename
z_thpt_per_gpu: list of throughput per GPU values
work_dir: directory to save the plot
"""
xi
=
np
.
linspace
(
min
(
x_kv_usage
),
max
(
x_kv_usage
),
100
)
yi
=
np
.
linspace
(
min
(
y_context_length
),
max
(
y_context_length
),
100
)
X
,
Y
=
np
.
meshgrid
(
xi
,
yi
)
Z
=
griddata
((
x_kv_usage
,
y_context_length
),
z_itl
,
(
X
,
Y
),
method
=
"cubic"
)
Z_itl
=
griddata
((
x_kv_usage
,
y_context_length
),
z_itl
,
(
X
,
Y
),
method
=
"cubic"
)
Z_thpt
=
griddata
(
(
x_kv_usage
,
y_context_length
),
z_thpt_per_gpu
,
(
X
,
Y
),
method
=
"cubic"
)
# Plot ITL surface
fig
=
plt
.
figure
(
figsize
=
(
12
,
10
))
ax
=
fig
.
add_subplot
(
111
,
projection
=
"3d"
)
# type: ignore
...
...
@@ -193,7 +189,7 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
surf
=
ax
.
plot_surface
(
# type: ignore
X
,
Y
,
Z
,
Z
_itl
,
cmap
=
cm
.
coolwarm
,
# type: ignore
linewidth
=
0.2
,
antialiased
=
True
,
...
...
@@ -202,20 +198,57 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
# Add a color bar with custom settings
cbar
=
fig
.
colorbar
(
surf
,
ax
=
ax
,
shrink
=
0.5
,
aspect
=
5
)
cbar
.
set_label
(
"
Z Value
"
,
fontsize
=
12
)
cbar
.
set_label
(
"
ITL (ms)
"
,
fontsize
=
12
)
cbar
.
ax
.
tick_params
(
labelsize
=
10
)
# Add labels with custom font sizes
ax
.
set_xlabel
(
"Active KV Percentage"
,
fontsize
=
12
)
ax
.
set_ylabel
(
"Decode Context Length"
,
fontsize
=
12
)
ax
.
set_zlabel
(
"ITL"
,
fontsize
=
12
)
# type: ignore
ax
.
set_title
(
"Decode ITL Interpolation"
,
fontsize
=
14
)
# Set viewing angle
ax
.
view_init
(
elev
=
30
,
azim
=
45
)
# type: ignore
ax
.
grid
(
True
)
ax
.
tick_params
(
axis
=
"both"
,
which
=
"major"
,
labelsize
=
10
)
plot_path
=
f
"
{
work_dir
}
/decode_
tp
{
tp_size
}
.png"
plot_path
=
f
"
{
work_dir
}
/decode_
itl_interpolation
.png"
logger
.
info
(
f
"Saving ITL surface plot to
{
plot_path
}
"
)
plt
.
savefig
(
plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
close
()
# Plot Throughput surface
fig
=
plt
.
figure
(
figsize
=
(
12
,
10
))
ax
=
fig
.
add_subplot
(
111
,
projection
=
"3d"
)
# type: ignore
# Create the throughput surface plot with customizations
surf
=
ax
.
plot_surface
(
# type: ignore
X
,
Y
,
Z_thpt
,
cmap
=
cm
.
viridis
,
# type: ignore
linewidth
=
0.2
,
antialiased
=
True
,
alpha
=
0.8
,
)
# Add a color bar with custom settings
cbar
=
fig
.
colorbar
(
surf
,
ax
=
ax
,
shrink
=
0.5
,
aspect
=
5
)
cbar
.
set_label
(
"Throughput per GPU (tokens/s/GPU)"
,
fontsize
=
12
)
cbar
.
ax
.
tick_params
(
labelsize
=
10
)
# Add labels with custom font sizes
ax
.
set_xlabel
(
"Active KV Percentage"
,
fontsize
=
12
)
ax
.
set_ylabel
(
"Decode Context Length"
,
fontsize
=
12
)
ax
.
set_zlabel
(
"Throughput per GPU"
,
fontsize
=
12
)
# type: ignore
ax
.
set_title
(
"Decode Throughput Interpolation"
,
fontsize
=
14
)
# Set viewing angle
ax
.
view_init
(
elev
=
30
,
azim
=
45
)
# type: ignore
ax
.
grid
(
True
)
ax
.
tick_params
(
axis
=
"both"
,
which
=
"major"
,
labelsize
=
10
)
thpt_plot_path
=
f
"
{
work_dir
}
/decode_throughput_interpolation.png"
logger
.
info
(
f
"Saving throughput surface plot to
{
thpt_plot_path
}
"
)
plt
.
savefig
(
thpt_plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
close
()
benchmarks/profiler/utils/profile_decode.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
logging
import
numpy
as
np
from
utils.genai_perf
import
benchmark_decode
from
utils.plot
import
plot_decode_3d_surface
# Module-level logger that writes INFO-and-above records to the console
# with a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_decode(
    work_dir,
    model_name,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
):
    """Interpolate ITL - Active_KV_Cache - Decode_Context_Length.

    Sweeps input sequence length (ISL) and number of concurrent requests,
    calls ``benchmark_decode`` for each combination, stores the collected
    points in ``{work_dir}/raw_data.npz`` and hands them to
    ``plot_decode_3d_surface``.

    Args:
        work_dir: Directory for benchmark artifacts, ``raw_data.npz`` and plots.
        model_name: Model name forwarded to ``benchmark_decode``.
        url: Endpoint base URL, forwarded as ``base_url``.
        num_gpus: Divisor used to turn output-token throughput into per-GPU
            throughput.
        max_kv_tokens: KV-token capacity; bounds the concurrency sweep and
            normalizes KV usage.
        max_context_length: Upper bound (minus the fixed OSL) of the ISL sweep.
        interpolation_granularity: Divisor used to derive both sweep steps.

    Returns:
        None.
    """
    x_kv_usage = []
    y_context_length = []
    z_itl = []
    z_thpt_per_gpu = []

    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement

    # Floor division can give a step of 0 (range() raises ValueError) or a
    # negative one (range() would count downward into invalid ISLs) when the
    # span is smaller than the granularity; clamp to a forward step of >= 1.
    isl_step = max(1, (max_context_length - osl) // interpolation_granularity)
    for isl in range(100, max_context_length - osl, isl_step):
        max_concurrency = max_kv_tokens // (isl + osl)
        # Same clamp: max_concurrency below the granularity gives a 0 step.
        request_step = max(1, max_concurrency // interpolation_granularity)
        for num_request in range(1, max_concurrency, request_step):
            genai_perf_artifact_dir = (
                f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
            )
            gap_result = benchmark_decode(
                isl,
                osl,
                num_request,
                genai_perf_artifact_dir,
                model_name,
                base_url=url,
            )
            if gap_result is None:
                # No result for this point; skip it.
                continue

            itl = gap_result["inter_token_latency"]["avg"]
            # isl + osl / 2 is used as the context length of each sample.
            context_length = isl + osl / 2
            x_kv_usage.append(context_length * num_request / max_kv_tokens)
            y_context_length.append(context_length)
            z_itl.append(itl)
            z_thpt_per_gpu.append(
                gap_result["output_token_throughput"]["avg"] / num_gpus
            )

    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        x_kv_usage=np.array(x_kv_usage),
        y_context_length=np.array(y_context_length),
        z_itl=np.array(z_itl),
        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
        max_kv_tokens=np.array([max_kv_tokens]),
    )
    logger.info(f"Saved data points to {save_path}")

    if not z_itl:
        # The plot helper takes min()/max() of these lists, which raises on
        # empty input; nothing to plot when no benchmark produced a result.
        logger.warning("No decode data points collected; skipping surface plots")
        return

    # Plot 3D surface
    plot_decode_3d_surface(
        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
    )

    return
benchmarks/profiler/utils/profile_prefill.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
logging
import
numpy
as
np
from
utils.genai_perf
import
benchmark_prefill
from
utils.plot
import
plot_prefill_interpolation
# Module-level logger that writes INFO-and-above records to the console
# with a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_prefill(
    work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
):
    """Profile prefill TTFT and per-GPU throughput across input sequence lengths.

    Sweeps ISL from 100 up to ``max_context_length``, calls
    ``benchmark_prefill`` for each value and, when more than two points were
    collected, saves them to ``{work_dir}/raw_data.npz`` and calls
    ``plot_prefill_interpolation``.

    Args:
        work_dir: Directory for benchmark artifacts, ``raw_data.npz`` and plots.
        model_name: Model name forwarded to ``benchmark_prefill``.
        url: Endpoint base URL, forwarded as ``base_url``.
        num_gpus: Divisor used to compute per-GPU throughput.
        max_context_length: Exclusive upper bound of the ISL sweep.
        interpolation_granularity: Divisor used to derive the sweep step.

    Returns:
        None.
    """
    prefill_isl = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []

    # Floor division can give a step of 0 (range() raises ValueError) or a
    # negative one (range() would count downward from 100) when the span is
    # smaller than the granularity; clamp to a forward step of >= 1.
    isl_step = max(1, (max_context_length - 100) // interpolation_granularity)
    for isl in range(100, max_context_length, isl_step):
        # run genai-perf
        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
        gap_result = benchmark_prefill(
            isl, genai_perf_artifact_dir, model_name, base_url=url
        )
        if gap_result is None:
            # No result for this point; skip it.
            continue

        ttft = gap_result["time_to_first_token"]["avg"]
        prefill_isl.append(isl)
        prefill_ttft.append(ttft)
        # NOTE(review): the * 1000 presumably converts a TTFT in milliseconds
        # to tokens per second -- confirm against benchmark_prefill's units.
        prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)

    # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
    if len(prefill_isl) > 2:
        logger.info("Interpolating prefill TTFT and throughput vs ISL...")

        # Convert to numpy arrays for easier manipulation
        prefill_isl_np = np.array(prefill_isl)
        prefill_ttft_np = np.array(prefill_ttft)
        prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)

        save_path = f"{work_dir}/raw_data.npz"
        np.savez(
            save_path,
            prefill_isl=prefill_isl_np,
            prefill_ttft=prefill_ttft_np,
            prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
        )

        # Call the plotting function
        plot_prefill_interpolation(
            prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
        )
    else:
        logger.warning(
            "Not enough data points to perform interpolation (need at least 3 points)"
        )

    return
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment