Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
fd358991
Unverified
Commit
fd358991
authored
Aug 11, 2025
by
Hongkuan Zhou
Committed by
GitHub
Aug 11, 2025
Browse files
feat: standalone profiling script for a given endpoint (#2386)
parent
dabd2267
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
326 additions
and
121 deletions
+326
-121
benchmarks/profiler/profile_endpoint.py
benchmarks/profiler/profile_endpoint.py
+100
-0
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+22
-104
benchmarks/profiler/utils/plot.py
benchmarks/profiler/utils/plot.py
+50
-17
benchmarks/profiler/utils/profile_decode.py
benchmarks/profiler/utils/profile_decode.py
+85
-0
benchmarks/profiler/utils/profile_prefill.py
benchmarks/profiler/utils/profile_prefill.py
+69
-0
No files found.
benchmarks/profiler/profile_endpoint.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
argparse
import
logging
import
os
from
utils.profile_prefill
import
profile_prefill
from
benchmarks.profiler.utils.profile_decode
import
profile_decode
# Module-level logging: INFO-level records from this module go to a stream
# handler using a timestamped "<time> - <name> - <level> - <message>" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)
def _build_parser():
    """Build the command-line parser for the standalone endpoint profiler."""
    parser = argparse.ArgumentParser(
        description="profile a given endpoint's performance for prefill or decode"
    )
    parser.add_argument(
        "--mode",
        type=str,
        required=True,
        choices=["prefill", "decode"],
        help="mode to profile",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="model name",
    )
    parser.add_argument(
        "--url",
        type=str,
        required=True,
        help="base url of the endpoint",
    )
    parser.add_argument(
        "--num_gpus",
        type=int,
        required=True,
        help="number of gpus",
    )
    parser.add_argument(
        "--max_kv_tokens",
        type=int,
        required=False,
        default=0,
        help="max kv tokens of the endpoint (only used for decode)",
    )
    parser.add_argument(
        "--work_dir",
        type=str,
        default="endpoint_profiling_results/",
        help="work directory to save the results",
    )
    parser.add_argument(
        "--max_context_length",
        type=int,
        default=16384,
        help="max context length of the endpoint",
    )
    parser.add_argument(
        "--interpolation_granularity",
        type=int,
        default=8,
        help="interpolation granularity for the results",
    )
    return parser


def main(argv=None):
    """Parse CLI arguments and run the requested profiling mode.

    Args:
        argv: optional list of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]`` (the behavior when run as a script).

    Raises:
        SystemExit: raised by argparse (exit status 2) when the arguments are
            missing or invalid, including a non-positive ``--max_kv_tokens``
            in decode mode.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate through the parser rather than `assert`: assertions are removed
    # under `python -O`, and parser.error() prints the usage text to the user.
    # This runs before the work directory is created so that an invalid
    # invocation leaves nothing behind on disk.
    if args.mode == "decode" and args.max_kv_tokens <= 0:
        parser.error("max_kv_tokens must be provided for decode")
    if args.interpolation_granularity < 1:
        parser.error("interpolation_granularity must be a positive integer")

    os.makedirs(args.work_dir, exist_ok=True)

    if args.mode == "prefill":
        profile_prefill(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_context_length,
            args.interpolation_granularity,
        )
    elif args.mode == "decode":
        profile_decode(
            args.work_dir,
            args.model_name,
            args.url,
            args.num_gpus,
            args.max_kv_tokens,
            args.max_context_length,
            args.interpolation_granularity,
        )
    else:
        # Defensive only: `choices` on --mode already rejects other values.
        raise ValueError(f"Invalid mode: {args.mode}")


if __name__ == "__main__":
    main()
benchmarks/profiler/profile_sla.py
View file @
fd358991
...
...
@@ -28,18 +28,16 @@ from utils.dynamo_deployment import (
cleanup_remaining_deployments
,
)
from
utils.genai_perf
import
benchmark_decode
,
benchmark_prefill
from
utils.plot
import
(
plot_decode_3d_surface
,
plot_decode_performance
,
plot_prefill_interpolation
,
plot_prefill_performance
,
)
from
utils.plot
import
plot_decode_performance
,
plot_prefill_performance
from
utils.profile_cache
import
(
check_decode_results_exist
,
check_prefill_results_exist
,
load_existing_decode_results
,
load_existing_prefill_results
,
)
from
utils.profile_prefill
import
profile_prefill
from
benchmarks.profiler.utils.profile_decode
import
profile_decode
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
...
...
@@ -373,9 +371,6 @@ async def run_profile(args):
# interpolate ISL - TTFT with best prefill TP
best_prefill_tp
=
prefill_tp_size
[
selected_prefill_idx
]
prefill_isl
=
[]
prefill_ttft
=
[]
prefill_thpt_per_gpu
=
[]
logger
.
info
(
f
"Profiling prefill under best TP
{
best_prefill_tp
}
with different ISL..."
)
...
...
@@ -420,58 +415,22 @@ async def run_profile(args):
)
base_url
=
client
.
get_service_url
()
for
isl
in
range
(
100
,
profile_prefill
(
work_dir
,
model_name
,
base_url
,
best_prefill_tp
,
args
.
max_context_length
,
(
args
.
max_context_length
-
100
)
//
args
.
prefill_interpolation_granularity
,
):
# run genai-perf
genai_perf_artifact_dir
=
f
"
{
work_dir
}
/gap_isl
{
isl
}
"
gap_result
=
benchmark_prefill
(
isl
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
base_url
)
if
gap_result
is
not
None
:
ttft
=
gap_result
[
"time_to_first_token"
][
"avg"
]
prefill_isl
.
append
(
isl
)
prefill_ttft
.
append
(
ttft
)
prefill_thpt_per_gpu
.
append
(
isl
/
ttft
/
best_prefill_tp
*
1000
)
args
.
prefill_interpolation_granularity
,
)
print
(
"Cleaning up deployment..."
)
await
client
.
delete_deployment
()
deployment_clients
.
remove
(
client
)
print
(
"Deployment deleted"
)
# Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
if
len
(
prefill_isl
)
>
2
:
logger
.
info
(
"Interpolating prefill TTFT and throughput vs ISL..."
)
# Convert to numpy arrays for easier manipulation
prefill_isl_np
=
np
.
array
(
prefill_isl
)
prefill_ttft_np
=
np
.
array
(
prefill_ttft
)
prefill_thpt_per_gpu_np
=
np
.
array
(
prefill_thpt_per_gpu
)
save_path
=
f
"
{
work_dir
}
/raw_data.npz"
np
.
savez
(
save_path
,
prefill_isl
=
prefill_isl_np
,
prefill_ttft
=
prefill_ttft_np
,
prefill_thpt_per_gpu
=
prefill_thpt_per_gpu_np
,
)
# Call the plotting function
plot_prefill_interpolation
(
prefill_isl_np
,
prefill_ttft_np
,
prefill_thpt_per_gpu_np
,
work_dir
)
else
:
logger
.
warning
(
"Not enough data points to perform interpolation (need at least 3 points)"
)
# interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP
x_kv_usage
=
[]
y_context_length
=
[]
z_itl
=
[]
z_thpt_per_gpu
=
[]
best_decode_tp
=
decode_tp_size
[
selected_decode_idx
]
logger
.
info
(
f
"Profiling decode with TP size
{
best_decode_tp
}
..."
)
decode_config
=
config_modifier
.
set_config_tp_size
(
...
...
@@ -508,64 +467,23 @@ async def run_profile(args):
f
"
{
work_dir
}
/vllm-v1-agg/vllmdecodeworker/0.log"
)
osl
=
500
# not too large to reduce ITL variance, not too small to have stable measurement
base_url
=
client
.
get_service_url
()
for
isl
in
range
(
100
,
args
.
max_context_length
-
osl
,
(
args
.
max_context_length
-
osl
)
//
args
.
decode_interpolation_granularity
,
):
max_concurrency
=
max_kv_tokens
//
(
isl
+
osl
)
sweep_num_request
=
list
(
range
(
1
,
max_concurrency
,
max_concurrency
//
args
.
decode_interpolation_granularity
,
)
)
for
num_request
in
sweep_num_request
:
genai_perf_artifact_dir
=
(
f
"
{
work_dir
}
/gap_isl
{
isl
}
_osl
{
osl
}
_n
{
num_request
}
"
)
gap_result
=
benchmark_decode
(
isl
,
osl
,
num_request
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
base_url
,
)
if
gap_result
is
not
None
:
itl
=
gap_result
[
"inter_token_latency"
][
"avg"
]
x_kv_usage
.
append
((
isl
+
osl
/
2
)
*
num_request
/
max_kv_tokens
)
y_context_length
.
append
(
isl
+
osl
/
2
)
z_itl
.
append
(
itl
)
z_thpt_per_gpu
.
append
(
gap_result
[
"output_token_throughput"
][
"avg"
]
/
best_decode_tp
)
profile_decode
(
work_dir
,
model_name
,
base_url
,
best_decode_tp
,
max_kv_tokens
,
args
.
max_context_length
,
args
.
decode_interpolation_granularity
,
)
print
(
"Cleaning up deployment..."
)
await
client
.
delete_deployment
()
deployment_clients
.
remove
(
client
)
print
(
"Deployment deleted"
)
# Save the data points to a .npz file
save_path
=
f
"
{
work_dir
}
/raw_data.npz"
np
.
savez
(
save_path
,
x_kv_usage
=
np
.
array
(
x_kv_usage
),
y_context_length
=
np
.
array
(
y_context_length
),
z_itl
=
np
.
array
(
z_itl
),
z_thpt_per_gpu
=
np
.
array
(
z_thpt_per_gpu
),
max_kv_tokens
=
np
.
array
([
max_kv_tokens
]),
)
logger
.
info
(
f
"Saved data points to
{
save_path
}
"
)
# Plot 3D surface
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
best_decode_tp
,
work_dir
)
except
Exception
as
e
:
logger
.
error
(
f
"Profile job failed with error:
{
e
}
"
)
raise
...
...
benchmarks/profiler/utils/plot.py
View file @
fd358991
...
...
@@ -114,16 +114,13 @@ def plot_prefill_interpolation(
"""
# Fit quadratic functions
ttft_coeffs
=
np
.
polyfit
(
prefill_isl_np
,
prefill_ttft_np
,
2
)
thpt_coeffs
=
np
.
polyfit
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
2
)
# Create interpolation functions
ttft_poly
=
np
.
poly1d
(
ttft_coeffs
)
thpt_poly
=
np
.
poly1d
(
thpt_coeffs
)
# Generate points for smooth curves
x_interp
=
np
.
linspace
(
min
(
prefill_isl_np
),
max
(
prefill_isl_np
),
100
)
ttft_interp
=
ttft_poly
(
x_interp
)
thpt_interp
=
thpt_poly
(
x_interp
)
# Plot TTFT vs ISL
plt
.
figure
(
figsize
=
(
10
,
6
))
...
...
@@ -148,14 +145,7 @@ def plot_prefill_interpolation(
# Plot Throughput vs ISL
plt
.
figure
(
figsize
=
(
10
,
6
))
plt
.
scatter
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
s
=
100
,
label
=
"Measured data"
)
plt
.
plot
(
x_interp
,
thpt_interp
,
"g-"
,
label
=
f
"Quadratic fit:
{
thpt_coeffs
[
0
]:.
2
e
}
x² +
{
thpt_coeffs
[
1
]:.
2
e
}
x +
{
thpt_coeffs
[
2
]:.
2
e
}
"
,
)
plt
.
scatter
(
prefill_isl_np
,
prefill_thpt_per_gpu_np
,
s
=
100
,
label
=
"Throughput/GPU"
)
plt
.
title
(
"Prefill Throughput vs Input Sequence Length"
)
plt
.
xlabel
(
"Input Sequence Length (tokens)"
)
plt
.
ylabel
(
"Prefill throughput per GPU (tokens/s/GPU)"
)
...
...
@@ -170,7 +160,9 @@ def plot_prefill_interpolation(
plt
.
close
()
def
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
tp_size
,
work_dir
):
def
plot_decode_3d_surface
(
x_kv_usage
,
y_context_length
,
z_itl
,
z_thpt_per_gpu
,
work_dir
):
"""
Plot 3D surface for decode interpolation with KV usage, context length, and ITL.
...
...
@@ -178,14 +170,18 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
x_kv_usage: list of KV usage percentages
y_context_length: list of context lengths
z_itl: list of ITL values
tp_size: TP size for the plot filename
z_thpt_per_gpu: list of throughput per GPU values
work_dir: directory to save the plot
"""
xi
=
np
.
linspace
(
min
(
x_kv_usage
),
max
(
x_kv_usage
),
100
)
yi
=
np
.
linspace
(
min
(
y_context_length
),
max
(
y_context_length
),
100
)
X
,
Y
=
np
.
meshgrid
(
xi
,
yi
)
Z
=
griddata
((
x_kv_usage
,
y_context_length
),
z_itl
,
(
X
,
Y
),
method
=
"cubic"
)
Z_itl
=
griddata
((
x_kv_usage
,
y_context_length
),
z_itl
,
(
X
,
Y
),
method
=
"cubic"
)
Z_thpt
=
griddata
(
(
x_kv_usage
,
y_context_length
),
z_thpt_per_gpu
,
(
X
,
Y
),
method
=
"cubic"
)
# Plot ITL surface
fig
=
plt
.
figure
(
figsize
=
(
12
,
10
))
ax
=
fig
.
add_subplot
(
111
,
projection
=
"3d"
)
# type: ignore
...
...
@@ -193,7 +189,7 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
surf
=
ax
.
plot_surface
(
# type: ignore
X
,
Y
,
Z
,
Z
_itl
,
cmap
=
cm
.
coolwarm
,
# type: ignore
linewidth
=
0.2
,
antialiased
=
True
,
...
...
@@ -202,20 +198,57 @@ def plot_decode_3d_surface(x_kv_usage, y_context_length, z_itl, tp_size, work_di
# Add a color bar with custom settings
cbar
=
fig
.
colorbar
(
surf
,
ax
=
ax
,
shrink
=
0.5
,
aspect
=
5
)
cbar
.
set_label
(
"
Z Value
"
,
fontsize
=
12
)
cbar
.
set_label
(
"
ITL (ms)
"
,
fontsize
=
12
)
cbar
.
ax
.
tick_params
(
labelsize
=
10
)
# Add labels with custom font sizes
ax
.
set_xlabel
(
"Active KV Percentage"
,
fontsize
=
12
)
ax
.
set_ylabel
(
"Decode Context Length"
,
fontsize
=
12
)
ax
.
set_zlabel
(
"ITL"
,
fontsize
=
12
)
# type: ignore
ax
.
set_title
(
"Decode ITL Interpolation"
,
fontsize
=
14
)
# Set viewing angle
ax
.
view_init
(
elev
=
30
,
azim
=
45
)
# type: ignore
ax
.
grid
(
True
)
ax
.
tick_params
(
axis
=
"both"
,
which
=
"major"
,
labelsize
=
10
)
plot_path
=
f
"
{
work_dir
}
/decode_
tp
{
tp_size
}
.png"
plot_path
=
f
"
{
work_dir
}
/decode_
itl_interpolation
.png"
logger
.
info
(
f
"Saving ITL surface plot to
{
plot_path
}
"
)
plt
.
savefig
(
plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
close
()
# Plot Throughput surface
fig
=
plt
.
figure
(
figsize
=
(
12
,
10
))
ax
=
fig
.
add_subplot
(
111
,
projection
=
"3d"
)
# type: ignore
# Create the throughput surface plot with customizations
surf
=
ax
.
plot_surface
(
# type: ignore
X
,
Y
,
Z_thpt
,
cmap
=
cm
.
viridis
,
# type: ignore
linewidth
=
0.2
,
antialiased
=
True
,
alpha
=
0.8
,
)
# Add a color bar with custom settings
cbar
=
fig
.
colorbar
(
surf
,
ax
=
ax
,
shrink
=
0.5
,
aspect
=
5
)
cbar
.
set_label
(
"Throughput per GPU (tokens/s/GPU)"
,
fontsize
=
12
)
cbar
.
ax
.
tick_params
(
labelsize
=
10
)
# Add labels with custom font sizes
ax
.
set_xlabel
(
"Active KV Percentage"
,
fontsize
=
12
)
ax
.
set_ylabel
(
"Decode Context Length"
,
fontsize
=
12
)
ax
.
set_zlabel
(
"Throughput per GPU"
,
fontsize
=
12
)
# type: ignore
ax
.
set_title
(
"Decode Throughput Interpolation"
,
fontsize
=
14
)
# Set viewing angle
ax
.
view_init
(
elev
=
30
,
azim
=
45
)
# type: ignore
ax
.
grid
(
True
)
ax
.
tick_params
(
axis
=
"both"
,
which
=
"major"
,
labelsize
=
10
)
thpt_plot_path
=
f
"
{
work_dir
}
/decode_throughput_interpolation.png"
logger
.
info
(
f
"Saving throughput surface plot to
{
thpt_plot_path
}
"
)
plt
.
savefig
(
thpt_plot_path
,
dpi
=
300
,
bbox_inches
=
"tight"
)
plt
.
close
()
benchmarks/profiler/utils/profile_decode.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
logging
import
numpy
as
np
from
utils.genai_perf
import
benchmark_decode
from
utils.plot
import
plot_decode_3d_surface
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_decode(
    work_dir,
    model_name,
    url,
    num_gpus,
    max_kv_tokens,
    max_context_length,
    interpolation_granularity,
):
    """Interpolate ITL - Active_KV_Cache - Decode_Context_Length.

    Sweeps input sequence length (ISL) and request concurrency against the
    endpoint via ``benchmark_decode``, stores the collected points in
    ``{work_dir}/raw_data.npz`` and hands them to ``plot_decode_3d_surface``.

    Args:
        work_dir: directory receiving the benchmark artifacts, the ``.npz``
            file and the plots.
        model_name: model name forwarded to ``benchmark_decode``.
        url: base URL of the endpoint, forwarded as ``base_url``.
        num_gpus: divisor applied to the measured output token throughput to
            obtain a per-GPU figure.
        max_kv_tokens: KV-cache capacity in tokens; bounds the concurrency
            sweep and normalizes the recorded KV usage.
        max_context_length: upper bound for ``isl + osl`` in the sweep.
        interpolation_granularity: number of steps the ISL range and the
            concurrency range are each divided into.

    Returns:
        None.
    """
    x_kv_usage = []
    y_context_length = []
    z_itl = []
    z_thpt_per_gpu = []

    osl = 500  # not too large to reduce ITL variance, not too small to have stable measurement

    # range() raises ValueError for a step of 0 and walks downward for a
    # negative step, so each step is clamped to at least 1. Ranges that are
    # too small for the requested granularity then simply yield few or no
    # points instead of crashing or overshooting the intended bound.
    isl_step = max(1, (max_context_length - osl) // interpolation_granularity)
    for isl in range(100, max_context_length - osl, isl_step):
        # Largest number of concurrent requests whose full sequences
        # (isl + osl tokens each) fit into the KV cache.
        max_concurrency = max_kv_tokens // (isl + osl)
        concurrency_step = max(1, max_concurrency // interpolation_granularity)
        for num_request in range(1, max_concurrency, concurrency_step):
            genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}"
            gap_result = benchmark_decode(
                isl,
                osl,
                num_request,
                genai_perf_artifact_dir,
                model_name,
                base_url=url,
            )
            if gap_result is None:
                continue
            itl = gap_result["inter_token_latency"]["avg"]
            # isl + osl / 2 is the context length at the midpoint of the
            # decode, used as the representative value for the whole run.
            x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens)
            y_context_length.append(isl + osl / 2)
            z_itl.append(itl)
            z_thpt_per_gpu.append(
                gap_result["output_token_throughput"]["avg"] / num_gpus
            )

    # Save the data points to a .npz file
    save_path = f"{work_dir}/raw_data.npz"
    np.savez(
        save_path,
        x_kv_usage=np.array(x_kv_usage),
        y_context_length=np.array(y_context_length),
        z_itl=np.array(z_itl),
        z_thpt_per_gpu=np.array(z_thpt_per_gpu),
        max_kv_tokens=np.array([max_kv_tokens]),
    )
    logger.info(f"Saved data points to {save_path}")

    if not z_itl:
        # Nothing to plot: the plotting helper takes min()/max() of these
        # lists and would fail on empty input.
        logger.warning("No decode data points collected; skipping surface plots")
        return

    # Plot 3D surface
    plot_decode_3d_surface(
        x_kv_usage, y_context_length, z_itl, z_thpt_per_gpu, work_dir
    )
benchmarks/profiler/utils/profile_prefill.py
0 → 100644
View file @
fd358991
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import
logging
import
numpy
as
np
from
utils.genai_perf
import
benchmark_prefill
from
utils.plot
import
plot_prefill_interpolation
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s", "%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)


def profile_prefill(
    work_dir, model_name, url, num_gpus, max_context_length, interpolation_granularity
):
    """Profile prefill TTFT and per-GPU throughput across input lengths.

    Sweeps the input sequence length (ISL) from 100 up to
    ``max_context_length``, benchmarks each point with ``benchmark_prefill``
    and, when more than two points were collected, saves them to
    ``{work_dir}/raw_data.npz`` and calls ``plot_prefill_interpolation``.

    Args:
        work_dir: directory receiving the benchmark artifacts, the ``.npz``
            file and the plots.
        model_name: model name forwarded to ``benchmark_prefill``.
        url: base URL of the endpoint, forwarded as ``base_url``.
        num_gpus: divisor used to turn throughput into a per-GPU figure.
        max_context_length: exclusive upper bound of the ISL sweep.
        interpolation_granularity: number of steps the ISL range is divided
            into.

    Returns:
        None.
    """
    prefill_isl = []
    prefill_ttft = []
    prefill_thpt_per_gpu = []

    # range() raises ValueError for a step of 0 and walks downward for a
    # negative step, so the step is clamped to at least 1. A context window
    # too small for the requested granularity then yields few or no points
    # instead of crashing or probing lengths beyond max_context_length.
    isl_step = max(1, (max_context_length - 100) // interpolation_granularity)
    for isl in range(100, max_context_length, isl_step):
        # run genai-perf
        genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}"
        gap_result = benchmark_prefill(
            isl, genai_perf_artifact_dir, model_name, base_url=url
        )
        if gap_result is None:
            continue
        ttft = gap_result["time_to_first_token"]["avg"]
        prefill_isl.append(isl)
        prefill_ttft.append(ttft)
        # NOTE(review): the factor of 1000 presumably converts a TTFT in
        # milliseconds into tokens per second -- confirm against genai-perf.
        prefill_thpt_per_gpu.append(isl / ttft / num_gpus * 1000)

    # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c)
    if len(prefill_isl) > 2:
        logger.info("Interpolating prefill TTFT and throughput vs ISL...")

        # Convert to numpy arrays for easier manipulation
        prefill_isl_np = np.array(prefill_isl)
        prefill_ttft_np = np.array(prefill_ttft)
        prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu)

        save_path = f"{work_dir}/raw_data.npz"
        np.savez(
            save_path,
            prefill_isl=prefill_isl_np,
            prefill_ttft=prefill_ttft_np,
            prefill_thpt_per_gpu=prefill_thpt_per_gpu_np,
        )

        # Call the plotting function
        plot_prefill_interpolation(
            prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir
        )
    else:
        logger.warning(
            "Not enough data points to perform interpolation (need at least 3 points)"
        )
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment