Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eee600c3
Unverified
Commit
eee600c3
authored
Dec 18, 2025
by
zhrrr
Committed by
GitHub
Dec 18, 2025
Browse files
[Misc] support nsys profile for bench latency (#29776)
Signed-off-by:
zhuhaoran
<
zhuhaoran.zhr@alibaba-inc.com
>
parent
100f93d2
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
12 deletions
+14
-12
vllm/benchmarks/latency.py
vllm/benchmarks/latency.py
+14
-12
No files found.
vllm/benchmarks/latency.py
View file @
eee600c3
...
@@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -79,10 +79,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
def
main
(
args
:
argparse
.
Namespace
):
def
main
(
args
:
argparse
.
Namespace
):
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
if
args
.
profile
and
not
engine_args
.
profiler_config
.
profiler
==
"torch"
:
raise
ValueError
(
"The torch profiler is not enabled. Please provide profiler_config."
)
# Lazy import to avoid importing LLM when the bench command is not selected.
# Lazy import to avoid importing LLM when the bench command is not selected.
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
...
@@ -125,8 +121,8 @@ def main(args: argparse.Namespace):
...
@@ -125,8 +121,8 @@ def main(args: argparse.Namespace):
),
),
)
)
def
run_to_completion
(
profile
_dir
:
str
|
None
=
Non
e
):
def
run_to_completion
(
do_
profile
:
bool
=
Fals
e
):
if
profile
_dir
:
if
do_
profile
:
llm
.
start_profile
()
llm
.
start_profile
()
llm_generate
()
llm_generate
()
llm
.
stop_profile
()
llm
.
stop_profile
()
...
@@ -139,18 +135,24 @@ def main(args: argparse.Namespace):
...
@@ -139,18 +135,24 @@ def main(args: argparse.Namespace):
print
(
"Warming up..."
)
print
(
"Warming up..."
)
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
for
_
in
tqdm
(
range
(
args
.
num_iters_warmup
),
desc
=
"Warmup iterations"
):
run_to_completion
(
profile
_dir
=
Non
e
)
run_to_completion
(
do_
profile
=
Fals
e
)
if
args
.
profile
:
if
args
.
profile
:
profile_dir
=
engine_args
.
profiler_config
.
torch_profiler_dir
profiler_config
=
engine_args
.
profiler_config
print
(
f
"Profiling (results will be saved to '
{
profile_dir
}
')..."
)
if
profiler_config
.
profiler
==
"torch"
:
run_to_completion
(
profile_dir
=
profile_dir
)
print
(
"Profiling with torch profiler (results will be saved to"
f
"
{
profiler_config
.
torch_profiler_dir
}
)..."
)
elif
profiler_config
.
profiler
==
"cuda"
:
print
(
"Profiling with cuda profiler ..."
)
run_to_completion
(
do_profile
=
True
)
return
return
# Benchmark.
# Benchmark.
latencies
=
[]
latencies
=
[]
for
_
in
tqdm
(
range
(
args
.
num_iters
),
desc
=
"
Profiling
iterations"
):
for
_
in
tqdm
(
range
(
args
.
num_iters
),
desc
=
"
Bench
iterations"
):
latencies
.
append
(
run_to_completion
(
profile
_dir
=
Non
e
))
latencies
.
append
(
run_to_completion
(
do_
profile
=
Fals
e
))
latencies
=
np
.
array
(
latencies
)
latencies
=
np
.
array
(
latencies
)
percentages
=
[
10
,
25
,
50
,
75
,
90
,
99
]
percentages
=
[
10
,
25
,
50
,
75
,
90
,
99
]
percentiles
=
np
.
percentile
(
latencies
,
percentages
)
percentiles
=
np
.
percentile
(
latencies
,
percentages
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment