Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
774c5fde
Unverified
Commit
774c5fde
authored
May 27, 2025
by
Divakar Verma
Committed by
GitHub
May 28, 2025
Browse files
[V1] fix torch profiling for V1 offline scenarios (#18445)
Signed-off-by:
Divakar Verma
<
divakar.verma@amd.com
>
parent
9a21e331
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
23 additions
and
51 deletions
+23
-51
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+10
-27
vllm/benchmarks/latency.py
vllm/benchmarks/latency.py
+9
-24
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+2
-0
vllm/worker/worker.py
vllm/worker/worker.py
+2
-0
No files found.
benchmarks/benchmark_latency.py
View file @
774c5fde
...
@@ -6,13 +6,12 @@ import dataclasses
...
@@ -6,13 +6,12 @@ import dataclasses
import
json
import
json
import
os
import
os
import
time
import
time
from
pathlib
import
Path
from
typing
import
Any
,
Optional
from
typing
import
Any
,
Optional
import
numpy
as
np
import
numpy
as
np
import
torch
from
tqdm
import
tqdm
from
tqdm
import
tqdm
import
vllm.envs
as
envs
from
benchmark_utils
import
convert_to_pytorch_benchmark_format
,
write_to_json
from
benchmark_utils
import
convert_to_pytorch_benchmark_format
,
write_to_json
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.arg_utils
import
EngineArgs
...
@@ -80,17 +79,9 @@ def main(args: argparse.Namespace):
...
@@ -80,17 +79,9 @@ def main(args: argparse.Namespace):
def
run_to_completion
(
profile_dir
:
Optional
[
str
]
=
None
):
def
run_to_completion
(
profile_dir
:
Optional
[
str
]
=
None
):
if
profile_dir
:
if
profile_dir
:
with
torch
.
profiler
.
profile
(
llm
.
start_profile
()
activities
=
[
llm_generate
()
torch
.
profiler
.
ProfilerActivity
.
CPU
,
llm
.
stop_profile
()
torch
.
profiler
.
ProfilerActivity
.
CUDA
,
],
on_trace_ready
=
torch
.
profiler
.
tensorboard_trace_handler
(
str
(
profile_dir
)
),
)
as
p
:
llm_generate
()
print
(
p
.
key_averages
().
table
(
sort_by
=
"self_cuda_time_total"
))
else
:
else
:
start_time
=
time
.
perf_counter
()
start_time
=
time
.
perf_counter
()
llm_generate
()
llm_generate
()
...
@@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
...
@@ -103,11 +94,7 @@ def main(args: argparse.Namespace):
run_to_completion
(
profile_dir
=
None
)
run_to_completion
(
profile_dir
=
None
)
if
args
.
profile
:
if
args
.
profile
:
profile_dir
=
args
.
profile_result_dir
profile_dir
=
envs
.
VLLM_TORCH_PROFILER_DIR
if
not
profile_dir
:
profile_dir
=
(
Path
(
"."
)
/
"vllm_benchmark_result"
/
f
"latency_result_
{
time
.
time
()
}
"
)
print
(
f
"Profiling (results will be saved to '
{
profile_dir
}
')..."
)
print
(
f
"Profiling (results will be saved to '
{
profile_dir
}
')..."
)
run_to_completion
(
profile_dir
=
profile_dir
)
run_to_completion
(
profile_dir
=
profile_dir
)
return
return
...
@@ -164,15 +151,6 @@ if __name__ == "__main__":
...
@@ -164,15 +151,6 @@ if __name__ == "__main__":
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"profile the generation process of a single batch"
,
help
=
"profile the generation process of a single batch"
,
)
)
parser
.
add_argument
(
"--profile-result-dir"
,
type
=
str
,
default
=
None
,
help
=
(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser
.
add_argument
(
parser
.
add_argument
(
"--output-json"
,
"--output-json"
,
type
=
str
,
type
=
str
,
...
@@ -193,4 +171,9 @@ if __name__ == "__main__":
...
@@ -193,4 +171,9 @@ if __name__ == "__main__":
# numbers. We need to disable prefix caching by default.
# numbers. We need to disable prefix caching by default.
parser
.
set_defaults
(
enable_prefix_caching
=
False
)
parser
.
set_defaults
(
enable_prefix_caching
=
False
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
profile
and
not
envs
.
VLLM_TORCH_PROFILER_DIR
:
raise
OSError
(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
main
(
args
)
main
(
args
)
vllm/benchmarks/latency.py
View file @
774c5fde
...
@@ -6,13 +6,12 @@ import dataclasses
...
@@ -6,13 +6,12 @@ import dataclasses
import
json
import
json
import
os
import
os
import
time
import
time
from
pathlib
import
Path
from
typing
import
Any
,
Optional
from
typing
import
Any
,
Optional
import
numpy
as
np
import
numpy
as
np
import
torch
from
tqdm
import
tqdm
from
tqdm
import
tqdm
import
vllm.envs
as
envs
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
vllm.benchmarks.utils
import
(
convert_to_pytorch_benchmark_format
,
from
vllm.benchmarks.utils
import
(
convert_to_pytorch_benchmark_format
,
write_to_json
)
write_to_json
)
...
@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"profile the generation process of a single batch"
,
help
=
"profile the generation process of a single batch"
,
)
)
parser
.
add_argument
(
"--profile-result-dir"
,
type
=
str
,
default
=
None
,
help
=
(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."
),
)
parser
.
add_argument
(
parser
.
add_argument
(
"--output-json"
,
"--output-json"
,
type
=
str
,
type
=
str
,
...
@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
def
main
(
args
:
argparse
.
Namespace
):
def
main
(
args
:
argparse
.
Namespace
):
print
(
args
)
print
(
args
)
if
args
.
profile
and
not
envs
.
VLLM_TORCH_PROFILER_DIR
:
raise
OSError
(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler."
)
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
engine_args
=
EngineArgs
.
from_cli_args
(
args
)
# NOTE(woosuk): If the request cannot be processed in a single batch,
# NOTE(woosuk): If the request cannot be processed in a single batch,
...
@@ -131,16 +126,9 @@ def main(args: argparse.Namespace):
...
@@ -131,16 +126,9 @@ def main(args: argparse.Namespace):
def
run_to_completion
(
profile_dir
:
Optional
[
str
]
=
None
):
def
run_to_completion
(
profile_dir
:
Optional
[
str
]
=
None
):
if
profile_dir
:
if
profile_dir
:
with
torch
.
profiler
.
profile
(
llm
.
start_profile
()
activities
=
[
llm_generate
()
torch
.
profiler
.
ProfilerActivity
.
CPU
,
llm
.
stop_profile
()
torch
.
profiler
.
ProfilerActivity
.
CUDA
,
],
on_trace_ready
=
torch
.
profiler
.
tensorboard_trace_handler
(
str
(
profile_dir
)),
)
as
p
:
llm_generate
()
print
(
p
.
key_averages
().
table
(
sort_by
=
"self_cuda_time_total"
))
else
:
else
:
start_time
=
time
.
perf_counter
()
start_time
=
time
.
perf_counter
()
llm_generate
()
llm_generate
()
...
@@ -153,10 +141,7 @@ def main(args: argparse.Namespace):
...
@@ -153,10 +141,7 @@ def main(args: argparse.Namespace):
run_to_completion
(
profile_dir
=
None
)
run_to_completion
(
profile_dir
=
None
)
if
args
.
profile
:
if
args
.
profile
:
profile_dir
=
args
.
profile_result_dir
profile_dir
=
envs
.
VLLM_TORCH_PROFILER_DIR
if
not
profile_dir
:
profile_dir
=
(
Path
(
"."
)
/
"vllm_benchmark_result"
/
f
"latency_result_
{
time
.
time
()
}
"
)
print
(
f
"Profiling (results will be saved to '
{
profile_dir
}
')..."
)
print
(
f
"Profiling (results will be saved to '
{
profile_dir
}
')..."
)
run_to_completion
(
profile_dir
=
profile_dir
)
run_to_completion
(
profile_dir
=
profile_dir
)
return
return
...
...
vllm/v1/worker/gpu_worker.py
View file @
774c5fde
...
@@ -292,6 +292,8 @@ class Worker(WorkerBase):
...
@@ -292,6 +292,8 @@ class Worker(WorkerBase):
self
.
profiler
.
start
()
self
.
profiler
.
start
()
else
:
else
:
self
.
profiler
.
stop
()
self
.
profiler
.
stop
()
print
(
self
.
profiler
.
key_averages
().
table
(
sort_by
=
"self_cuda_time_total"
))
def
execute_dummy_batch
(
self
)
->
None
:
def
execute_dummy_batch
(
self
)
->
None
:
self
.
model_runner
.
_dummy_run
(
1
)
self
.
model_runner
.
_dummy_run
(
1
)
...
...
vllm/worker/worker.py
View file @
774c5fde
...
@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase):
...
@@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase):
if
self
.
profiler
is
None
:
if
self
.
profiler
is
None
:
raise
RuntimeError
(
"Profiler is not enabled."
)
raise
RuntimeError
(
"Profiler is not enabled."
)
self
.
profiler
.
stop
()
self
.
profiler
.
stop
()
print
(
self
.
profiler
.
key_averages
().
table
(
sort_by
=
"self_cuda_time_total"
))
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
def
sleep
(
self
,
level
:
int
=
1
)
->
None
:
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
free_bytes_before_sleep
=
torch
.
cuda
.
mem_get_info
()[
0
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment