Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
norm
vllm
Commits
e74b1736
".github/vscode:/vscode.git/clone" did not exist on "3c85a57297b22df8921bae39c0a2e3982ee69de7"
Unverified
Commit
e74b1736
authored
Nov 29, 2023
by
Woosuk Kwon
Committed by
GitHub
Nov 29, 2023
Browse files
Add profile option to latency benchmark script (#1839)
parent
f07c1cea
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
25 additions
and
14 deletions
+25
-14
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+25
-14
No files found.
benchmarks/benchmark_latency.py
View file @
e74b1736
...
@@ -12,7 +12,6 @@ from vllm import LLM, SamplingParams
...
@@ -12,7 +12,6 @@ from vllm import LLM, SamplingParams
def
main
(
args
:
argparse
.
Namespace
):
def
main
(
args
:
argparse
.
Namespace
):
print
(
args
)
print
(
args
)
# Process all the requests in a single batch if possible.
# NOTE(woosuk): If the request cannot be processed in a single batch,
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
# the engine will automatically process the request in multiple batches.
llm
=
LLM
(
llm
=
LLM
(
...
@@ -21,7 +20,6 @@ def main(args: argparse.Namespace):
...
@@ -21,7 +20,6 @@ def main(args: argparse.Namespace):
quantization
=
args
.
quantization
,
quantization
=
args
.
quantization
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
tensor_parallel_size
=
args
.
tensor_parallel_size
,
max_num_seqs
=
args
.
batch_size
,
max_num_seqs
=
args
.
batch_size
,
max_num_batched_tokens
=
args
.
batch_size
*
args
.
input_len
,
trust_remote_code
=
args
.
trust_remote_code
,
trust_remote_code
=
args
.
trust_remote_code
,
dtype
=
args
.
dtype
,
dtype
=
args
.
dtype
,
)
)
...
@@ -39,22 +37,31 @@ def main(args: argparse.Namespace):
...
@@ -39,22 +37,31 @@ def main(args: argparse.Namespace):
def run_to_completion(profile: bool = False):
    """Execute one generation pass over the dummy prompts.

    With ``profile=False`` the pass is timed with ``time.perf_counter`` and
    the elapsed latency in seconds is returned.  With ``profile=True`` the
    pass instead runs under ``torch.profiler`` and the aggregated per-op
    averages are printed; nothing is returned in that branch.

    NOTE(review): relies on ``llm``, ``dummy_prompt_token_ids`` and
    ``sampling_params`` from the enclosing ``main`` scope.
    """
    if profile:
        # Record both CPU-side dispatch and CUDA kernel activity.
        with torch.profiler.profile(activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
        ]) as p:
            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                         sampling_params=sampling_params,
                         use_tqdm=False)
        print(p.key_averages())
    else:
        start_time = time.perf_counter()
        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                     sampling_params=sampling_params,
                     use_tqdm=False)
        end_time = time.perf_counter()
        latency = end_time - start_time
        return latency
print
(
"Warming up..."
)
print
(
"Warming up..."
)
run_to_completion
(
profile
=
False
)
run_to_completion
(
profile
=
False
)
if
args
.
profile
:
print
(
"Profiling..."
)
run_to_completion
(
profile
=
True
)
return
# Benchmark.
# Benchmark.
latencies
=
[]
latencies
=
[]
for
_
in
tqdm
(
range
(
args
.
num_iters
),
desc
=
"Profiling iterations"
):
for
_
in
tqdm
(
range
(
args
.
num_iters
),
desc
=
"Profiling iterations"
):
...
@@ -97,5 +104,9 @@ if __name__ == '__main__':
...
@@ -97,5 +104,9 @@ if __name__ == '__main__':
'The "auto" option will use FP16 precision '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.'
)
'for BF16 models.'
)
parser
.
add_argument
(
'--profile'
,
action
=
'store_true'
,
help
=
'profile the generation process of a single batch'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment