Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b3376e5c
Unverified
Commit
b3376e5c
authored
Jun 07, 2024
by
Benjamin Kitor
Committed by
GitHub
Jun 08, 2024
Browse files
[Misc] Add args for selecting distributed executor to benchmarks (#5335)
parent
e69ded7d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
20 additions
and
3 deletions
+20
-3
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+9
-1
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+11
-2
No files found.
benchmarks/benchmark_latency.py
View file @
b3376e5c
...
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
...
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
download_dir
=
args
.
download_dir
,
download_dir
=
args
.
download_dir
,
block_size
=
args
.
block_size
,
block_size
=
args
.
block_size
,
gpu_memory_utilization
=
args
.
gpu_memory_utilization
)
gpu_memory_utilization
=
args
.
gpu_memory_utilization
,
distributed_executor_backend
=
args
.
distributed_executor_backend
)
sampling_params
=
SamplingParams
(
sampling_params
=
SamplingParams
(
n
=
args
.
n
,
n
=
args
.
n
,
...
@@ -221,5 +222,12 @@ if __name__ == '__main__':
...
@@ -221,5 +222,12 @@ if __name__ == '__main__':
help
=
'the fraction of GPU memory to be used for '
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.'
)
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
main
(
args
)
main
(
args
)
benchmarks/benchmark_throughput.py
View file @
b3376e5c
...
@@ -78,6 +78,7 @@ def run_vllm(
...
@@ -78,6 +78,7 @@ def run_vllm(
enable_prefix_caching
:
bool
,
enable_prefix_caching
:
bool
,
enable_chunked_prefill
:
bool
,
enable_chunked_prefill
:
bool
,
max_num_batched_tokens
:
int
,
max_num_batched_tokens
:
int
,
distributed_executor_backend
:
Optional
[
str
],
gpu_memory_utilization
:
float
=
0.9
,
gpu_memory_utilization
:
float
=
0.9
,
download_dir
:
Optional
[
str
]
=
None
,
download_dir
:
Optional
[
str
]
=
None
,
)
->
float
:
)
->
float
:
...
@@ -100,6 +101,7 @@ def run_vllm(
...
@@ -100,6 +101,7 @@ def run_vllm(
download_dir
=
download_dir
,
download_dir
=
download_dir
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
)
)
# Add the requests to the engine.
# Add the requests to the engine.
...
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
...
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
gpu_memory_utilization
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
download_dir
)
args
.
gpu_memory_utilization
,
args
.
download_dir
)
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
...
@@ -368,6 +370,13 @@ if __name__ == "__main__":
...
@@ -368,6 +370,13 @@ if __name__ == "__main__":
type
=
str
,
type
=
str
,
default
=
None
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
help
=
'Path to save the throughput results in JSON format.'
)
parser
.
add_argument
(
'--distributed-executor-backend'
,
choices
=
[
'ray'
,
'mp'
],
default
=
None
,
help
=
'Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.'
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
if
args
.
tokenizer
is
None
:
if
args
.
tokenizer
is
None
:
args
.
tokenizer
=
args
.
model
args
.
tokenizer
=
args
.
model
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment