Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9cbc7e5f
Unverified
Commit
9cbc7e5f
authored
Mar 05, 2024
by
Allen.Dou
Committed by
GitHub
Mar 04, 2024
Browse files
enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
Co-authored-by:
zixiao
<
shunli.dsl@alibaba-inc.com
>
parent
27a7b070
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
7 deletions
+14
-7
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+14
-7
No files found.
benchmarks/benchmark_throughput.py
View file @
9cbc7e5f
...
@@ -74,6 +74,7 @@ def run_vllm(
...
@@ -74,6 +74,7 @@ def run_vllm(
kv_cache_dtype
:
str
,
kv_cache_dtype
:
str
,
device
:
str
,
device
:
str
,
enable_prefix_caching
:
bool
,
enable_prefix_caching
:
bool
,
gpu_memory_utilization
:
float
=
0.9
,
)
->
float
:
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
model
=
model
,
llm
=
LLM
(
model
=
model
,
...
@@ -84,6 +85,7 @@ def run_vllm(
...
@@ -84,6 +85,7 @@ def run_vllm(
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
trust_remote_code
,
dtype
=
dtype
,
dtype
=
dtype
,
max_model_len
=
max_model_len
,
max_model_len
=
max_model_len
,
gpu_memory_utilization
=
gpu_memory_utilization
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
device
=
device
,
device
=
device
,
...
@@ -206,13 +208,12 @@ def main(args: argparse.Namespace):
...
@@ -206,13 +208,12 @@ def main(args: argparse.Namespace):
args
.
output_len
)
args
.
output_len
)
if
args
.
backend
==
"vllm"
:
if
args
.
backend
==
"vllm"
:
elapsed_time
=
run_vllm
(
requests
,
args
.
model
,
args
.
tokenizer
,
elapsed_time
=
run_vllm
(
args
.
quantization
,
args
.
tensor_parallel_size
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
device
,
args
.
kv_cache_dtype
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
gpu_memory_utilization
)
args
.
enable_prefix_caching
)
elif
args
.
backend
==
"hf"
:
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
...
@@ -287,6 +288,12 @@ if __name__ == "__main__":
...
@@ -287,6 +288,12 @@ if __name__ == "__main__":
'The "auto" option will use FP16 precision '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.'
)
'for BF16 models.'
)
parser
.
add_argument
(
'--gpu-memory-utilization'
,
type
=
float
,
default
=
0.9
,
help
=
'the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1. '
'If unspecified, will use the default value of 0.9.'
)
parser
.
add_argument
(
"--enforce-eager"
,
parser
.
add_argument
(
"--enforce-eager"
,
action
=
"store_true"
,
action
=
"store_true"
,
help
=
"enforce eager execution"
)
help
=
"enforce eager execution"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment