Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
00d3d196
Commit
00d3d196
authored
Oct 10, 2024
by
zhuwenwen
Browse files
update benchmarks
parent
2f5f98bb
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
62 additions
and
28 deletions
+62
-28
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+26
-12
requirements-rocm.txt
requirements-rocm.txt
+1
-1
vllm/benchmarks/benchmark_throughput.py
vllm/benchmarks/benchmark_throughput.py
+26
-12
vllm/config.py
vllm/config.py
+9
-3
No files found.
benchmarks/benchmark_throughput.py
View file @
00d3d196
...
...
@@ -375,18 +375,32 @@ def main(args: argparse.Namespace):
args
.
output_len
)
if
args
.
backend
==
"vllm"
:
run_args
=
[
warmup_requests
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
if
args
.
async_engine
:
run_args
=
[
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
else
:
run_args
=
[
warmup_requests
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
if
args
.
async_engine
:
run_args
.
append
(
args
.
disable_frontend_multiprocessing
)
...
...
requirements-rocm.txt
View file @
00d3d196
...
...
@@ -9,7 +9,7 @@ ray >= 2.10.0
peft
pytest-asyncio
tensorizer>=2.9.0
setuptools_scm
setuptools_scm
>=8
torch == 2.3.0
triton == 2.1.0
...
...
vllm/benchmarks/benchmark_throughput.py
View file @
00d3d196
...
...
@@ -375,18 +375,32 @@ def main(args: argparse.Namespace):
args
.
output_len
)
if
args
.
backend
==
"vllm"
:
run_args
=
[
warmup_requests
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
if
args
.
async_engine
:
run_args
=
[
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
else
:
run_args
=
[
warmup_requests
,
requests
,
args
.
model
,
args
.
tokenizer
,
args
.
quantization
,
args
.
tensor_parallel_size
,
args
.
seed
,
args
.
n
,
args
.
use_beam_search
,
args
.
trust_remote_code
,
args
.
dtype
,
args
.
max_model_len
,
args
.
enforce_eager
,
args
.
kv_cache_dtype
,
args
.
quantization_param_path
,
args
.
device
,
args
.
enable_prefix_caching
,
args
.
enable_chunked_prefill
,
args
.
max_num_batched_tokens
,
args
.
distributed_executor_backend
,
args
.
gpu_memory_utilization
,
args
.
num_scheduler_steps
,
args
.
use_v2_block_manager
,
args
.
download_dir
,
args
.
load_format
,
args
.
disable_async_output_proc
]
if
args
.
async_engine
:
run_args
.
append
(
args
.
disable_frontend_multiprocessing
)
...
...
vllm/config.py
View file @
00d3d196
...
...
@@ -289,9 +289,15 @@ class ModelConfig:
quantization_override
=
method
.
override_quantization_method
(
quant_cfg
,
self
.
quantization
)
if
quantization_override
:
quant_method
=
quantization_override
self
.
quantization
=
quantization_override
break
if
is_hip
():
if
quantization_override
in
rocm_supported_quantization
:
quant_method
=
quantization_override
self
.
quantization
=
quantization_override
break
else
:
quant_method
=
quantization_override
self
.
quantization
=
quantization_override
break
# Verify quantization configurations.
if
self
.
quantization
is
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment