Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a2469127
Unverified
Commit
a2469127
authored
Sep 13, 2024
by
youkaichao
Committed by
GitHub
Sep 13, 2024
Browse files
[misc][ci] fix quant test (#8449)
parent
06311e29
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
21 additions
and
15 deletions
+21
-15
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+20
-12
tests/quantization/utils.py
tests/quantization/utils.py
+1
-3
No files found.
tests/quantization/test_bitsandbytes.py
View file @
a2469127
...
@@ -10,6 +10,8 @@ import torch
...
@@ -10,6 +10,8 @@ import torch
from
tests.quantization.utils
import
is_quant_method_supported
from
tests.quantization.utils
import
is_quant_method_supported
from
..utils
import
fork_new_process_for_each_test
models_4bit_to_test
=
[
models_4bit_to_test
=
[
(
'huggyllama/llama-7b'
,
'quantize model inflight'
),
(
'huggyllama/llama-7b'
,
'quantize model inflight'
),
]
]
...
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
...
@@ -29,6 +31,7 @@ models_pre_quant_8bit_to_test = [
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_4bit_to_test
)
@
fork_new_process_for_each_test
def
test_load_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
def
test_load_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
model_name
,
description
)
->
None
:
...
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -41,6 +44,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_qaunt_4bit_to_test
)
models_pre_qaunt_4bit_to_test
)
@
fork_new_process_for_each_test
def
test_load_pre_quant_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
def
test_load_pre_quant_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
model_name
,
description
)
->
None
:
...
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
...
@@ -52,6 +56,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
reason
=
'bitsandbytes is not supported on this GPU type.'
)
reason
=
'bitsandbytes is not supported on this GPU type.'
)
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
@
pytest
.
mark
.
parametrize
(
"model_name, description"
,
models_pre_quant_8bit_to_test
)
models_pre_quant_8bit_to_test
)
@
fork_new_process_for_each_test
def
test_load_8bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
def
test_load_8bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
model_name
,
description
)
->
None
:
...
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
...
@@ -77,18 +82,8 @@ def validate_generated_texts(hf_runner,
model_name
,
model_name
,
hf_model_kwargs
=
None
):
hf_model_kwargs
=
None
):
if
hf_model_kwargs
is
None
:
# NOTE: run vLLM first, as it requires a clean process
hf_model_kwargs
=
{}
# when using distributed inference
# Run with HF runner
with
hf_runner
(
model_name
,
model_kwargs
=
hf_model_kwargs
)
as
llm
:
hf_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
hf_logs
=
log_generated_texts
(
prompts
,
hf_outputs
,
"HfRunner"
)
# Clean up the GPU memory for the next test
torch
.
cuda
.
synchronize
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
#Run with vLLM runner
#Run with vLLM runner
with
vllm_runner
(
model_name
,
with
vllm_runner
(
model_name
,
...
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
...
@@ -104,6 +99,19 @@ def validate_generated_texts(hf_runner,
gc
.
collect
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
torch
.
cuda
.
empty_cache
()
if
hf_model_kwargs
is
None
:
hf_model_kwargs
=
{}
# Run with HF runner
with
hf_runner
(
model_name
,
model_kwargs
=
hf_model_kwargs
)
as
llm
:
hf_outputs
=
llm
.
generate_greedy
(
prompts
,
8
)
hf_logs
=
log_generated_texts
(
prompts
,
hf_outputs
,
"HfRunner"
)
# Clean up the GPU memory for the next test
torch
.
cuda
.
synchronize
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
# Compare the generated strings
# Compare the generated strings
for
hf_log
,
vllm_log
in
zip
(
hf_logs
,
vllm_logs
):
for
hf_log
,
vllm_log
in
zip
(
hf_logs
,
vllm_logs
):
hf_str
=
hf_log
[
"generated_text"
]
hf_str
=
hf_log
[
"generated_text"
]
...
...
tests/quantization/utils.py
View file @
a2469127
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
def
is_quant_method_supported
(
quant_method
:
str
)
->
bool
:
def
is_quant_method_supported
(
quant_method
:
str
)
->
bool
:
# Currently, all quantization methods require Nvidia or AMD GPUs
# Currently, all quantization methods require Nvidia or AMD GPUs
if
not
torch
.
cuda
.
is_available
(
):
if
not
(
current_platform
.
is_cuda
()
or
current_platform
.
is_rocm
()
):
return
False
return
False
capability
=
current_platform
.
get_device_capability
()
capability
=
current_platform
.
get_device_capability
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment