Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
23ec72fa
Unverified
Commit
23ec72fa
authored
Jun 13, 2024
by
Michael Goin
Committed by
GitHub
Jun 13, 2024
Browse files
[CI/Build][REDO] Add is_quant_method_supported to control quantization test configurations (#5466)
parent
c2637a61
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
32 additions
and
71 deletions
+32
-71
tests/models/test_aqlm.py
tests/models/test_aqlm.py
+2
-11
tests/models/test_fp8.py
tests/models/test_fp8.py
+2
-10
tests/models/test_gptq_marlin.py
tests/models/test_gptq_marlin.py
+2
-11
tests/models/test_gptq_marlin_24.py
tests/models/test_gptq_marlin_24.py
+2
-11
tests/models/test_marlin.py
tests/models/test_marlin.py
+2
-11
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+3
-7
tests/quantization/test_fp8.py
tests/quantization/test_fp8.py
+5
-10
tests/quantization/utils.py
tests/quantization/utils.py
+14
-0
No files found.
tests/models/test_aqlm.py
View file @
23ec72fa
...
...
@@ -4,17 +4,8 @@ Run `pytest tests/models/test_aqlm.py`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
aqlm_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
aqlm_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"aqlm"
].
get_min_capability
())
from
tests.quantization.utils
import
is_quant_method_supported
# In this test we hardcode prompts and generations for the model so we don't
# need to require the AQLM package as a dependency
...
...
@@ -67,7 +58,7 @@ ground_truth_generations = [
]
@
pytest
.
mark
.
skipif
(
aqlm_not
_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method
_supported
(
"aqlm"
)
,
reason
=
"AQLM is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_fp8.py
View file @
23ec72fa
...
...
@@ -8,8 +8,8 @@ import pytest
import
torch
from
transformers
import
AutoTokenizer
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
LLM
,
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
os
.
environ
[
"TOKENIZERS_PARALLELISM"
]
=
"true"
...
...
@@ -67,16 +67,8 @@ EXPECTED_STRS_MAP = {
},
}
fp8_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
fp8_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
())
@
pytest
.
mark
.
skipif
(
fp8_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
...
...
tests/models/test_gptq_marlin.py
View file @
23ec72fa
...
...
@@ -11,9 +11,8 @@ Run `pytest tests/models/test_gptq_marlin.py`.
import
os
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.rotary_embedding
import
_ROPE_DICT
from
.utils
import
check_logprobs_close
...
...
@@ -22,14 +21,6 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN
=
1024
gptq_marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
gptq_marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"gptq_marlin"
].
get_min_capability
())
MODELS
=
[
# act_order==False, group_size=channelwise
(
"robertgshaw2/zephyr-7b-beta-channelwise-gptq"
,
"main"
),
...
...
@@ -53,7 +44,7 @@ MODELS = [
@
pytest
.
mark
.
flaky
(
reruns
=
3
)
@
pytest
.
mark
.
skipif
(
gptq_marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin"
)
,
reason
=
"gptq_marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
,
"bfloat16"
])
...
...
tests/models/test_gptq_marlin_24.py
View file @
23ec72fa
...
...
@@ -9,18 +9,9 @@ Run `pytest tests/models/test_marlin_24.py`.
from
dataclasses
import
dataclass
import
pytest
import
torch
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
from
tests.quantization.utils
import
is_quant_method_supported
@
dataclass
...
...
@@ -47,7 +38,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"gptq_marlin_24"
)
,
reason
=
"Marlin24 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/models/test_marlin.py
View file @
23ec72fa
...
...
@@ -13,20 +13,11 @@ Run `pytest tests/models/test_marlin.py`.
from
dataclasses
import
dataclass
import
pytest
import
torch
from
vllm.model_executor.layer
s.quantization
import
QUANTIZATION_METHODS
from
test
s.quantization
.utils
import
is_quant_method_supported
from
.utils
import
check_logprobs_close
marlin_not_supported
=
True
if
torch
.
cuda
.
is_available
():
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
marlin_not_supported
=
(
capability
<
QUANTIZATION_METHODS
[
"marlin"
].
get_min_capability
())
@
dataclass
class
ModelPair
:
...
...
@@ -45,7 +36,7 @@ model_pairs = [
@
pytest
.
mark
.
flaky
(
reruns
=
2
)
@
pytest
.
mark
.
skipif
(
marlin_not_supported
,
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"marlin"
)
,
reason
=
"Marlin is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_pair"
,
model_pairs
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
...
...
tests/quantization/test_bitsandbytes.py
View file @
23ec72fa
...
...
@@ -5,15 +5,11 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
capability
<
QUANTIZATION_METHODS
[
'bitsandbytes'
].
get_min_capability
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
def
test_load_bnb_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
'huggyllama/llama-7b'
,
...
...
tests/quantization/test_fp8.py
View file @
23ec72fa
...
...
@@ -5,16 +5,12 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm._custom_ops
import
scaled_fp8_quant
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.model_executor.layers.quantization.fp8
import
Fp8LinearMethod
capability
=
torch
.
cuda
.
get_device_capability
()
capability
=
capability
[
0
]
*
10
+
capability
[
1
]
@
pytest
.
mark
.
skipif
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
def
test_load_fp16_model
(
vllm_runner
)
->
None
:
with
vllm_runner
(
"facebook/opt-125m"
,
quantization
=
"fp8"
)
as
llm
:
...
...
@@ -25,8 +21,7 @@ def test_load_fp16_model(vllm_runner) -> None:
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fn
@
pytest
.
mark
.
skipif
(
capability
<
QUANTIZATION_METHODS
[
"fp8"
].
get_min_capability
(),
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"FP8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float16
,
torch
.
bfloat16
])
def
test_scaled_fp8_quant
(
dtype
)
->
None
:
...
...
tests/quantization/utils.py
0 → 100644
View file @
23ec72fa
import
torch
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
def is_quant_method_supported(quant_method: str) -> bool:
    """Return whether the current device can run ``quant_method``.

    Intended for ``pytest.mark.skipif(not is_quant_method_supported(...))``
    so that quantization tests only run on hardware that can execute them.

    Args:
        quant_method: Key used to index ``QUANTIZATION_METHODS``
            (for example ``"fp8"`` or ``"gptq_marlin"``).

    Returns:
        ``False`` when ``torch.cuda.is_available()`` is false. Otherwise
        ``True`` if and only if the device capability, encoded as
        ``major * 10 + minor``, is at least the value returned by the
        method's ``get_min_capability()``.

    Note:
        When CUDA is available, an unknown ``quant_method`` propagates
        whatever lookup error ``QUANTIZATION_METHODS[quant_method]`` raises.
    """
    # Currently, all quantization methods require Nvidia or AMD GPUs
    if not torch.cuda.is_available():
        return False

    major, minor = torch.cuda.get_device_capability()
    # Fold (major, minor) into one integer, e.g. (8, 0) -> 80.
    # NOTE(review): assumes get_min_capability() uses the same encoding,
    # as the inline checks this helper replaces did -- confirm.
    capability = major * 10 + minor

    # "Supported" means the device meets or exceeds the minimum. Using `<`
    # here would invert every skipif guard built on this helper.
    min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability()
    return capability >= min_capability
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment