Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dd248f76
Unverified
Commit
dd248f76
authored
Jun 25, 2024
by
Dipika Sikka
Committed by
GitHub
Jun 25, 2024
Browse files
[Misc] Update `w4a16` `compressed-tensors` support to include `w8a16` (#5794)
parent
d9b34bae
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
36 additions
and
26 deletions
+36
-26
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+12
-11
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+17
-11
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
...ayers/quantization/compressed_tensors/schemes/__init__.py
+3
-2
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
...compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+1
-0
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
...on/compressed_tensors/schemes/compressed_tensors_wNa16.py
+3
-2
No files found.
tests/quantization/test_compressed_tensors.py
View file @
dd248f76
...
...
@@ -8,9 +8,9 @@ import torch
from
vllm
import
SamplingParams
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16
,
CompressedTensorsW
4A16Sparse24
,
CompressedTensorsW8A8
DynamicToken
,
CompressedTensorsW
8A8StaticTensor
)
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16
Sparse24
,
CompressedTensorsW
8A8DynamicToken
,
CompressedTensorsW8A8
StaticTensor
,
CompressedTensorsW
NA16
)
@
pytest
.
mark
.
parametrize
(
"model_args"
,
[
...
...
@@ -74,26 +74,27 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
assert
qkv_proj
.
weight
.
dtype
is
torch
.
int8
@
pytest
.
mark
.
parametrize
(
"w4a16_args"
,
[
(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
"channel"
,
None
),
(
"nm-testing/tinyllama-oneshot-w4a16-group128-v2"
,
"group"
,
128
),
])
def
test_compressed_tensors_w4a16
(
vllm_runner
,
w4a16_args
):
model
,
strategy
,
group
=
w4a16_args
@
pytest
.
mark
.
parametrize
(
"wNa16_args"
,
[(
"nm-testing/tinyllama-oneshot-w4a16-channel-v2"
,
"channel"
,
None
,
8
),
(
"nm-testing/tinyllama-oneshot-w4a16-group128-v2"
,
"group"
,
128
,
8
),
(
"nm-testing/tinyllama-oneshot-w8a16-per-channel"
,
"channel"
,
None
,
4
)])
def
test_compressed_tensors_w4a16
(
vllm_runner
,
wNa16_args
):
model
,
strategy
,
group
,
pack_factor
=
wNa16_args
with
vllm_runner
(
model
)
as
llm
:
model
=
llm
.
model
.
llm_engine
.
model_executor
.
driver_worker
.
model_runner
.
model
# noqa: E501
layer
=
model
.
model
.
layers
[
0
]
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW
4
A16
)
assert
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW
N
A16
)
assert
qkv_proj
.
scheme
.
strategy
==
strategy
assert
qkv_proj
.
scheme
.
group_size
==
group
assert
qkv_proj
.
weight_packed
.
dtype
is
torch
.
int32
assert
qkv_proj
.
weight_scale
.
dtype
is
torch
.
float16
assert
qkv_proj
.
weight_packed
.
pack_factor
==
8
assert
qkv_proj
.
weight_packed
.
pack_factor
==
pack_factor
def
test_compressed_tensors_w4a16_marlin24
(
vllm_runner
):
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
dd248f76
...
...
@@ -7,9 +7,10 @@ from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from
vllm.model_executor.layers.quantization.base_config
import
(
# noqa: E501
QuantizationConfig
)
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
CompressedTensorsScheme
,
CompressedTensorsW4A16
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
W4A16SPARSE24_SUPPORTED_BITS
,
WNA16_SUPPORTED_BITS
,
CompressedTensorsScheme
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
,
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
CompressionFormat
,
QuantizationArgs
,
QuantizationStrategy
,
find_first_name_or_class_match
)
...
...
@@ -108,26 +109,31 @@ class CompressedTensorsConfig(QuantizationConfig):
return
is_8_bits
and
is_token
and
is_symmetric
and
is_dynamic
def
_is_w
4
a16
(
self
,
weight_quant
:
BaseModel
,
def
_is_w
N
a16
_group_channel
(
self
,
weight_quant
:
BaseModel
,
input_quant
:
BaseModel
)
->
bool
:
input_quant_none
=
input_quant
is
None
is_4_bits
=
weight_quant
.
num_bits
==
4
is_symmetric
=
weight_quant
.
symmetric
is_channel_group
=
(
weight_quant
.
strategy
==
QuantizationStrategy
.
CHANNEL
.
value
or
weight_quant
.
strategy
==
QuantizationStrategy
.
GROUP
.
value
)
is_static
=
not
weight_quant
.
dynamic
return
is_4_bits
and
input_quant_none
and
is_symmetric
and
is_static
return
(
is_channel_group
and
input_quant_none
and
is_symmetric
and
is_static
)
def
_get_schema
(
self
,
weight_quant
:
BaseModel
,
input_quant
:
BaseModel
)
->
"CompressedTensorsScheme"
:
if
self
.
_is_w4a16
(
weight_quant
,
input_quant
):
if
self
.
quant_format
==
CompressionFormat
.
marlin_24
.
value
:
if
self
.
_is_wNa16_group_channel
(
weight_quant
,
input_quant
):
if
(
self
.
quant_format
==
CompressionFormat
.
marlin_24
.
value
and
weight_quant
.
num_bits
in
W4A16SPARSE24_SUPPORTED_BITS
):
return
CompressedTensorsW4A16Sparse24
(
strategy
=
weight_quant
.
strategy
,
num_bits
=
weight_quant
.
num_bits
,
group_size
=
weight_quant
.
group_size
)
if
self
.
quant_format
==
CompressionFormat
.
pack_quantized
.
value
:
return
CompressedTensorsW4A16
(
if
(
self
.
quant_format
==
CompressionFormat
.
pack_quantized
.
value
and
weight_quant
.
num_bits
in
WNA16_SUPPORTED_BITS
):
return
CompressedTensorsWNA16
(
num_bits
=
weight_quant
.
num_bits
,
strategy
=
weight_quant
.
strategy
,
group_size
=
weight_quant
.
group_size
)
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
View file @
dd248f76
from
.compressed_tensors_scheme
import
CompressedTensorsScheme
# noqa: F401
from
.compressed_tensors_unquantized
import
(
# noqa: F401
CompressedTensorsUnquantized
)
from
.compressed_tensors_w4a16
import
CompressedTensorsW4A16
# noqa: F401
from
.compressed_tensors_w4a16_24
import
(
# noqa: F401
CompressedTensorsW4A16Sparse24
)
W4A16SPARSE24_SUPPORTED_BITS
,
CompressedTensorsW4A16Sparse24
)
from
.compressed_tensors_w8a8_dynamictoken
import
(
# noqa: F401, E501
CompressedTensorsW8A8DynamicToken
)
from
.compressed_tensors_w8a8_statictensor
import
(
# noqa: F401, E501
CompressedTensorsW8A8StaticTensor
)
from
.compressed_tensors_wNa16
import
WNA16_SUPPORTED_BITS
# noqa: F401
from
.compressed_tensors_wNa16
import
CompressedTensorsWNA16
# noqa: F401
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
View file @
dd248f76
...
...
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
from
vllm.model_executor.utils
import
set_weight_attrs
__all__
=
[
"CompressedTensorsW4A16Sparse24"
]
W4A16SPARSE24_SUPPORTED_BITS
=
[
4
]
class
CompressedTensorsW4A16Sparse24
(
CompressedTensorsScheme
):
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w
4
a16.py
→
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w
N
a16.py
View file @
dd248f76
...
...
@@ -11,10 +11,11 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
marlin_permute_scales
)
from
vllm.model_executor.utils
import
set_weight_attrs
__all__
=
[
"CompressedTensorsW4A16"
]
__all__
=
[
"CompressedTensorsWNA16"
]
WNA16_SUPPORTED_BITS
=
[
4
,
8
]
class
CompressedTensorsW
4
A16
(
CompressedTensorsScheme
):
class
CompressedTensorsW
N
A16
(
CompressedTensorsScheme
):
def
__init__
(
self
,
strategy
:
str
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment