Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5ad04465
Unverified
Commit
5ad04465
authored
Mar 21, 2026
by
Robert Shaw
Committed by
GitHub
Mar 21, 2026
Browse files
Revert "Consolidate AWQ quantization into single awq_marlin.py file" (#37768)
parent
8cc700dd
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
279 additions
and
253 deletions
+279
-253
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+2
-1
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+275
-5
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_marlin.py
+2
-247
No files found.
vllm/model_executor/layers/quantization/__init__.py
View file @
5ad04465
...
@@ -107,7 +107,8 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
...
@@ -107,7 +107,8 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
# lazy import to avoid triggering `torch.compile` too early
# lazy import to avoid triggering `torch.compile` too early
from
vllm.model_executor.layers.quantization.quark.quark
import
QuarkConfig
from
vllm.model_executor.layers.quantization.quark.quark
import
QuarkConfig
from
.awq_marlin
import
AWQConfig
,
AWQMarlinConfig
from
.awq
import
AWQConfig
from
.awq_marlin
import
AWQMarlinConfig
from
.bitsandbytes
import
BitsAndBytesConfig
from
.bitsandbytes
import
BitsAndBytesConfig
from
.compressed_tensors.compressed_tensors
import
(
from
.compressed_tensors.compressed_tensors
import
(
CompressedTensorsConfig
,
CompressedTensorsConfig
,
...
...
vllm/model_executor/layers/quantization/awq.py
View file @
5ad04465
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Backward compatibility: AWQConfig and AWQLinearMethod have been consolidated
# into awq_marlin.py
from
typing
import
TYPE_CHECKING
,
Any
,
Union
from
vllm.model_executor.layers.quantization.awq_marlin
import
(
# noqa: F401
AWQConfig
,
import
torch
AWQLinearMethod
,
from
safetensors.torch
import
_TYPES
as
_SAFETENSORS_TO_TORCH_DTYPE
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
UnquantizedLinearMethod
,
)
)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizeMethodBase
,
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
is_layer_skipped
from
vllm.model_executor.parameter
import
GroupQuantScaleParameter
,
PackedvLLMParameter
from
vllm.transformers_utils.config
import
get_safetensors_params_metadata
if
TYPE_CHECKING
:
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.models.utils
import
WeightsMapper
logger
=
init_logger
(
__name__
)
class AWQConfig(QuantizationConfig):
    """Config class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978

    Attributes set by ``__init__``:
        weight_bits: Bit width of the quantized weights (must be 4).
        group_size: Quantization group size as given by the checkpoint
            config.
        zero_point: The ``zero_point`` flag from the checkpoint config.
        modules_to_not_convert: Module names excluded from quantization;
            an empty list when none were supplied.
        pack_factor: ``32 // weight_bits``.
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
        modules_to_not_convert: list[str] | None = None,
    ) -> None:
        """Store the AWQ settings and reject unsupported bit widths.

        Raises:
            ValueError: If ``weight_bits`` is not 4.
        """
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point
        # A missing (or empty) list is normalized to a fresh empty list.
        self.modules_to_not_convert = modules_to_not_convert or []

        if weight_bits != 4:
            raise ValueError(
                "Currently, only 4-bit weight quantization is supported for "
                f"AWQ, but got {weight_bits} bits."
            )
        self.pack_factor = 32 // weight_bits

    def __repr__(self) -> str:
        """Return ``AWQConfig(<field>=<value>, ...)`` for the main fields."""
        shown = (
            "weight_bits",
            "group_size",
            "zero_point",
            "modules_to_not_convert",
        )
        rendered = ", ".join(f"{name}={getattr(self, name)}" for name in shown)
        return f"AWQConfig({rendered})"

    def get_name(self) -> "QuantizationMethods":
        """Return the name of this quantization method."""
        return "awq"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """Return the supported activation dtypes (only ``torch.half``)."""
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum GPU capability value required (75)."""
        # The AWQ kernel only supports Turing or newer GPUs.
        return 75

    @staticmethod
    def get_config_filenames() -> list[str]:
        """Return the quantization config file names associated with AWQ."""
        return [
            # E.g., casperhansen/vicuna-7b-v1.5-awq
            "quant_config.json",
            # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
            "quantize_config.json",
        ]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AWQConfig":
        """Build a config instance from a checkpoint quantization dict.

        Each value is read through ``get_from_keys`` with the candidate key
        names listed below; ``modules_to_not_convert`` goes through
        ``get_from_keys_or`` with a default of ``None``.
        """
        return cls(
            cls.get_from_keys(config, ["w_bit", "bits"]),
            cls.get_from_keys(config, ["q_group_size", "group_size"]),
            cls.get_from_keys(config, ["zero_point"]),
            cls.get_from_keys_or(config, ["modules_to_not_convert"], None),
        )

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Union["LinearMethodBase", "QuantizeMethodBase"] | None:
        """Pick the quantization method object for ``layer``.

        * ``LinearBase`` layers get ``UnquantizedLinearMethod`` when
          ``is_layer_skipped`` reports the prefix as skipped, otherwise
          ``AWQLinearMethod``.
        * ``FusedMoE`` layers are delegated to ``AWQMarlinConfig``, or to
          ``MoeWNA16Config`` when ``check_moe_marlin_supports_layer``
          rejects the layer.
        * Any other layer type yields ``None``.
        """
        if isinstance(layer, LinearBase):
            skipped = is_layer_skipped(
                prefix,
                self.modules_to_not_convert,
                self.packed_modules_mapping,
                skip_with_substr=True,
            )
            return UnquantizedLinearMethod() if skipped else AWQLinearMethod(self)

        if not isinstance(layer, FusedMoE):
            return None

        # Lazy import to avoid circular import.
        from .awq_marlin import AWQMarlinConfig
        from .moe_wna16 import MoeWNA16Config
        from .utils.marlin_utils import check_moe_marlin_supports_layer

        # Both MoE delegates are constructed from the same config dict.
        delegate_config = {
            "quant_method": "awq",
            "bits": self.weight_bits,
            "group_size": self.group_size,
            "zero_point": self.zero_point,
            "lm_head": False,
            "modules_to_not_convert": self.modules_to_not_convert,
        }

        if check_moe_marlin_supports_layer(layer, self.group_size):
            delegate = AWQMarlinConfig.from_config(delegate_config)
            return delegate.get_quant_method(layer, prefix)

        logger.warning_once(
            f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
            "Falling back to Moe WNA16 kernels."
        )
        fallback = MoeWNA16Config.from_config(delegate_config)
        return fallback.get_quant_method(layer, prefix)

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Rewrite ``modules_to_not_convert`` through the mapper's
        ``apply_list``; a no-op when the list is empty."""
        if not self.modules_to_not_convert:
            return
        self.modules_to_not_convert = hf_to_vllm_mapper.apply_list(
            self.modules_to_not_convert
        )

    def maybe_update_config(self, model_name: str, revision: str | None = None):
        """Derive ``modules_to_not_convert`` from safetensors metadata.

        Does nothing when the list is already populated. Otherwise a layer
        (the parameter name with its last dotted component removed) counts
        as quantized if any of its parameters has a dtype other than
        float16, bfloat16 or float32; every remaining layer is recorded in
        ``modules_to_not_convert``.
        """
        if self.modules_to_not_convert:
            return

        unquant_dtypes = (torch.float16, torch.bfloat16, torch.float32)
        metadata = get_safetensors_params_metadata(model_name, revision=revision)

        all_layers: set[str] = set()
        quant_layers: set[str] = set()
        for param_name, info in metadata.items():
            layer_name = param_name.rsplit(".", 1)[0]
            all_layers.add(layer_name)
            dtype = info.get("dtype", None)
            # Parameters without a dtype entry never mark a layer quantized.
            if dtype and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes:
                quant_layers.add(layer_name)

        self.modules_to_not_convert = list(all_layers - quant_layers)
class AWQLinearMethod(LinearMethodBase):
    """Linear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    """

    def __init__(self, quant_config: AWQConfig):
        """Keep a reference to the AWQ config used by the other methods."""
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate and register ``qweight``, ``qzeros`` and ``scales``.

        Args:
            layer: Module that receives the three parameters.
            input_size_per_partition: Input dimension of this partition.
            output_partition_sizes: Output sizes whose sum is this
                partition's output dimension.
            input_size: Full input dimension; used as the group size when
                the configured group size is -1.
            output_size: Not used by this method.
            params_dtype: dtype of the ``scales`` tensor.
            **extra_weight_attrs: Only ``weight_loader`` is read.

        Raises:
            ValueError: If the partition's input size is not a multiple of
                the group size, or its output size is not a multiple of
                the pack factor.
        """
        pack_factor = self.quant_config.pack_factor
        configured_group = self.quant_config.group_size
        # Normalize group_size
        group_size = input_size if configured_group == -1 else configured_group

        if input_size_per_partition % group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % pack_factor != 0:
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        weight_loader = extra_weight_attrs.get("weight_loader")
        packed_out = output_size_per_partition // pack_factor
        num_groups = input_size_per_partition // group_size

        # Layout arguments shared by the two int32 packed tensors.
        packed_kwargs = dict(
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=pack_factor,
            weight_loader=weight_loader,
        )
        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition, packed_out, dtype=torch.int32
            ),
            **packed_kwargs,
        )
        qzeros = PackedvLLMParameter(
            data=torch.empty(num_groups, packed_out, dtype=torch.int32),
            **packed_kwargs,
        )
        scales = GroupQuantScaleParameter(
            data=torch.empty(
                num_groups, output_size_per_partition, dtype=params_dtype
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )

        for name, param in (
            ("qweight", qweight),
            ("qzeros", qzeros),
            ("scales", scales),
        ):
            layer.register_parameter(name, param)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Re-wrap the loaded tensors as ``torch.nn.Parameter`` objects
        with ``requires_grad=False``."""
        for name in ("qweight", "qzeros", "scales"):
            loaded = getattr(layer, name)
            setattr(
                layer, name, torch.nn.Parameter(loaded.data, requires_grad=False)
            )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Apply the AWQ-quantized linear layer to ``x``.

        ``x`` is flattened to 2-D for the kernels. With 256 or more rows
        the weights are dequantized and multiplied via ``torch.matmul``;
        otherwise ``ops.awq_gemm`` is called. ``bias`` (if given) is added
        in place, and the result is reshaped to the leading dimensions of
        ``x`` plus ``qweight.shape[-1] * pack_factor``.
        """
        pack_factor = self.quant_config.pack_factor
        qweight, scales, qzeros = layer.qweight, layer.scales, layer.qzeros

        leading_dims = x.shape[:-1]
        result_shape = leading_dims + (qweight.shape[-1] * pack_factor,)
        flat_x = x.reshape(-1, x.shape[-1])

        # num_tokens >= threshold
        use_fp16_matmul = leading_dims.numel() >= 256

        if use_fp16_matmul:
            dequantized = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
            out = torch.matmul(flat_x, dequantized)
        else:
            out = ops.awq_gemm(flat_x, qweight, scales, qzeros, pack_factor)

        if bias is not None:
            out.add_(bias)
        return out.reshape(result_shape)
vllm/model_executor/layers/quantization/awq_marlin.py
View file @
5ad04465
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
TYPE_CHECKING
,
Any
,
Union
from
typing
import
TYPE_CHECKING
,
Any
import
torch
import
torch
from
safetensors.torch
import
_TYPES
as
_SAFETENSORS_TO_TORCH_DTYPE
from
safetensors.torch
import
_TYPES
as
_SAFETENSORS_TO_TORCH_DTYPE
...
@@ -27,6 +27,7 @@ from vllm.model_executor.layers.linear import (
...
@@ -27,6 +27,7 @@ from vllm.model_executor.layers.linear import (
UnquantizedLinearMethod
,
UnquantizedLinearMethod
,
set_weight_attrs
,
set_weight_attrs
,
)
)
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
,
QuantizationConfig
,
QuantizeMethodBase
,
QuantizeMethodBase
,
...
@@ -63,252 +64,6 @@ if TYPE_CHECKING:
...
@@ -63,252 +64,6 @@ if TYPE_CHECKING:
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
class AWQConfig(QuantizationConfig):
    """Config class for AWQ.

    Reference: https://arxiv.org/abs/2306.00978

    Attributes set by ``__init__``:
        weight_bits: Bit width of the quantized weights (must be 4).
        group_size: Quantization group size as given by the checkpoint
            config.
        zero_point: The ``zero_point`` flag from the checkpoint config.
        modules_to_not_convert: Module names excluded from quantization;
            an empty list when none were supplied.
        pack_factor: ``32 // weight_bits``.
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        zero_point: bool,
        modules_to_not_convert: list[str] | None = None,
    ) -> None:
        """Record the AWQ settings; only 4-bit weights are accepted.

        Raises:
            ValueError: If ``weight_bits`` is not 4.
        """
        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point
        # ``None`` and an empty list both become a new empty list.
        self.modules_to_not_convert = modules_to_not_convert or []

        if weight_bits != 4:
            raise ValueError(
                "Currently, only 4-bit weight quantization is supported for "
                f"AWQ, but got {weight_bits} bits."
            )
        self.pack_factor = 32 // weight_bits

    def __repr__(self) -> str:
        """Return ``AWQConfig(...)`` listing the four constructor fields."""
        return (
            "AWQConfig(weight_bits={}, group_size={}, zero_point={}, "
            "modules_to_not_convert={})"
        ).format(
            self.weight_bits,
            self.group_size,
            self.zero_point,
            self.modules_to_not_convert,
        )

    def get_name(self) -> "QuantizationMethods":
        """Return the name of this quantization method."""
        return "awq"

    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """Return the supported activation dtypes (only ``torch.half``)."""
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum GPU capability value required (75)."""
        # The AWQ kernel only supports Turing or newer GPUs.
        return 75

    @staticmethod
    def get_config_filenames() -> list[str]:
        """Return the quantization config file names associated with AWQ."""
        return [
            # E.g., casperhansen/vicuna-7b-v1.5-awq
            "quant_config.json",
            # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
            "quantize_config.json",
        ]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "AWQConfig":
        """Build a config instance from a checkpoint quantization dict.

        Values are read through ``get_from_keys`` using the candidate key
        names below; ``modules_to_not_convert`` uses ``get_from_keys_or``
        with a default of ``None``.
        """
        return cls(
            cls.get_from_keys(config, ["w_bit", "bits"]),
            cls.get_from_keys(config, ["q_group_size", "group_size"]),
            cls.get_from_keys(config, ["zero_point"]),
            cls.get_from_keys_or(config, ["modules_to_not_convert"], None),
        )

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Union["LinearMethodBase", "QuantizeMethodBase"] | None:
        """Pick the quantization method object for ``layer``.

        * ``LinearBase`` layers get ``UnquantizedLinearMethod`` when
          ``is_layer_skipped`` reports the prefix as skipped, otherwise
          ``AWQLinearMethod``.
        * ``FusedMoE`` layers are delegated to ``AWQMarlinConfig``, or to
          ``MoeWNA16Config`` when ``check_moe_marlin_supports_layer``
          rejects the layer.
        * Any other layer type yields ``None``.
        """
        if isinstance(layer, LinearBase):
            skipped = is_layer_skipped(
                prefix,
                self.modules_to_not_convert,
                self.packed_modules_mapping,
                skip_with_substr=True,
            )
            return UnquantizedLinearMethod() if skipped else AWQLinearMethod(self)

        if not isinstance(layer, FusedMoE):
            return None

        from .moe_wna16 import MoeWNA16Config

        # Both MoE delegates are constructed from the same config dict.
        delegate_config = {
            "quant_method": "awq",
            "bits": self.weight_bits,
            "group_size": self.group_size,
            "zero_point": self.zero_point,
            "lm_head": False,
            "modules_to_not_convert": self.modules_to_not_convert,
        }

        if check_moe_marlin_supports_layer(layer, self.group_size):
            delegate = AWQMarlinConfig.from_config(delegate_config)
            return delegate.get_quant_method(layer, prefix)

        logger.warning_once(
            f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
            "Falling back to Moe WNA16 kernels."
        )
        fallback = MoeWNA16Config.from_config(delegate_config)
        return fallback.get_quant_method(layer, prefix)

    def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"):
        """Rewrite ``modules_to_not_convert`` through the mapper's
        ``apply_list``; a no-op when the list is empty."""
        if not self.modules_to_not_convert:
            return
        self.modules_to_not_convert = hf_to_vllm_mapper.apply_list(
            self.modules_to_not_convert
        )

    def maybe_update_config(self, model_name: str, revision: str | None = None):
        """Derive ``modules_to_not_convert`` from safetensors metadata.

        Does nothing when the list is already populated. Otherwise a layer
        (the parameter name with its last dotted component removed) counts
        as quantized if any of its parameters has a dtype other than
        float16, bfloat16 or float32; every remaining layer is recorded in
        ``modules_to_not_convert``.
        """
        if self.modules_to_not_convert:
            return

        unquant_dtypes = (torch.float16, torch.bfloat16, torch.float32)
        metadata = get_safetensors_params_metadata(model_name, revision=revision)

        all_layers: set[str] = set()
        quant_layers: set[str] = set()
        for param_name, info in metadata.items():
            layer_name = param_name.rsplit(".", 1)[0]
            all_layers.add(layer_name)
            dtype = info.get("dtype", None)
            # Parameters without a dtype entry never mark a layer quantized.
            if dtype and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes:
                quant_layers.add(layer_name)

        self.modules_to_not_convert = list(all_layers - quant_layers)
class AWQLinearMethod(LinearMethodBase):
    """Linear method for AWQ.

    Args:
        quant_config: The AWQ quantization config.
    """

    def __init__(self, quant_config: AWQConfig):
        """Keep a reference to the AWQ config used by the other methods."""
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate and register ``qweight``, ``qzeros`` and ``scales``.

        Args:
            layer: Module that receives the three parameters.
            input_size_per_partition: Input dimension of this partition.
            output_partition_sizes: Output sizes whose sum is this
                partition's output dimension.
            input_size: Full input dimension; used as the group size when
                the configured group size is -1.
            output_size: Not used by this method.
            params_dtype: dtype of the ``scales`` tensor.
            **extra_weight_attrs: Only ``weight_loader`` is read.

        Raises:
            ValueError: If the partition's input size is not a multiple of
                the group size, or its output size is not a multiple of
                the pack factor.
        """
        pack_factor = self.quant_config.pack_factor
        configured_group = self.quant_config.group_size
        # Normalize group_size
        group_size = input_size if configured_group == -1 else configured_group

        if input_size_per_partition % group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % pack_factor != 0:
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size."
            )

        weight_loader = extra_weight_attrs.get("weight_loader")
        packed_out = output_size_per_partition // pack_factor
        num_groups = input_size_per_partition // group_size

        # Layout arguments shared by the two int32 packed tensors.
        packed_kwargs = dict(
            input_dim=0,
            output_dim=1,
            packed_dim=1,
            packed_factor=pack_factor,
            weight_loader=weight_loader,
        )
        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition, packed_out, dtype=torch.int32
            ),
            **packed_kwargs,
        )
        qzeros = PackedvLLMParameter(
            data=torch.empty(num_groups, packed_out, dtype=torch.int32),
            **packed_kwargs,
        )
        scales = GroupQuantScaleParameter(
            data=torch.empty(
                num_groups, output_size_per_partition, dtype=params_dtype
            ),
            input_dim=0,
            output_dim=1,
            weight_loader=weight_loader,
        )

        for name, param in (
            ("qweight", qweight),
            ("qzeros", qzeros),
            ("scales", scales),
        ):
            layer.register_parameter(name, param)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Re-wrap the loaded tensors as ``torch.nn.Parameter`` objects
        with ``requires_grad=False``."""
        for name in ("qweight", "qzeros", "scales"):
            loaded = getattr(layer, name)
            setattr(
                layer, name, torch.nn.Parameter(loaded.data, requires_grad=False)
            )

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Apply the AWQ-quantized linear layer to ``x``.

        ``x`` is flattened to 2-D for the kernels. With 256 or more rows
        the weights are dequantized and multiplied via ``torch.matmul``;
        otherwise ``ops.awq_gemm`` is called. ``bias`` (if given) is added
        in place, and the result is reshaped to the leading dimensions of
        ``x`` plus ``qweight.shape[-1] * pack_factor``.
        """
        pack_factor = self.quant_config.pack_factor
        qweight, scales, qzeros = layer.qweight, layer.scales, layer.qzeros

        leading_dims = x.shape[:-1]
        result_shape = leading_dims + (qweight.shape[-1] * pack_factor,)
        flat_x = x.reshape(-1, x.shape[-1])

        # num_tokens >= threshold
        use_fp16_matmul = leading_dims.numel() >= 256

        if use_fp16_matmul:
            dequantized = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
            out = torch.matmul(flat_x, dequantized)
        else:
            out = ops.awq_gemm(flat_x, qweight, scales, qzeros, pack_factor)

        if bias is not None:
            out.add_(bias)
        return out.reshape(result_shape)
class
AWQMarlinConfig
(
QuantizationConfig
):
class
AWQMarlinConfig
(
QuantizationConfig
):
"""Config class for AWQ Marlin"""
"""Config class for AWQ Marlin"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment