Unverified commit 183a7096, authored by JartX and committed by GitHub
Browse files

[BUGFIX] GPTQ quantization compatibility for Qwen3 MOE models (AutoGPTQ and...


[BUGFIX] GPTQ quantization compatibility for Qwen3 MOE models (AutoGPTQ and AutoRound-GPTQ) (#23994)
Signed-off-by: JartX <sagformas@epdcenter.es>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 14b4326b
...@@ -37,6 +37,7 @@ class GPTQConfig(QuantizationConfig): ...@@ -37,6 +37,7 @@ class GPTQConfig(QuantizationConfig):
desc_act: bool, desc_act: bool,
lm_head_quantized: bool, lm_head_quantized: bool,
dynamic: dict[str, dict[str, Union[int, bool]]], dynamic: dict[str, dict[str, Union[int, bool]]],
autoround_version: str = "",
) -> None: ) -> None:
# GPTQModel use `dynamic` config property to allow per module # GPTQModel use `dynamic` config property to allow per module
# quantization config so each module can be individually optimized. # quantization config so each module can be individually optimized.
...@@ -74,6 +75,9 @@ class GPTQConfig(QuantizationConfig): ...@@ -74,6 +75,9 @@ class GPTQConfig(QuantizationConfig):
"Currently, only 2/3/4/8-bit weight quantization is " "Currently, only 2/3/4/8-bit weight quantization is "
f"supported for GPTQ, but got {self.weight_bits} bits.") f"supported for GPTQ, but got {self.weight_bits} bits.")
# used to identify GPTQ model quantized by autoround
self.autoround_version = autoround_version
def __repr__(self) -> str: def __repr__(self) -> str:
return (f"GPTQConfig(weight_bits={self.weight_bits}, " return (f"GPTQConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, " f"group_size={self.group_size}, "
...@@ -108,8 +112,10 @@ class GPTQConfig(QuantizationConfig): ...@@ -108,8 +112,10 @@ class GPTQConfig(QuantizationConfig):
desc_act = cls.get_from_keys(config, ["desc_act"]) desc_act = cls.get_from_keys(config, ["desc_act"])
lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
default=False) default=False)
autoround_version = cls.get_from_keys_or(config, ["autoround_version"],
default="")
return cls(weight_bits, group_size, desc_act, lm_head_quantized, return cls(weight_bits, group_size, desc_act, lm_head_quantized,
dynamic) dynamic, autoround_version)
def get_quant_method( def get_quant_method(
self, layer: torch.nn.Module, prefix: str self, layer: torch.nn.Module, prefix: str
......
...@@ -119,6 +119,9 @@ class GPTQMarlinConfig(QuantizationConfig): ...@@ -119,6 +119,9 @@ class GPTQMarlinConfig(QuantizationConfig):
self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
# used to identify GPTQ model quantized by autoround
self.autoround_version = full_config.get("autoround_version", "")
def __repr__(self) -> str: def __repr__(self) -> str:
return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " return (f"GPTQMarlinConfig(quant_type={self.quant_type}, "
f"group_size={self.group_size}, " f"group_size={self.group_size}, "
......
...@@ -159,9 +159,13 @@ class Qwen3MoeSparseMoeBlock(nn.Module): ...@@ -159,9 +159,13 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
# GPTQ configs do not have a list of ignored modules, however AutoGPTQ # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
# seems to avoid gate quantization. # seems to avoid gate quantization while AutoRound does.
# See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4,
if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): # and https://huggingface.co/jart25/Qwen3-Coder-30B-A3B-Instruct-Int4-gptq
if isinstance(
quant_config,
(GPTQConfig,
GPTQMarlinConfig)) and not quant_config.autoround_version:
return None return None
return quant_config return quant_config
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment