Unverified commit 60995c05, authored by Zhenzhong Xu, committed by GitHub
Browse files

[Quantization][Autoround][CPU] Add W4A16 Support (#38192)


Signed-off-by: Zhenzhong1 <zhenzhong.xu@intel.com>
Signed-off-by: Zhenzhong Xu <zhenzhong.xu@intel.com>
parent 29e5d102
......@@ -12,6 +12,7 @@ MODELS = [
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", # with g_idx
"Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4", # without g_idx
"RedHatAI/Qwen3-1.7B-quantized.w4a16", # with zp
"OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",
]
DTYPE = ["bfloat16"]
......
......@@ -414,6 +414,7 @@ class INCConfig(QuantizationConfig):
def apply_xpu_w4a16_quant_layer(self, layer, prefix: str):
weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
if not self.check_quantized(weight_bits):
if isinstance(layer, (LinearBase, ParallelLMHead)):
return UnquantizedLinearMethod()
......@@ -437,6 +438,27 @@ class INCConfig(QuantizationConfig):
)
return None
def apply_cpu_w4a16_quant_layer(self, layer, prefix: str):
    """Pick the quantization method for ``layer`` on the CPU W4A16 path.

    The per-layer settings come from ``self.get_layer_config(layer, prefix)``.

    Returns:
        * ``UnquantizedLinearMethod()`` when ``self.check_quantized`` rejects
          the layer's bit width and the layer is a ``LinearBase`` or
          ``ParallelLMHead``.
        * The result of ``self.apply_gptq_quant_layer(layer, prefix)`` when
          the layer is quantized, 4-bit, symmetric, and is a ``LinearBase``
          or ``ParallelLMHead``.
        * ``None`` for any other layer type that does not trigger an error.

    Raises:
        NotImplementedError: if the layer is quantized with a bit width
            other than 4, or with asymmetric quantization.
    """
    # The group size is part of the layer config but is not consulted here.
    bits, _group_size, symmetric = self.get_layer_config(layer, prefix)

    if not self.check_quantized(bits):
        # Layer is not quantized: linear-like layers get the plain method,
        # everything else gets no quant method at all.
        return (
            UnquantizedLinearMethod()
            if isinstance(layer, (LinearBase, ParallelLMHead))
            else None
        )

    # The validation below runs before the layer-type check, so an
    # unsupported configuration raises regardless of the layer type.
    if bits != 4:
        raise NotImplementedError(
            f"INC on CPU only supports 4-bit quantization, "
            f"got weight_bits={bits}."
        )

    if not symmetric:
        raise NotImplementedError(
            "INC W4A16 on CPU only supports symmetric quantization for now."
        )

    return (
        self.apply_gptq_quant_layer(layer, prefix)
        if isinstance(layer, (LinearBase, ParallelLMHead))
        else None
    )
def get_quant_method(self, layer: torch.nn.Module, prefix: str):
if prefix and self.extra_config:
for layer_name in self.extra_config:
......@@ -446,11 +468,21 @@ class INCConfig(QuantizationConfig):
return UnquantizedLinearMethod()
if current_platform.is_xpu():
return self.apply_xpu_w4a16_quant_layer(layer, prefix)
if "gptq" in self.packing_format or "gptq" in self.backend:
is_gptq = "gptq" in self.packing_format or "gptq" in self.backend
if current_platform.is_cpu() and is_gptq:
return self.apply_cpu_w4a16_quant_layer(layer, prefix)
if is_gptq:
return self.apply_gptq_quant_layer(layer, prefix)
if "awq" in self.packing_format or "awq" in self.backend:
return self.apply_awq_quant_layer(layer, prefix)
raise NotImplementedError(
f"Unsupported quantization configuration for layer '{prefix}'. "
f"Platform: CPU={current_platform.is_cpu()}. "
f"Platform: XPU={current_platform.is_xpu()}. "
f"Format: {self.packing_format}, Backend: {self.backend}."
)
@classmethod
def override_quantization_method(
cls, hf_quant_cfg, user_quant, hf_config=None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment