[Quantization][Autoround][CPU] Add W4A16 Support (#38192)

Signed-off-by: Zhenzhong1 <zhenzhong.xu@intel.com>
Signed-off-by: Zhenzhong Xu <zhenzhong.xu@intel.com>

[Quantization][Autoround][CPU] Add W4A16 Support (#38192)
Signed-off-by: Zhenzhong1 <zhenzhong.xu@intel.com>
Signed-off-by: Zhenzhong Xu <zhenzhong.xu@intel.com>
60995c05 · Zhenzhong Xu · GitHub · 29e5d102 · 60995c05 · 60995c05
Unverified Commit 60995c05 authored Apr 15, 2026 by Zhenzhong Xu Committed by GitHub Apr 15, 2026
Showing 2 changed files with 34 additions and 1 deletion

tests/quantization/test_cpu_wna16.py +1 -0

vllm/model_executor/layers/quantization/inc.py +33 -1

No files found.
--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -12,6 +12,7 @@ MODELS = [
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",  # with g_idx
    "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",  # without g_idx
    "RedHatAI/Qwen3-1.7B-quantized.w4a16",  # with zp
+    "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",
 ]
 DTYPE = ["bfloat16"]


--- a/vllm/model_executor/layers/quantization/inc.py
+++ b/vllm/model_executor/layers/quantization/inc.py
@@ -414,6 +414,7 @@ class INCConfig(QuantizationConfig):

    def apply_xpu_w4a16_quant_layer(self, layer, prefix: str):
        weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
+
        if not self.check_quantized(weight_bits):
            if isinstance(layer, (LinearBase, ParallelLMHead)):
                return UnquantizedLinearMethod()
@@ -437,6 +438,27 @@ class INCConfig(QuantizationConfig):
            )
        return None

+    def apply_cpu_w4a16_quant_layer(self, layer, prefix: str):
+        weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
+        if not self.check_quantized(weight_bits):
+            if isinstance(layer, (LinearBase, ParallelLMHead)):
+                return UnquantizedLinearMethod()
+            else:
+                return None
+
+        if weight_bits != 4:
+            raise NotImplementedError(
+                f"INC on CPU only supports 4-bit quantization, "
+                f"got weight_bits={weight_bits}."
+            )
+        if not sym:
+            raise NotImplementedError(
+                "INC W4A16 on CPU only supports symmetric quantization for now."
+            )
+        if isinstance(layer, (LinearBase, ParallelLMHead)):
+            return self.apply_gptq_quant_layer(layer, prefix)
+        return None
+
    def get_quant_method(self, layer: torch.nn.Module, prefix: str):
        if prefix and self.extra_config:
            for layer_name in self.extra_config:
@@ -446,11 +468,21 @@ class INCConfig(QuantizationConfig):
                    return UnquantizedLinearMethod()
        if current_platform.is_xpu():
            return self.apply_xpu_w4a16_quant_layer(layer, prefix)
-        if "gptq" in self.packing_format or "gptq" in self.backend:
+        is_gptq = "gptq" in self.packing_format or "gptq" in self.backend
+        if current_platform.is_cpu() and is_gptq:
+            return self.apply_cpu_w4a16_quant_layer(layer, prefix)
+        if is_gptq:
            return self.apply_gptq_quant_layer(layer, prefix)
        if "awq" in self.packing_format or "awq" in self.backend:
            return self.apply_awq_quant_layer(layer, prefix)

+        raise NotImplementedError(
+            f"Unsupported quantization configuration for layer '{prefix}'. "
+            f"Platform: CPU={current_platform.is_cpu()}. "
+            f"Platform: XPU={current_platform.is_xpu()}. "
+            f"Format: {self.packing_format}, Backend: {self.backend}."
+        )
+
    @classmethod
    def override_quantization_method(
        cls, hf_quant_cfg, user_quant, hf_config=None