[Bugfix] fix moe_wna16 get_quant_method (#12648)

Fixes https://github.com/vllm-project/vllm/issues/12647. The `get_quant_method` of `moe_wna16` always returns the MoE method, a GPTQ-based linear method, or an AWQ-based linear method, even when the target module is an attention layer. See https://github.com/vllm-project/vllm/blob/baeded25699f9f4851843306f27f685c4d4ee7c5/vllm/attention/layer.py#L86-L92. Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>

[Bugfix] fix moe_wna16 get_quant_method (#12648)
Fixes https://github.com/vllm-project/vllm/issues/12647. The `get_quant_method` of `moe_wna16` always returns the MoE method, a GPTQ-based linear method, or an AWQ-based linear method, even when the target module is an attention layer. See https://github.com/vllm-project/vllm/blob/baeded25699f9f4851843306f27f685c4d4ee7c5/vllm/attention/layer.py#L86-L92. Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
baaa2b24 · Jinzhen Lin · GitHub · b4e5c033 · baaa2b24
Unverified Commit baaa2b24 authored Feb 02, 2025 by Jinzhen Lin Committed by GitHub Feb 02, 2025
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 15 deletions

vllm/model_executor/layers/quantization/moe_wna16.py vllm/model_executor/layers/quantization/moe_wna16.py +12 -15

No files found.
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -6,16 +6,13 @@ from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group
 from vllm.model_executor.layers.fused_moe.layer import (
    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization.awq import (AWQConfig,
-                                                         AWQLinearMethod)
-from vllm.model_executor.layers.quantization.awq_marlin import (
-    AWQMarlinConfig, AWQMarlinLinearMethod)
+from vllm.model_executor.layers.quantization.awq import AWQConfig
+from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.gptq import (GPTQConfig,
-                                                          GPTQLinearMethod)
+from vllm.model_executor.layers.quantization.gptq import GPTQConfig
 from vllm.model_executor.layers.quantization.gptq_marlin import (
-    GPTQMarlinConfig, GPTQMarlinLinearMethod)
+    GPTQMarlinConfig)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform

@@ -131,18 +128,18 @@ class MoeWNA16Config(QuantizationConfig):
        else:
            if self.linear_quant_method == "gptq":
                if self.use_marlin:
-                    return GPTQMarlinLinearMethod(
-                        GPTQMarlinConfig.from_config(self.full_config))
+                    return GPTQMarlinConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
                else:
-                    return GPTQLinearMethod(
-                        GPTQConfig.from_config(self.full_config))
+                    return GPTQConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
            elif self.linear_quant_method == "awq":
                if self.use_marlin:
-                    return AWQMarlinLinearMethod(
-                        AWQMarlinConfig.from_config(self.full_config))
+                    return AWQMarlinConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
                else:
-                    return AWQLinearMethod(
-                        AWQConfig.from_config(self.full_config))
+                    return AWQConfig.from_config(
+                        self.full_config).get_quant_method(layer, prefix)
            else:
                raise ValueError("moe_wna16 only support gptq and awq.")