[Quant] Add `SupportsQuant` to phi3 and clip (#13104)

12913d17 · Kyle Sayers · GitHub · 80f63a39 · 12913d17 · 12913d17
Unverified commit 12913d17, authored Feb 15, 2025 by Kyle Sayers; committed by GitHub on Feb 15, 2025.
20 changed files
--- a/vllm/model_executor/layers/quantization/aqlm.py
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -169,6 +169,7 @@ class AQLMConfig(QuantizationConfig):
        num_codebooks: int,
        out_group_size: int,
    ) -> None:
+        super().__init__()
        self.in_group_size = in_group_size
        self.nbits_per_codebook = nbits_per_codebook
        self.num_codebooks = num_codebooks

--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -26,6 +26,7 @@ class AWQConfig(QuantizationConfig):
        zero_point: bool,
        modules_to_not_convert: Optional[List[str]] = None,
    ) -> None:
+        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.zero_point = zero_point

--- a/vllm/model_executor/layers/quantization/awq_marlin.py
+++ b/vllm/model_executor/layers/quantization/awq_marlin.py
@@ -47,6 +47,7 @@ class AWQMarlinConfig(QuantizationConfig):
                 lm_head_quantized: bool,
                 modules_to_not_convert: Optional[List[str]],
                 full_config: Dict[str, Any]) -> None:
+        super().__init__()
        self.pack_factor = 32 // weight_bits  # packed into int32
        self.group_size = group_size
        self.zero_point = zero_point

--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -2,7 +2,7 @@
 import inspect
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Mapping, Optional, Type
+from typing import Any, Dict, List, Optional, Type
 import torch
 from torch import nn
@@ -59,7 +59,11 @@ def method_has_implemented_embedding(
 class QuantizationConfig(ABC):
    """Base class for quantization configs."""
-    packed_modules_mapping: Mapping[str, List[str]] = dict()
+    def __init__(self):
+        super().__init__()
+        # mapping is updated by models as they initialize
+        self.packed_modules_mapping: Dict[str, List[str]] = dict()
    @abstractmethod
    def get_name(self) -> str:

--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -30,7 +30,7 @@ class BitsAndBytesConfig(QuantizationConfig):
        llm_int8_skip_modules: Optional[List[str]] = None,
        llm_int8_threshold: float = 6.0,
    ) -> None:
+        super().__init__()
        self.load_in_8bit = load_in_8bit
        self.load_in_4bit = load_in_4bit
        self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -51,7 +51,7 @@ class CompressedTensorsConfig(QuantizationConfig):
        kv_cache_scheme: Optional[Dict[str, Any]] = None,
        config: Optional[Dict[str, Any]] = None,
    ):
+        super().__init__()
        self.ignore = ignore
        self.quant_format = quant_format
        # Map from [target -> scheme]

--- a/vllm/model_executor/layers/quantization/deepspeedfp.py
+++ b/vllm/model_executor/layers/quantization/deepspeedfp.py
@@ -25,6 +25,7 @@ class DeepSpeedFPConfig(QuantizationConfig):
        weight_bits: int = 8,
        group_size: int = 512,
    ) -> None:
+        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.valid_types = [torch.bfloat16, torch.float16]

--- a/vllm/model_executor/layers/quantization/experts_int8.py
+++ b/vllm/model_executor/layers/quantization/experts_int8.py
@@ -17,7 +17,7 @@ class ExpertsInt8Config(QuantizationConfig):
    """Config class for Int8 experts quantization."""
    def __init__(self) -> None:
-        pass
+        super().__init__()
    @classmethod
    def get_name(cls) -> str:

--- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py
+++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py
@@ -29,6 +29,7 @@ class FBGEMMFp8Config(QuantizationConfig):
    """Config class for FBGEMM Fp8."""
    def __init__(self, ignore_list: List[str], input_scale_ub: float):
+        super().__init__()
        self.ignore_list = ignore_list if ignore_list else []
        self.input_scale_ub = input_scale_ub

--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -47,6 +47,7 @@ class Fp8Config(QuantizationConfig):
        ignored_layers: Optional[List[str]] = None,
        weight_block_size: Optional[List[int]] = None,
    ) -> None:
+        super().__init__()
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
            logger.warning("Detected fp8 checkpoint. Please note that the "

--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -20,7 +20,7 @@ class GGUFConfig(QuantizationConfig):
    """Config class for GGUF."""
    def __init__(self, ) -> None:
-        pass
+        super().__init__()
    def __repr__(self) -> str:
        return ("GGUFConfig()")

--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -58,6 +58,7 @@ class GPTQConfig(QuantizationConfig):
        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
        # }
+        super().__init__()
        self.dynamic = dynamic
        self.weight_bits = weight_bits

--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -46,6 +46,7 @@ class GPTQMarlinConfig(QuantizationConfig):
                 is_sym: bool, lm_head_quantized: bool,
                 dynamic: Dict[str, Dict[str, Union[int, bool]]],
                 full_config: Dict[str, Any]) -> None:
+        super().__init__()
        if desc_act and group_size == -1:
            # In this case, act_order == True is the same as act_order == False
            # (since we have only one group per output channel)

--- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py
@@ -38,6 +38,7 @@ class GPTQMarlin24Config(QuantizationConfig):
        weight_bits: int,
        group_size: int,
    ) -> None:
+        super().__init__()
        quant_type = {
            4: scalar_types.uint4b8,
            8: scalar_types.uint8b128,

--- a/vllm/model_executor/layers/quantization/hqq_marlin.py
+++ b/vllm/model_executor/layers/quantization/hqq_marlin.py
@@ -33,6 +33,7 @@ class HQQMarlinConfig(QuantizationConfig):
        group_size: int,
        skip_modules: Optional[List[str]] = None,
    ) -> None:
+        super().__init__()
        assert group_size == 64, ("The only supported HQQ group size is "
                                  "currently 64.")
        assert weight_bits == 4, ("The only supported HQQ quantization "

--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -35,6 +35,7 @@ class IPEXConfig(QuantizationConfig):
        desc_act: Optional[bool] = None,
        lm_head_quantized: Optional[bool] = None,
    ) -> None:
+        super().__init__()
        self.method = method
        self.weight_bits = weight_bits
        self.group_size = group_size

--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -28,6 +28,7 @@ class ModelOptFp8Config(QuantizationConfig):
        self,
        is_checkpoint_fp8_serialized: bool = False,
    ) -> None:
+        super().__init__()
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
            logger.warning("Detected ModelOpt fp8 checkpoint. Please note that"

--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -24,6 +24,7 @@ class MoeWNA16Config(QuantizationConfig):
                 group_size: int, has_zp: bool, lm_head_quantized: bool,
                 modules_to_not_convert: Optional[List[str]],
                 full_config: Dict[str, Any]) -> None:
+        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.has_zp = has_zp

--- a/vllm/model_executor/layers/quantization/neuron_quant.py
+++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -20,6 +20,7 @@ class NeuronQuantConfig(QuantizationConfig):
        dequant_dtype: str = "f16",
        quantize_method: str = "vector_dynamic",
    ) -> None:
+        super().__init__()
        self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8")
        if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST:
            raise ValueError(

--- a/vllm/model_executor/layers/quantization/qqq.py
+++ b/vllm/model_executor/layers/quantization/qqq.py
@@ -39,6 +39,7 @@ class QQQConfig(QuantizationConfig):
        group_size: int,
        is_sym: bool = True,
    ) -> None:
+        super().__init__()
        self.weight_bits = weight_bits
        self.group_size = group_size
        self.is_sym = is_sym