"vllm/vscode:/vscode.git/clone" did not exist on "0130223bd9900710a0d93e46a4255ec5d1a077a8"
gptq.py 13.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

CHU Tianxiang's avatar
CHU Tianxiang committed
4
5
import enum
from enum import Enum
6
from fractions import Fraction
7
from typing import TYPE_CHECKING, Any, Optional, Union
CHU Tianxiang's avatar
CHU Tianxiang committed
8
9

import torch
10
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
CHU Tianxiang's avatar
CHU Tianxiang committed
11
12
from torch.nn.parameter import Parameter

13
from vllm import _custom_ops as ops
14
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
15
from vllm.model_executor.layers.linear import LinearMethodBase
CHU Tianxiang's avatar
CHU Tianxiang committed
16
from vllm.model_executor.layers.quantization.base_config import (
17
    QuantizationConfig, QuantizeMethodBase)
18
19
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
    get_linear_quant_method)
20
21
22
23
24
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                           GroupQuantScaleParameter,
                                           PackedColumnParameter,
                                           PackedvLLMParameter,
                                           RowvLLMParameter)
25
26
from vllm.transformers_utils.config import get_safetensors_params_metadata
from vllm.utils import is_list_of
CHU Tianxiang's avatar
CHU Tianxiang committed
27

28
29
30
31
32
# `QuantizationMethods` is imported only for static type checkers; at runtime
# the name is bound to plain `str`, so annotations that mention it still
# resolve. NOTE(review): presumably this avoids a circular import with the
# quantization package -- confirm.
if TYPE_CHECKING:
    from vllm.model_executor.layers.quantization import QuantizationMethods
else:
    QuantizationMethods = str

CHU Tianxiang's avatar
CHU Tianxiang committed
33
34
35
36
37
38
39
40
41
42
43
44

class GPTQConfig(QuantizationConfig):
    """Config class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        desc_act: bool,
        lm_head_quantized: bool,
        dynamic: dict[str, dict[str, Union[int, bool]]],
        autoround_version: str = "",
        modules_in_block_to_quantize: Optional[list[str]] = None,
    ) -> None:
        """Store the GPTQ quantization settings.

        Args:
            weight_bits: Bit width of the quantized weights; must be one of
                2, 3, 4 or 8.
            group_size: Quantization group size (``-1`` is treated elsewhere
                in this file as "one group spanning the whole input").
            desc_act: Whether the checkpoint uses act-order (``desc_act``).
            lm_head_quantized: Whether the LM head is quantized.
            dynamic: Per-module override rules (see the comment below).
            autoround_version: Marker identifying checkpoints produced by
                autoround; empty string otherwise.
            modules_in_block_to_quantize: Optional list of module names;
                ``None`` is stored as an empty list.

        Raises:
            ValueError: If ``weight_bits`` is not 2, 3, 4 or 8.
        """
        # GPTQModel use `dynamic` config property to allow per module
        # quantization config so each module can be individually optimized.
        # Format is dict[str, dict] where key is a regex string that can
        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
        # matching of a module.
        # Default to positive match, override base quant config mode, if no
        # prefix is used. Value is in dict format of field key and override
        # value.
        # Negative matching will skip quantization init for this module
        # entirely, i.e. the module runs non-quantized inference. More
        # details and quantization examples can be found at:
        # https://github.com/ModelCloud/GPTQModel
        # Example:
        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
        # dynamic = {
        #  #`.*\.` matches the layers_node prefix
        #  # positive match layer 10-15
        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
        #  # positive match layer 16-21
        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
        # }
        super().__init__()
        self.dynamic = dynamic

        self.weight_bits = weight_bits
        self.group_size = group_size
        self.desc_act = desc_act
        self.lm_head_quantized = lm_head_quantized

        # Validate before deriving pack_factor: Fraction(32, 0) would raise
        # ZeroDivisionError and hide the intended, descriptive ValueError.
        if self.weight_bits not in [2, 3, 4, 8]:
            raise ValueError(
                "Currently, only 2/3/4/8-bit weight quantization is "
                f"supported for GPTQ, but got {self.weight_bits} bits.")
        # Exact ratio 32 / weight_bits; kept as a Fraction because 3-bit
        # weights do not divide 32 evenly.
        self.pack_factor = Fraction(32, self.weight_bits)

        self.modules_in_block_to_quantize = modules_in_block_to_quantize or []

        # used to identify GPTQ model quantized by autoround
        self.autoround_version = autoround_version

    def __repr__(self) -> str:
        """Return a readable summary of the main configuration fields."""
        return (
            f"GPTQConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size}, "
            f"desc_act={self.desc_act}, "
            f"lm_head_quantized={self.lm_head_quantized}, "
            f"dynamic={self.dynamic}, "
            f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the identifier of this quantization method."""
        return "gptq"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes supported by this method."""
        return [torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum capability value required by this method.

        NOTE(review): the original author marked this value as
        "Need to figure it out"; 60 is unverified.
        """
        return 60

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the file names that may hold the quantization config."""
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "GPTQConfig":
        """Build a ``GPTQConfig`` from a parsed quantization config dict.

        ``bits``, ``group_size`` and ``desc_act`` are read with
        ``get_from_keys``; ``dynamic``, ``lm_head``, ``autoround_version``
        and ``modules_in_block_to_quantize`` are read with
        ``get_from_keys_or`` and fall back to a default when absent.
        """
        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
        # An explicit `"dynamic": null` in the config is treated like "absent".
        dynamic = {} if dynamic is None else dynamic

        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        desc_act = cls.get_from_keys(config, ["desc_act"])
        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
                                                 default=False)
        autoround_version = cls.get_from_keys_or(config, ["autoround_version"],
                                                 default="")
        modules_in_block_to_quantize = cls.get_from_keys_or(
            config, ["modules_in_block_to_quantize"], default=None)
        return cls(weight_bits, group_size, desc_act, lm_head_quantized,
                   dynamic, autoround_version, modules_in_block_to_quantize)

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
        """Select the quantization method object for ``layer``.

        ``FusedMoE`` layers are delegated to a ``MoeWNA16Config`` built from
        this config's bit width and group size; every other layer goes
        through ``get_linear_quant_method`` with ``GPTQLinearMethod``.
        """
        if isinstance(layer, FusedMoE):
            # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
            from .moe_wna16 import MoeWNA16Config

            config = {
                "quant_method": "gptq",
                "bits": self.weight_bits,
                "group_size": self.group_size,
                "sym": True,  # GPTQ typically uses symmetric quantization
                "lm_head": False,
            }
            return MoeWNA16Config.from_config(config).get_quant_method(
                layer, prefix)

        return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)

    def apply_vllm_mapper(self, hf_to_vllm_mapper):
        """Rewrite ``modules_in_block_to_quantize`` through the given mapper.

        The list is replaced by ``hf_to_vllm_mapper.apply_list(...)`` of
        its current value.
        """
        if self.modules_in_block_to_quantize is not None:
            self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list(
                self.modules_in_block_to_quantize)

    def maybe_update_config(self,
                            model_name: str,
                            revision: Optional[str] = None):
        """Normalize or infer ``modules_in_block_to_quantize``.

        If the list is already populated it is only flattened (when it is a
        list of lists) and the method returns. Otherwise the safetensors
        parameter metadata of ``model_name`` is read, and the parent module
        name of every parameter whose dtype is not float16 / bfloat16 /
        float32 is recorded as a quantized module.
        """
        if self.modules_in_block_to_quantize:
            if is_list_of(self.modules_in_block_to_quantize, list):
                # original modules_in_block_to_quantize: list[list[str]]
                # flatten original modules_in_block_to_quantize
                self.modules_in_block_to_quantize = [
                    item for sublist in self.modules_in_block_to_quantize
                    for item in sublist
                ]
            return

        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
        metadata = get_safetensors_params_metadata(model_name,
                                                   revision=revision)
        # "a.b.qweight" -> "a.b": strip the trailing parameter name so the
        # set holds module names; entries without a dtype are ignored.
        quant_layers: set[str] = {
            param_name.rsplit(".", 1)[0]
            for param_name, info in metadata.items()
            if (dtype := info.get('dtype', None))
            and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
        }
        self.modules_in_block_to_quantize = list(quant_layers)

CHU Tianxiang's avatar
CHU Tianxiang committed
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202

class ExllamaState(Enum):
    """State flag stored on a layer as ``layer.exllama_state``."""

    # Explicit values match what `enum.auto()` assigns (1, 2, 3 in order).
    UNUSED = 1
    UNINITIALIZED = 2
    READY = 3


class GPTQLinearMethod(LinearMethodBase):
    """Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    """

    def __init__(self, quant_config: GPTQConfig):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate and register the GPTQ parameters on ``layer``.

        Registers ``qweight``, ``g_idx``, ``qzeros`` and ``scales`` (in that
        order) and stores the initial ``ExllamaState`` on
        ``layer.exllama_state``.

        Raises:
            ValueError: If the per-partition input size is not a multiple of
                the configured group size, or the per-partition output size
                is not a multiple of the pack factor's numerator.
        """
        del output_size  # Unused.
        cfg = self.quant_config
        pack_factor = cfg.pack_factor
        weight_loader = extra_weight_attrs.get("weight_loader")

        if input_size_per_partition % cfg.group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")

        output_size_per_partition = sum(output_partition_sizes)
        if output_size_per_partition % pack_factor.numerator != 0:
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")

        # A configured group size of -1 means a single group that spans the
        # whole (unpartitioned) input dimension.
        group_size = input_size if cfg.group_size == -1 else cfg.group_size

        exllama_state = ExllamaState.UNINITIALIZED
        scale_and_zero_size = input_size // group_size
        shard_scales_and_zeros = False
        input_is_partitioned = input_size != input_size_per_partition
        if input_is_partitioned and cfg.group_size != -1:
            if cfg.desc_act:
                # For act-order models, we cannot use Exllama for row
                # parallel layer
                exllama_state = ExllamaState.UNUSED
            else:
                # we need to partition qzeros and scales for exllama kernel
                scale_and_zero_size = input_size_per_partition // group_size
                shard_scales_and_zeros = True

        qweight_data = torch.empty(
            input_size_per_partition // pack_factor,
            output_size_per_partition,
            dtype=torch.int32,
        )
        qweight = PackedvLLMParameter(data=qweight_data,
                                      input_dim=0,
                                      output_dim=1,
                                      packed_dim=0,
                                      packed_factor=pack_factor,
                                      weight_loader=weight_loader)

        # NOTE(review): this placeholder divides by the *configured* group
        # size, so with group_size == -1 the initial values are 0, -1, -2, ...
        # Presumably they are overwritten by the checkpoint's g_idx -- confirm.
        g_idx_data = torch.tensor(
            [i // cfg.group_size for i in range(input_size_per_partition)],
            dtype=torch.int32,
        )
        g_idx = RowvLLMParameter(data=g_idx_data,
                                 input_dim=0,
                                 weight_loader=weight_loader)

        qzeros_data = torch.empty(
            scale_and_zero_size,
            output_size_per_partition // pack_factor,
            dtype=torch.int32,
        )
        scales_data = torch.empty(
            scale_and_zero_size,
            output_size_per_partition,
            dtype=params_dtype,
        )

        if shard_scales_and_zeros:
            # Scales / zeros are partitioned along the input dimension too.
            scales = GroupQuantScaleParameter(data=scales_data,
                                              weight_loader=weight_loader,
                                              output_dim=1,
                                              input_dim=0)
            qzeros = PackedvLLMParameter(data=qzeros_data,
                                         weight_loader=weight_loader,
                                         input_dim=0,
                                         output_dim=1,
                                         packed_dim=1,
                                         packed_factor=pack_factor)
        else:
            scales = ChannelQuantScaleParameter(data=scales_data,
                                                weight_loader=weight_loader,
                                                output_dim=1)
            qzeros = PackedColumnParameter(data=qzeros_data,
                                           weight_loader=weight_loader,
                                           output_dim=1,
                                           packed_dim=1,
                                           packed_factor=pack_factor)

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("g_idx", g_idx)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

        layer.exllama_state = exllama_state

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Finalize the loaded parameters and prepare them for Exllama.

        Each GPTQ parameter is re-wrapped as a plain, non-trainable
        ``Parameter``. If the layer is still ``UNINITIALIZED`` for Exllama,
        ``g_idx`` is replaced (argsort for act-order, empty tensor
        otherwise), the state becomes ``READY`` and ``ops.gptq_shuffle`` is
        invoked on the weights.
        """
        # for torch.compile
        for name in ("qzeros", "qweight", "g_idx", "scales"):
            plain = Parameter(getattr(layer, name).data, requires_grad=False)
            setattr(layer, name, plain)

        if layer.exllama_state != ExllamaState.UNINITIALIZED:
            return

        # exllama needs to shuffle the weight after the weight is loaded;
        # the shuffle is performed right here, once loading has finished.
        if self.quant_config.desc_act:
            new_g_idx = torch.argsort(layer.g_idx).to(torch.int)
        else:
            new_g_idx = torch.empty((0, ),
                                    dtype=torch.int,
                                    device=layer.g_idx.device)
        layer.g_idx.data = new_g_idx
        layer.exllama_state = ExllamaState.READY
        ops.gptq_shuffle(layer.qweight, layer.g_idx,
                         self.quant_config.weight_bits)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the GPTQ GEMM for ``x`` and optionally add ``bias``.

        ``x`` is flattened to 2-D over its last dimension, passed to
        ``ops.gptq_gemm``, and the result is reshaped so that the leading
        dimensions of ``x`` are kept and the last dimension equals
        ``layer.qweight.shape[-1]``.
        """
        in_features = x.shape[-1]
        out_features = layer.qweight.shape[-1]
        out_shape = x.shape[:-1] + (out_features, )
        flat_x = x.reshape(-1, in_features)

        use_exllama = layer.exllama_state == ExllamaState.READY
        output = ops.gptq_gemm(flat_x, layer.qweight, layer.qzeros,
                               layer.scales, layer.g_idx, use_exllama,
                               self.quant_config.weight_bits)
        if bias is not None:
            # In-place add on the freshly produced GEMM output.
            output.add_(bias)
        return output.reshape(out_shape)