"vllm/vscode:/vscode.git/clone" did not exist on "bf3e05215c7f20baf9fcd82d8877d8453dcebf6e"
gptq.py 13.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

CHU Tianxiang's avatar
CHU Tianxiang committed
4
5
import enum
from enum import Enum
6
from fractions import Fraction
7
from typing import Any, Optional, Union
CHU Tianxiang's avatar
CHU Tianxiang committed
8
9

import torch
10
from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE
CHU Tianxiang's avatar
CHU Tianxiang committed
11
12
from torch.nn.parameter import Parameter

13
from vllm import _custom_ops as ops
14
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
15
from vllm.model_executor.layers.linear import LinearMethodBase
16
from vllm.model_executor.layers.quantization import QuantizationMethods
CHU Tianxiang's avatar
CHU Tianxiang committed
17
from vllm.model_executor.layers.quantization.base_config import (
18
    QuantizationConfig, QuantizeMethodBase)
19
20
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
    get_linear_quant_method)
21
22
23
24
25
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                           GroupQuantScaleParameter,
                                           PackedColumnParameter,
                                           PackedvLLMParameter,
                                           RowvLLMParameter)
26
27
from vllm.transformers_utils.config import get_safetensors_params_metadata
from vllm.utils import is_list_of
CHU Tianxiang's avatar
CHU Tianxiang committed
28
29
30
31
32
33
34
35
36
37
38
39
40


class GPTQConfig(QuantizationConfig):
    """Config class for GPTQ.

    Reference: https://arxiv.org/abs/2210.17323

    Attributes set by ``__init__``:
        weight_bits: Bit width of the quantized weights (2, 3, 4 or 8).
        group_size: Quantization group size as given in the config
            (``-1`` is treated by the linear method as "one group spanning
            the whole input dimension").
        desc_act: Whether the checkpoint was quantized with act-order.
        lm_head_quantized: Value of the ``lm_head`` config key.
        dynamic: Per-module override rules (see the comment in
            ``__init__``).
        pack_factor: ``Fraction(32, weight_bits)``, i.e. how many quantized
            values are packed into one int32.
        modules_in_block_to_quantize: Module names to quantize; an empty
            list when the config does not provide any.
        autoround_version: Identifies GPTQ models quantized by autoround.
    """

    def __init__(
        self,
        weight_bits: int,
        group_size: int,
        desc_act: bool,
        lm_head_quantized: bool,
        dynamic: dict[str, dict[str, Union[int, bool]]],
        autoround_version: str = "",
        modules_in_block_to_quantize: Optional[list[str]] = None,
    ) -> None:
        """Store the GPTQ settings and derive ``pack_factor``.

        Raises:
            ValueError: If ``weight_bits`` is not one of 2, 3, 4 or 8.
        """
        # GPTQModel uses the `dynamic` config property to allow per module
        # quantization config so each module can be individually optimized.
        # Format is dict[str, dict] where key is a regex string that can
        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
        # matching of a module.
        # Default to positive match, override base quant config mode, if no
        # prefix is used. Value is in dict format of field key and override
        # value.
        # Negative matching will skip quantization init for this module
        # entirely:
        # non-quantized inference. More details and quantization examples can be
        # found at: https://github.com/ModelCloud/GPTQModel
        # Example:
        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
        # dynamic = {
        #  #`.*\.` matches the layers_node prefix
        #  # positive match layer 10-15
        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
        #  # positive match layer 16-21
        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
        # }
        super().__init__()

        self.dynamic = dynamic

        self.weight_bits = weight_bits
        self.group_size = group_size
        self.desc_act = desc_act
        self.lm_head_quantized = lm_head_quantized

        # Validate before building the Fraction: an unsupported value such as
        # 0 must surface as the descriptive ValueError below rather than as a
        # ZeroDivisionError raised by Fraction(32, 0).
        if self.weight_bits not in [2, 3, 4, 8]:
            raise ValueError(
                "Currently, only 2/3/4/8-bit weight quantization is "
                f"supported for GPTQ, but got {self.weight_bits} bits.")
        self.pack_factor = Fraction(32, self.weight_bits)

        self.modules_in_block_to_quantize = modules_in_block_to_quantize or []

        # used to identify GPTQ model quantized by autoround
        self.autoround_version = autoround_version

    def __repr__(self) -> str:
        """Return a debug representation listing the main settings."""
        return (
            f"GPTQConfig(weight_bits={self.weight_bits}, "
            f"group_size={self.group_size}, "
            f"desc_act={self.desc_act}, "
            f"lm_head_quantized={self.lm_head_quantized}, "
            f"dynamic={self.dynamic}, "
            f"modules_in_block_to_quantize={self.modules_in_block_to_quantize})"
        )

    @classmethod
    def get_name(cls) -> QuantizationMethods:
        """Return the identifier of this quantization method."""
        return "gptq"

    @classmethod
    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
        """Return the activation dtypes supported by this method."""
        return [torch.half]

    @classmethod
    # Need to figure it out
    def get_min_capability(cls) -> int:
        """Return the minimum capability value required by this method."""
        return 60

    @classmethod
    def get_config_filenames(cls) -> list[str]:
        """Return the file names that may hold the quantization config."""
        return ["quantize_config.json"]

    @classmethod
    def from_config(cls, config: dict[str, Any]) -> "GPTQConfig":
        """Build a ``GPTQConfig`` from a parsed quantization config dict.

        Required keys: ``bits``, ``group_size``, ``desc_act``.
        Optional keys (with defaults): ``dynamic`` (``{}``), ``lm_head``
        (``False``), ``autoround_version`` (``""``) and
        ``modules_in_block_to_quantize`` (``None``).
        """
        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
        # The key may be present with an explicit null value.
        dynamic = {} if dynamic is None else dynamic

        weight_bits = cls.get_from_keys(config, ["bits"])
        group_size = cls.get_from_keys(config, ["group_size"])
        desc_act = cls.get_from_keys(config, ["desc_act"])
        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
                                                 default=False)
        autoround_version = cls.get_from_keys_or(config, ["autoround_version"],
                                                 default="")
        modules_in_block_to_quantize = cls.get_from_keys_or(
            config, ["modules_in_block_to_quantize"], default=None)
        return cls(weight_bits, group_size, desc_act, lm_head_quantized,
                   dynamic, autoround_version, modules_in_block_to_quantize)

    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> Optional[Union["GPTQLinearMethod", "QuantizeMethodBase"]]:
        """Select the quantization method object for ``layer``.

        ``FusedMoE`` layers are delegated to ``MoeWNA16Config``; every other
        layer is resolved through ``get_linear_quant_method`` with
        ``GPTQLinearMethod`` as the linear method class.
        """
        if isinstance(layer, FusedMoE):
            # GPTQ MoE support: fall back to MoeWNA16 for broad compatibility
            from .moe_wna16 import MoeWNA16Config

            config = {
                "quant_method": "gptq",
                "bits": self.weight_bits,
                "group_size": self.group_size,
                "sym": True,  # GPTQ typically uses symmetric quantization
                "lm_head": False,
            }
            return MoeWNA16Config.from_config(config).get_quant_method(
                layer, prefix)

        return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod)

    def apply_vllm_mapper(self, hf_to_vllm_mapper):
        """Rewrite ``modules_in_block_to_quantize`` via the given mapper."""
        if self.modules_in_block_to_quantize is not None:
            self.modules_in_block_to_quantize = hf_to_vllm_mapper.apply_list(
                self.modules_in_block_to_quantize)

    def maybe_update_config(self,
                            model_name: str,
                            revision: Optional[str] = None):
        """Normalize or infer ``modules_in_block_to_quantize``.

        If the list is already populated it is only flattened (when it is a
        list of lists) and nothing else happens. Otherwise the safetensors
        parameter metadata of ``model_name`` is inspected and every module
        owning a parameter whose dtype is not fp16/bf16/fp32 is recorded as
        quantized.
        """
        if self.modules_in_block_to_quantize:
            if is_list_of(self.modules_in_block_to_quantize, list):
                # original modules_in_block_to_quantize: list[list[str]]
                # flatten original modules_in_block_to_quantize
                self.modules_in_block_to_quantize = [
                    item for sublist in self.modules_in_block_to_quantize
                    for item in sublist
                ]
            return

        unquant_dtypes = [torch.float16, torch.bfloat16, torch.float32]
        metadata = get_safetensors_params_metadata(model_name,
                                                   revision=revision)
        # Strip the trailing parameter name (e.g. ".qweight") to obtain the
        # owning module's name.
        quant_layers: set[str] = {
            param_name.rsplit(".", 1)[0]
            for param_name, info in metadata.items()
            if (dtype := info.get('dtype', None))
            and _SAFETENSORS_TO_TORCH_DTYPE[dtype] not in unquant_dtypes
        }
        # Sort so the resulting list does not depend on set iteration order.
        self.modules_in_block_to_quantize = sorted(quant_layers)

CHU Tianxiang's avatar
CHU Tianxiang committed
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198

@enum.unique
class ExllamaState(Enum):
    """State of the Exllama kernel path for a GPTQ linear layer."""

    # Exllama is not used for this layer.
    UNUSED = enum.auto()
    # Exllama will be used, but the loaded weights are not shuffled yet.
    UNINITIALIZED = enum.auto()
    # Weights have been shuffled; the Exllama path can be used.
    READY = enum.auto()


class GPTQLinearMethod(LinearMethodBase):
    """Linear method for GPTQ.

    Args:
        quant_config: The GPTQ quantization config.
    """

    def __init__(self, quant_config: GPTQConfig):
        self.quant_config = quant_config

    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
        output_partition_sizes: list[int],
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        """Allocate the GPTQ parameters and register them on ``layer``.

        Registers ``qweight``, ``g_idx``, ``qzeros`` and ``scales`` and sets
        ``layer.exllama_state``.

        Args:
            layer: Module that receives the parameters.
            input_size_per_partition: Input features held by this partition.
            output_partition_sizes: Output sizes of the logical sub-weights
                held by this partition; their sum is the partition's output
                size.
            input_size: Full (unpartitioned) input size.
            output_size: Unused.
            params_dtype: dtype of the ``scales`` tensor.
            **extra_weight_attrs: Only ``weight_loader`` is read here.

        Raises:
            ValueError: If the partitioned input size is not a multiple of
                the group size, or the partitioned output size is not a
                multiple of the pack factor's numerator.
        """
        del output_size  # Unused.
        weight_loader = extra_weight_attrs.get("weight_loader")
        if input_size_per_partition % self.quant_config.group_size != 0:
            raise ValueError(
                "The input size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")
        output_size_per_partition = sum(output_partition_sizes)
        if (output_size_per_partition % self.quant_config.pack_factor.numerator
                != 0):
            raise ValueError(
                "The output size is not aligned with the quantized "
                "weight shape. This can be caused by too large "
                "tensor parallel size.")

        # group_size == -1 means a single group covering the full input.
        if self.quant_config.group_size != -1:
            group_size = self.quant_config.group_size
        else:
            group_size = input_size
        exllama_state = ExllamaState.UNINITIALIZED
        scale_and_zero_size = input_size // group_size
        scale_and_zero_input_dim = None
        if (input_size != input_size_per_partition
                and self.quant_config.group_size != -1):
            # For act-order models, we cannot use Exllama for row parallel layer
            if self.quant_config.desc_act:
                exllama_state = ExllamaState.UNUSED
            else:
                # we need to partition qzeros and scales for exllama kernel
                scale_and_zero_size = input_size_per_partition // group_size
                scale_and_zero_input_dim = 0

        # Packed weights: (input // pack_factor, output), int32.
        qweight = PackedvLLMParameter(
            data=torch.empty(
                input_size_per_partition // self.quant_config.pack_factor,
                output_size_per_partition,
                dtype=torch.int32,
            ),
            input_dim=0,
            output_dim=1,
            packed_dim=0,
            packed_factor=self.quant_config.pack_factor,
            weight_loader=weight_loader)

        # Placeholder group index: input channel i belongs to group
        # i // group_size. The effective group size is used so that
        # group_size == -1 yields group 0 everywhere instead of negative
        # indices.
        g_idx = RowvLLMParameter(
            data=torch.div(
                torch.arange(input_size_per_partition, dtype=torch.int32),
                group_size,
                rounding_mode="floor",
            ),
            input_dim=0,
            weight_loader=weight_loader)

        # Packed zero points: (groups, output // pack_factor), int32.
        qzeros_args = {
            "data":
            torch.empty(
                scale_and_zero_size,
                output_size_per_partition // self.quant_config.pack_factor,
                dtype=torch.int32,
            ),
            "weight_loader":
            weight_loader
        }
        # Scales: (groups, output) in params_dtype.
        weight_scale_args = {
            "data":
            torch.empty(
                scale_and_zero_size,
                output_size_per_partition,
                dtype=params_dtype,
            ),
            "weight_loader":
            weight_loader
        }
        if scale_and_zero_input_dim is None:
            # scales/qzeros are not partitioned along the input dimension.
            scales = ChannelQuantScaleParameter(output_dim=1,
                                                **weight_scale_args)
            qzeros = PackedColumnParameter(
                output_dim=1,
                packed_dim=1,
                packed_factor=self.quant_config.pack_factor,
                **qzeros_args)

        else:
            # scales/qzeros are partitioned along the input dimension too.
            scales = GroupQuantScaleParameter(output_dim=1,
                                              input_dim=0,
                                              **weight_scale_args)
            qzeros = PackedvLLMParameter(
                input_dim=0,
                output_dim=1,
                packed_dim=1,
                packed_factor=self.quant_config.pack_factor,
                **qzeros_args)

        layer.register_parameter("qweight", qweight)
        layer.register_parameter("g_idx", g_idx)
        layer.register_parameter("qzeros", qzeros)
        layer.register_parameter("scales", scales)

        layer.exllama_state = exllama_state

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        """Finalize the loaded parameters and prepare the Exllama path.

        Re-wraps the four parameters as plain ``Parameter`` objects and, when
        ``layer.exllama_state`` is ``UNINITIALIZED``, rewrites ``g_idx``,
        shuffles ``qweight`` via ``ops.gptq_shuffle`` and marks the layer
        ``READY``.
        """
        # for torch.compile
        layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False)
        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
        layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False)
        layer.scales = Parameter(layer.scales.data, requires_grad=False)

        # exllama needs to shuffle the weight after the weight is loaded;
        # the shuffle is performed here, once, right after loading.
        if layer.exllama_state == ExllamaState.UNINITIALIZED:
            if self.quant_config.desc_act:
                layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int)
            else:
                # Without act-order an empty g_idx is handed to the kernels.
                layer.g_idx.data = torch.empty((0, ),
                                               dtype=torch.int,
                                               device=layer.g_idx.device)
            layer.exllama_state = ExllamaState.READY
            ops.gptq_shuffle(layer.qweight, layer.g_idx,
                             self.quant_config.weight_bits)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Compute the quantized linear output for ``x``.

        ``x`` is flattened to 2-D, passed to ``ops.gptq_gemm`` together with
        the layer's quantized parameters, ``bias`` (if given) is added in
        place, and the result is reshaped to
        ``x.shape[:-1] + (layer.qweight.shape[-1],)``.
        """
        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
        reshaped_x = x.reshape(-1, x.shape[-1])

        output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                               layer.scales, layer.g_idx,
                               layer.exllama_state == ExllamaState.READY,
                               self.quant_config.weight_bits)
        if bias is not None:
            output.add_(bias)
        return output.reshape(out_shape)