fp8.py 22.6 KB
Newer Older
1
from typing import Any, Dict, List, Optional
2
3
4
5
6

import torch
from torch.nn import Module
from torch.nn.parameter import Parameter

7
import vllm.envs as envs
8
from vllm import _custom_ops as ops
9
from vllm.logger import init_logger
10
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEMethodBase
11
12
from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod)
13
from vllm.model_executor.layers.quantization.base_config import (
14
    QuantizationConfig, QuantizeMethodBase)
15
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
16
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
17
    apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
18
19
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    is_layer_skipped)
20
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
21
    all_close_1d, apply_fp8_linear, convert_to_channelwise,
22
    cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize,
23
    requantize_with_max_scale)
24
25
from vllm.model_executor.parameter import (ModelWeightParameter,
                                           PerTensorScaleParameter)
26
from vllm.model_executor.utils import set_weight_attrs
27
from vllm.platforms import current_platform
28
from vllm.utils import is_hip, print_warning_once
29

30
31
32
33
ACTIVATION_SCHEMES = ["static", "dynamic"]

logger = init_logger(__name__)

34

35
class Fp8Config(QuantizationConfig):
36
37
    """Config class for FP8."""

38
39
    def __init__(
        self,
40
        is_checkpoint_fp8_serialized: bool = False,
41
        activation_scheme: str = "dynamic",
42
        ignored_layers: Optional[List[str]] = None,
43
    ) -> None:
44
45
46
47
48
49
50
        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
        if is_checkpoint_fp8_serialized:
            logger.warning("Detected fp8 checkpoint. Please note that the "
                           "format is experimental and subject to change.")
        if activation_scheme not in ACTIVATION_SCHEMES:
            raise ValueError(
                f"Unsupported activation scheme {activation_scheme}")
51
        self.activation_scheme = activation_scheme
52
        self.ignored_layers = ignored_layers or []
53

54
55
56
57
58
59
60
61
62
63
    @classmethod
    def get_name(cls) -> str:
        return "fp8"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        return [torch.bfloat16, torch.half]

    @classmethod
    def get_min_capability(cls) -> int:
64
        return 80
65
66
67
68
69
70

    @classmethod
    def get_config_filenames(cls) -> List[str]:
        return []

    @classmethod
71
    def from_config(cls, config: Dict[str, Any]) -> "Fp8Config":
72
73
        quant_method = cls.get_from_keys(config, ["quant_method"])
        is_checkpoint_fp8_serialized = ("fp8" in quant_method)
74
        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
75
        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
76
        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
77
78
                   activation_scheme=activation_scheme,
                   ignored_layers=ignored_layers)
79

80
81
    def get_quant_method(self, layer: torch.nn.Module,
                         prefix: str) -> Optional["QuantizeMethodBase"]:
82
83
        from vllm.attention.layer import Attention  # Avoid circular import

84
        if isinstance(layer, LinearBase):
85
86
            if is_layer_skipped(prefix, self.ignored_layers):
                return UnquantizedLinearMethod()
87
            return Fp8LinearMethod(self)
88
89
90
        elif isinstance(layer, FusedMoE):
            return Fp8MoEMethod(self)
        elif isinstance(layer, Attention):
91
            return Fp8KVCacheMethod(self)
92
        return None
93
94
95
96
97
98
99

    def get_scaled_act_names(self) -> List[str]:
        return []


class Fp8LinearMethod(LinearMethodBase):
    """Linear method for FP8.
100
101
102
103
104
105
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.
106
107
108
109
110

    Limitations:
    1. Only support per-tensor quantization due to torch._scaled_mm support.
    2. Only support float8_e4m3fn data type due to the limitation of
       torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)
111

112
113
114
115
    Args:
        quant_config: The quantization config.
    """

116
    def __init__(self, quant_config: Fp8Config):
117
        self.quant_config = quant_config
118
        self.cutlass_fp8_supported = cutlass_fp8_supported()
119

120
121
122
123
        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
        # kernel for fast weight-only FP8 quantization
        capability = current_platform.get_device_capability()
        capability = capability[0] * 10 + capability[1]
124
        self.use_marlin = capability < 89 or envs.VLLM_TEST_FORCE_FP8_MARLIN
125
126
127
        # Disable marlin for rocm
        if is_hip():
            self.use_marlin = False
128

129
130
131
132
    def create_weights(
        self,
        layer: torch.nn.Module,
        input_size_per_partition: int,
133
        output_partition_sizes: List[int],
134
135
136
137
138
        input_size: int,
        output_size: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
139
        del input_size, output_size
140
        output_size_per_partition = sum(output_partition_sizes)
141
        weight_loader = extra_weight_attrs.get("weight_loader")
142
143
144

        layer.logical_widths = output_partition_sizes

145
146
147
148
        layer.input_size_per_partition = input_size_per_partition
        layer.output_size_per_partition = output_size_per_partition
        layer.orig_dtype = params_dtype

149
150
151
152
        # WEIGHT
        weight_dtype = (torch.float8_e4m3fn
                        if self.quant_config.is_checkpoint_fp8_serialized else
                        params_dtype)
153
154
155
156
157
158
159
160

        weight = ModelWeightParameter(data=torch.empty(
            output_size_per_partition,
            input_size_per_partition,
            dtype=weight_dtype),
                                      input_dim=1,
                                      output_dim=0,
                                      weight_loader=weight_loader)
161
162
        layer.register_parameter("weight", weight)

163
164
165
166
        # If checkpoint is serialized fp8, load them.
        # Otherwise, wait until process_weights_after_loading.
        if self.quant_config.is_checkpoint_fp8_serialized:
            # WEIGHT SCALE
167
168
169
170
171
            scale = PerTensorScaleParameter(data=torch.empty(
                len(output_partition_sizes), dtype=torch.float32),
                                            weight_loader=weight_loader)

            scale[:] = torch.finfo(torch.float32).min
172
            layer.register_parameter("weight_scale", scale)
173

174
            # INPUT ACTIVATION SCALE
175
            if self.quant_config.activation_scheme == "static":
176
177
178
179
180
                scale = PerTensorScaleParameter(data=torch.empty(
                    len(output_partition_sizes), dtype=torch.float32),
                                                weight_loader=weight_loader)

                scale[:] = torch.finfo(torch.float32).min
181
                layer.register_parameter("input_scale", scale)
182
183
            else:
                layer.register_parameter("input_scale", None)
184

185
    def process_weights_after_loading(self, layer: Module) -> None:
186
187
        layer.weight = torch.nn.Parameter(layer.weight.data,
                                          requires_grad=False)
188
        # If checkpoint not serialized fp8, quantize the weights.
189
190
191
        if not self.quant_config.is_checkpoint_fp8_serialized:
            qweight, weight_scale = ops.scaled_fp8_quant(layer.weight,
                                                         scale=None)
192

193
194
195
196
197
198
199
200
            # If using marlin (w8a16), kernel uses channelwise weights,
            # so extend the weight scales to be channelwise.
            if self.use_marlin:
                assert weight_scale.numel() == 1
                weight_scale = convert_to_channelwise(
                    weight_scale.expand(len(layer.logical_widths)),
                    layer.logical_widths)

201
            # Update the layer with the new values.
202
203
            layer.weight = Parameter(qweight.t(), requires_grad=False)
            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
204
            layer.input_scale = None
205

206
207
        # If checkpoint is fp8, handle that there are N scales for N
        # shards in a fused module
208
        else:
209
210
211
212
213
            layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data,
                                                    requires_grad=False)
            if self.quant_config.activation_scheme == "static":
                layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
                                                       requires_grad=False)
214
215
216
217
218
219
220
221
222
223
224
            # If using marlin (w8a16), kernel uses channelwise weights,
            # so extend the weight scales to be channelwise.
            if self.use_marlin:
                weight = layer.weight
                weight_scale = convert_to_channelwise(layer.weight_scale,
                                                      layer.logical_widths)

            # If using w8a8, torch._scaled_mm needs per tensor, so
            # requantize the logical shards as a single weight.
            else:
                # Dequant -> Quant with max scale so we can run per tensor.
225
226
227
228
229
230
231
232
233
234
235
236
237
238
                weight = layer.weight
                weight_scale = layer.weight_scale

                # If rocm, use float8_e4m3fnuz.
                if is_hip():
                    weight, weight_scale, input_scale = \
                        normalize_e4m3fn_to_e4m3fnuz(
                            weight=weight,
                            weight_scale=weight_scale,
                            input_scale=layer.input_scale)
                    if input_scale is not None:
                        layer.input_scale = Parameter(input_scale,
                                                      requires_grad=False)

239
                weight_scale, weight = requantize_with_max_scale(
240
241
                    weight=weight,
                    weight_scale=weight_scale,
242
243
                    logical_widths=layer.logical_widths,
                )
244

245
            # Update layer with new values.
246
            layer.weight = Parameter(weight.t(), requires_grad=False)
247
            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
248
            if self.quant_config.activation_scheme == "static":
249
250
                layer.input_scale = Parameter(layer.input_scale.max(),
                                              requires_grad=False)
251

252
253
254
255
        if self.use_marlin:
            prepare_fp8_layer_for_marlin(layer)
            # Activations not quantized for marlin.
            del layer.input_scale
256

257
258
259
260
    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
261

262
        if self.use_marlin:
263
264
265
266
            return apply_fp8_marlin_linear(
                input=x,
                weight=layer.weight,
                weight_scale=layer.weight_scale,
267
268
269
                workspace=layer.workspace,
                size_n=layer.output_size_per_partition,
                size_k=layer.input_size_per_partition,
270
                bias=bias)
271

272
273
274
275
276
277
        return apply_fp8_linear(
            input=x,
            weight=layer.weight,
            weight_scale=layer.weight_scale,
            input_scale=layer.input_scale,
            bias=bias,
278
279
            cutlass_fp8_supported=self.cutlass_fp8_supported,
            use_per_token_if_dynamic=False)
280
281


282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
class Fp8MoEMethod(FusedMoEMethodBase):
    """MoE method for FP8.
    Supports loading FP8 checkpoints with static weight scale and
    dynamic/static activation scale.

    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
    activation scaling. The weight scaling factor will be initialized after
    the model weights are loaded.

    Args:
        quant_config: The quantization config.
    """

    def __init__(self, quant_config: Fp8Config):
        self.quant_config = quant_config

    def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
                       intermediate_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):

        if self.quant_config.is_checkpoint_fp8_serialized:
            params_dtype = torch.float8_e4m3fn

        # WEIGHTS
        w13_weight = torch.nn.Parameter(torch.empty(num_experts,
                                                    2 * intermediate_size,
                                                    hidden_size,
                                                    dtype=params_dtype),
                                        requires_grad=False)
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)

        w2_weight = torch.nn.Parameter(torch.empty(num_experts,
                                                   hidden_size,
                                                   intermediate_size,
                                                   dtype=params_dtype),
                                       requires_grad=False)
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)

        # WEIGHT_SCALES
        # Allocate 2 scales for w1 and w3 respectively.
        # They will be combined to a single scale after weight loading.
325
326
327
328
329
        w13_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
                                                         2,
                                                         dtype=torch.float32),
                                              requires_grad=False)
        layer.register_parameter("w13_weight_scale", w13_weight_scale)
330

331
332
333
334
        w2_weight_scale = torch.nn.Parameter(torch.ones(num_experts,
                                                        dtype=torch.float32),
                                             requires_grad=False)
        layer.register_parameter("w2_weight_scale", w2_weight_scale)
335

336
337
338
339
        # If loading fp8 checkpoint, pass the weight loaders.
        # If loading an fp16 checkpoint, do not (we will quantize in
        #   process_weights_after_loading()
        if self.quant_config.is_checkpoint_fp8_serialized:
340
341
342
343
344
345
346
347
            set_weight_attrs(w13_weight_scale, {
                "is_fp8_scale": True,
                **extra_weight_attrs
            })
            set_weight_attrs(w2_weight_scale, {
                "is_fp8_scale": True,
                **extra_weight_attrs
            })
348
349
350
351
352
353
354
355

        # INPUT_SCALES
        if self.quant_config.activation_scheme == "static":
            if not self.quant_config.is_checkpoint_fp8_serialized:
                raise ValueError(
                    "Found static activation scheme for checkpoint that "
                    "was not serialized fp8.")

356
357
358
359
            w13_input_scale = torch.nn.Parameter(torch.ones(
                num_experts, dtype=torch.float32),
                                                 requires_grad=False)
            layer.register_parameter("w13_input_scale", w13_input_scale)
360
361
362
363
            set_weight_attrs(w13_input_scale, {
                "is_fp8_scale": True,
                **extra_weight_attrs
            })
364
365
366
367
368

            w2_input_scale = torch.nn.Parameter(torch.ones(
                num_experts, dtype=torch.float32),
                                                requires_grad=False)
            layer.register_parameter("w2_input_scale", w2_input_scale)
369
370
371
372
            set_weight_attrs(w2_input_scale, {
                "is_fp8_scale": True,
                **extra_weight_attrs
            })
373
        else:
374
375
            layer.w13_input_scale = None
            layer.w2_input_scale = None
376
377
378
379
380

    def process_weights_after_loading(self, layer: Module) -> None:

        # If checkpoint is fp16, quantize in place.
        if not self.quant_config.is_checkpoint_fp8_serialized:
381
382
383
            # If rocm, use float8_e4m3fnuz as dtype
            fp8_dtype = torch.float8_e4m3fnuz \
                        if is_hip() else torch.float8_e4m3fn
384
            w13_weight = torch.empty_like(layer.w13_weight.data,
385
386
                                          dtype=fp8_dtype)
            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
387
388
389

            # Re-initialize w13_scale because we directly quantize
            # merged w13 weights and generate a single scaling factor.
390
            layer.w13_weight_scale = torch.nn.Parameter(torch.ones(
391
392
393
                layer.num_experts,
                dtype=torch.float32,
                device=w13_weight.device),
394
                                                        requires_grad=False)
395
            for expert in range(layer.num_experts):
396
                w13_weight[expert, :, :], layer.w13_weight_scale[
397
398
                    expert] = ops.scaled_fp8_quant(
                        layer.w13_weight.data[expert, :, :])
399
                w2_weight[expert, :, :], layer.w2_weight_scale[
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
                    expert] = ops.scaled_fp8_quant(
                        layer.w2_weight.data[expert, :, :])
            layer.w13_weight = torch.nn.Parameter(w13_weight,
                                                  requires_grad=False)
            layer.w2_weight = torch.nn.Parameter(w2_weight,
                                                 requires_grad=False)
            return

        # If checkpoint is fp8, we need to handle that the
        # MoE kernels require single activation scale and single weight
        # scale for w13 per expert.
        else:
            # Fp8 moe kernels require a single activation scale.
            # We take the max of all the scales in case they differ.
            if self.quant_config.activation_scheme == "static":
415
416
                if (layer.w13_input_scale is None
                        or layer.w2_input_scale is None):
417
418
419
                    raise ValueError(
                        "QuantConfig has static quantization, but found "
                        "activation scales are None.")
420
421
                if (not all_close_1d(layer.w13_input_scale)
                        or not all_close_1d(layer.w2_input_scale)):
422
423
424
425
                    print_warning_once(
                        "Found input_scales that are not equal for "
                        "fp8 MoE layer. Using the maximum across experts "
                        "for each layer. ")
426
427
428
429
                layer.w13_input_scale = torch.nn.Parameter(
                    layer.w13_input_scale.max(), requires_grad=False)
                layer.w2_input_scale = torch.nn.Parameter(
                    layer.w2_input_scale.max(), requires_grad=False)
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
            # If rocm, normalize the weights and scales to e4m3fnuz
            if is_hip():
                # Normalize the weights and scales
                w13_weight, w13_weight_scale, w13_input_scale = \
                    normalize_e4m3fn_to_e4m3fnuz(
                        layer.w13_weight, layer.w13_weight_scale,
                        layer.w13_input_scale)
                w2_weight, w2_weight_scale, w2_input_scale = \
                    normalize_e4m3fn_to_e4m3fnuz(
                        layer.w2_weight, layer.w2_weight_scale,
                        layer.w2_input_scale)
                # Reset the parameter
                layer.w13_weight = torch.nn.Parameter(w13_weight,
                                                      requires_grad=False)
                layer.w13_weight_scale = torch.nn.Parameter(
                    w13_weight_scale, requires_grad=False)
                if w13_input_scale is not None:
                    layer.w13_input_scale = torch.nn.Parameter(
                        w13_input_scale, requires_grad=False)
                layer.w2_weight = torch.nn.Parameter(w2_weight,
                                                     requires_grad=False)
                layer.w2_weight_scale = torch.nn.Parameter(w2_weight_scale,
                                                           requires_grad=False)
                if w2_input_scale is not None:
                    layer.w2_input_scale = torch.nn.Parameter(
                        w2_input_scale, requires_grad=False)
456
457
458

            # Fp8 moe kernel needs single weight scale for w13 per expert.
            # We take the max then dequant and requant each expert.
459
            assert layer.w13_weight_scale is not None
460
            shard_size = layer.intermediate_size_per_partition
461
            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
462
463
464
465
466
467
            for expert_id in range(layer.num_experts):
                start = 0
                for shard_id in range(2):
                    dq_weight = per_tensor_dequantize(
                        layer.w13_weight[expert_id][start:start +
                                                    shard_size, :],
468
                        layer.w13_weight_scale[expert_id][shard_id])
469
                    layer.w13_weight[expert_id][
470
                        start:start + shard_size, :], _ = ops.scaled_fp8_quant(
471
472
473
                            dq_weight, max_w13_scales[expert_id])
                    start += shard_size

474
475
            layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
                                                        requires_grad=False)
476
477
478
479
480
481
482
            return

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              router_logits: torch.Tensor,
              top_k: int,
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
              renormalize: bool,
              use_grouped_topk: bool,
              topk_group: Optional[int] = None,
              num_expert_group: Optional[int] = None) -> torch.Tensor:

        from vllm.model_executor.layers.fused_moe import fused_experts

        topk_weights, topk_ids = FusedMoE.select_experts(
            hidden_states=x,
            router_logits=router_logits,
            use_grouped_topk=use_grouped_topk,
            top_k=top_k,
            renormalize=renormalize,
            topk_group=topk_group,
            num_expert_group=num_expert_group)

        return fused_experts(x,
                             layer.w13_weight,
                             layer.w2_weight,
                             topk_weights=topk_weights,
                             topk_ids=topk_ids,
                             inplace=True,
505
                             use_fp8_w8a8=True,
506
507
508
509
                             w1_scale=layer.w13_weight_scale,
                             w2_scale=layer.w2_weight_scale,
                             a1_scale=layer.w13_input_scale,
                             a2_scale=layer.w2_input_scale)
510
511


512
513
514
class Fp8KVCacheMethod(BaseKVCacheMethod):
    """
    Supports loading kv-cache scaling factors from FP8 checkpoints.
515
516
517
    """

    def __init__(self, quant_config: Fp8Config):
518
        super().__init__(quant_config)