"vllm/vscode:/vscode.git/clone" did not exist on "b854321ffe50fd04c6b1ac58eecdab4caf5b4295"
noop_elimination.py 5.12 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
from collections.abc import Iterable
5
6
7

import torch.fx
from torch import SymInt
8
from torch.fx.experimental.symbolic_shapes import statically_known_true
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

from vllm.logger import init_logger

from .fx_utils import is_func
from .vllm_inductor_pass import VllmInductorPass

logger = init_logger(__name__)


class NoOpEliminationPass(VllmInductorPass):
    """
    This is an inductor pass that removes redundant reshape/slice operations.
    It is required for RMSNorm-quant fusion to work properly.
    That's because apply_fp8_linear adds a reshape, which is redundant
    in the 2D-case. Additionally, torch internal no-op elimination pass does
    not handle certain slice variants.

26
27
28
29
30
31
    Cases handled:
      1. A chain of reshapes is equivalent to the last reshape called on the
      base tensor (input of the first reshape).
      2. A reshape that produces the shape of the input is redundant
      3. A slice that produces the shape of the input is redundant

32
    Example graph 1:
33
34
35
36
37
38
39
40
41
42
    mul_1: "f16[s0, 4096]" = ...
    view_1: "f16[s0, 128, 32]" = torch.reshape(mul_1, [-1, 128, 32])
    view_2: "f16[s0, 4096]" = torch.reshape(view_2, [-1, 4096])
    view_3: "f16[s0, 128, 32]" = torch.reshape(view_3, [-1, 128, 32])

    Can be replaced with:
    mul_1: "f16[s0, 4096]" = ...
    view_3: "f16[s0, 128, 32]" = ...

    Example graph 2:
43
44
45
46
47
48
49
50
51
52
    getitem_1: "f16[s0, 4096]" = ...
    view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096])
    at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...)
    out: "f8e4m3fn[s0, 4096]" = at[1]

    Can be replaced with:
    getitem_1: "f16[s0, 4096]" = ...
    at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...)
    out: "f8e4m3fn[s0, 4096]" = at[1]

53
    Example graph 3:
54
55
56
57
58
59
60
61
62
63
64
65
66
    arg0: "s0" = SymInt(s0)
    scaled_mm: "f16[s0, 4096]" = ...
    slice_1: "f16[s0, 4096]" = torch.slice(scaled_mm, -1, 0, arg0)
    at = auto_functionalized(fused_add_rms_norm, input = slice_1, ...)
    out: "f16[s0, 4096]" = torch.slice_scatter(scaled_mm, at[1], 0, 0, arg0)

    Can be replaced with:
    arg0: "s0" = SymInt(s0)
    scaled_mm: "f16[s0, 4096]" = ...
    at = auto_functionalized(fused_add_rms_norm, input = scaled_mm, ...)
    out: "f16[s0, 4096]" = at[1]
    """

67
    @VllmInductorPass.time_and_log
68
69
70
71
72
    def __call__(self, graph: torch.fx.Graph):
        count = 0
        # Remove no-op reshapes/views:
        for node in graph.nodes:
            if is_func(node, torch.ops.aten.reshape.default):
73
74
75
76
77
78
79
80
81
82
83
                # Case 1: rewrite reshape chains to reshapes on the base tensor
                input = node.args[0]
                # If the input is a reshape, rebind to that node
                if is_func(input, torch.ops.aten.reshape.default):
                    # The new input is guaranteed not to be a reshape,
                    # because we process nodes in order
                    node.update_arg(0, input.args[0])
                    if len(input.users) == 0:
                        graph.erase_node(input)
                        count += 1

84
85
86
87
88
            # remove reshape/slice if it produces the original shape
            if is_func(node, torch.ops.aten.reshape.default) or is_func(
                node, torch.ops.aten.slice.Tensor
            ):
                input = node.args[0]
89
                input_shape = input.meta["val"].shape
90
                output_shape = node.meta["val"].shape
91
                if self.all_dims_equivalent(input_shape, output_shape):
92
93
94
95
96
97
98
99
                    node.replace_all_uses_with(input)
                    graph.erase_node(node)
                    count += 1
            elif is_func(node, torch.ops.aten.slice_scatter.default):
                base, view, dim_index, start, end = node.args[:5]
                base_shape = base.meta["val"].shape
                view_shape = view.meta["val"].shape

100
                if self.all_dims_equivalent(base_shape, view_shape):
101
102
103
104
105
106
                    node.replace_all_uses_with(view)
                    graph.erase_node(node)
                    count += 1

        logger.debug("Removed %s no-op reshapes and slices", count)

107
    # ---------------------- Shape comparison helpers ----------------------
108
    def dims_equivalent(self, dim: int | SymInt, i_dim: int | SymInt) -> bool:
109
110
111
112
113
114
        """
        This function checks if two dimensions are equivalent.
        :param dim: The dimension arg to reshape/slice
        :param i_dim: The corresponding dimension in the input tensor
        :return: Are the dimensions equivalent?

115
        There are two cases in which the dimensions are equivalent:
116
        1. The dimensions are equal (both integers)
117
        2. The dimensions both correspond to the same SymInt
118
        """
119
        # Case 1
120
        return statically_known_true(dim == i_dim)
121
122

    def all_dims_equivalent(
123
        self, dims: Iterable[int | SymInt], i_dims: Iterable[int | SymInt]
124
    ) -> bool:
125
126
127
128
129
130
        dims_ = list(dims)
        i_dims_ = list(i_dims)
        if len(dims_) != len(i_dims_):
            # Different ranks can't be equivalent
            return False
        return all(self.dims_equivalent(s, i_s) for s, i_s in zip(dims, i_dims))