# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2025 Xiaomi Corporation.
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only MiMo model compatible with HuggingFace weights."""

from collections.abc import Iterable
from itertools import islice

import torch
import torch.nn as nn

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader,
    maybe_remap_kv_scale_name,
)
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
from vllm.sequence import IntermediateTensors

from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix

# Module-level logger named after this module.
logger = init_logger(__name__)


@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    }
)
class MiMoModel(Qwen2Model):
    """Qwen2-based decoder stack used by MiMo.

    ``forward`` returns ``hidden_states + residual`` on the last
    pipeline-parallel rank and does not apply a final norm itself;
    ``MiMoForCausalLM.compute_logits`` applies ``self.model.norm`` before
    projecting to logits.
    """

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """Run this pipeline rank's slice of decoder layers.

        Args:
            input_ids: Token ids; embedded with ``embed_input_ids`` on the
                first pipeline rank when ``inputs_embeds`` is not given.
            positions: Passed unchanged to every decoder layer.
            intermediate_tensors: Required on non-first pipeline ranks;
                supplies the ``"hidden_states"`` and ``"residual"`` entries.
            inputs_embeds: Optional pre-computed embeddings used instead of
                ``input_ids`` on the first pipeline rank.

        Returns:
            On a non-last pipeline rank, an ``IntermediateTensors`` holding
            ``"hidden_states"`` and ``"residual"``. On the last rank, the
            tensor ``hidden_states + residual``.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.embed_input_ids(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        # Only the layers in [start_layer, end_layer) are run on this rank.
        for layer in islice(self.layers, self.start_layer, self.end_layer):
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )
        if not get_pp_group().is_last_rank:
            return IntermediateTensors(
                {"hidden_states": hidden_states, "residual": residual}
            )
        # No norm here: the residual is folded in and the result is returned
        # as-is.
        hidden_states = hidden_states + residual
        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Copy checkpoint tensors into this module's parameters.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of parameter names (after any renaming performed here)
            that were loaded.
        """
        stacked_params_mapping = [
            # (fused param name, checkpoint shard name, shard id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            # Entries under "mtp_layers" are not loaded by this method.
            if "mtp_layers" in name:
                continue
            if "rotary_emb.inv_freq" in name:
                continue
            if self.quant_config is not None and (
                scale_name := self.quant_config.get_cache_scale(name)
            ):
                # Loading kv cache quantization scales
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                # A non-scalar scale tensor is reduced to its first element.
                loaded_weight = (
                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
                )
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)
                # NOTE: a `continue` below only advances to the next mapping
                # entry, and `name` stays rewritten, so the `else` branch
                # sees the fused name and performs the actual skip.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Reached when no mapping entry ended in `break`; `continue`
                # here moves on to the next checkpoint weight.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params


class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
    """Causal-LM wrapper that pairs a ``MiMoModel`` backbone with an LM head."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the backbone, LM head and logits processor.

        Args:
            vllm_config: Supplies ``model_config.hf_config`` and
                ``quant_config``.
            prefix: Name prefix passed through ``maybe_prefix`` for the
                ``"model"`` and ``"lm_head"`` submodules.
        """
        # Only nn.Module is initialised; every attribute is assembled by hand
        # below instead of going through Qwen2ForCausalLM.__init__.
        nn.Module.__init__(self)
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config

        self.config = config

        self.quant_config = quant_config

        self.model = MiMoModel(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
        )

        # The LM head only exists on the last pipeline rank; other ranks get
        # a PPMissingLayer placeholder.
        if get_pp_group().is_last_rank:
            if config.tie_word_embeddings:
                # Tied embeddings: reuse the input embedding module.
                self.lm_head = self.model.embed_tokens
            else:
                self.lm_head = ParallelLMHead(
                    config.vocab_size,
                    config.hidden_size,
                    quant_config=quant_config,
                    prefix=maybe_prefix(prefix, "lm_head"),
                )
        else:
            self.lm_head = PPMissingLayer()

        self.logits_processor = LogitsProcessor(config.vocab_size)

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Normalize ``hidden_states`` and project them to logits.

        ``MiMoModel.forward`` returns ``hidden_states + residual`` without a
        final norm, so ``self.model.norm`` is applied here before the LM head.

        Args:
            hidden_states: Output of the backbone's ``forward``.

        Returns:
            Whatever ``self.logits_processor`` returns for the LM head and
            the normalized hidden states.
        """
        hidden_states = self.model.norm(hidden_states)
        logits = self.logits_processor(self.lm_head, hidden_states)
        return logits