# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright 2025 Xiaomi Corporation.
# Copyright 2024 The Qwen team.
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only MiMo model compatible with HuggingFace weights."""
28

29
from collections.abc import Iterable
30
from itertools import islice
31
32
33
34
35
36
37
38
39
40
41

import torch
import torch.nn as nn

from vllm.compilation.decorators import support_torch_compile
from vllm.config import VllmConfig
from vllm.distributed import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import (
42
43
44
    default_weight_loader,
    maybe_remap_kv_scale_name,
)
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
from vllm.sequence import IntermediateTensors

from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix

logger = init_logger(__name__)


@support_torch_compile(
    dynamic_arg_dims={
        "input_ids": 0,
        "positions": -1,
        "intermediate_tensors": 0,
        "inputs_embeds": 0,
    }
)
class MiMoModel(Qwen2Model):
    """Qwen2-derived decoder stack with MiMo-specific forward and loading.

    ``forward`` returns ``hidden_states + residual`` on the last
    pipeline-parallel rank and does not apply ``self.norm`` itself.
    ``load_weights`` ignores checkpoint entries whose name contains
    ``mtp_layers``.
    """

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor | IntermediateTensors:
        """Run the decoder layers owned by this pipeline-parallel rank.

        Args:
            input_ids: Token ids; embedded with ``get_input_embeddings`` on
                the first rank when ``inputs_embeds`` is not given.
            positions: Position tensor forwarded unchanged to every layer.
            intermediate_tensors: Carrier for ``"hidden_states"`` and
                ``"residual"`` from the previous rank; required on every
                rank except the first.
            inputs_embeds: Optional precomputed embeddings that take
                precedence over ``input_ids`` on the first rank.

        Returns:
            On a non-last rank, an ``IntermediateTensors`` holding
            ``"hidden_states"`` and ``"residual"`` for the next rank.
            On the last rank, ``hidden_states + residual``.
        """
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for layer in islice(self.layers, self.start_layer, self.end_layer):
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
            )
        if not get_pp_group().is_last_rank:
            return IntermediateTensors(
                {"hidden_states": hidden_states, "residual": residual}
            )
        # The residual stream is folded in here, but no normalisation is
        # applied in this method.
        hidden_states = hidden_states + residual
        return hidden_states

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Copy checkpoint tensors into this module's parameters.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of parameter names that were recorded as loaded.
        """
        # (fused parameter, checkpoint shard, shard id): the separate
        # q/k/v and gate/up checkpoint tensors are loaded as shards of
        # the fused ``qkv_proj`` / ``gate_up_proj`` parameters.
        stacked_params_mapping = [
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters(remove_duplicate=False))
        loaded_params: set[str] = set()
        for name, loaded_weight in weights:
            if "mtp_layers" in name:
                continue
            if "rotary_emb.inv_freq" in name:
                continue
            if self.quant_config is not None and (
                scale_name := self.quant_config.get_cache_scale(name)
            ):
                # Loading kv cache quantization scales
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                loaded_weight = (
                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
                )
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # Keep the rewritten name in its own variable: overwriting
                # ``name`` before the skip checks would leak a half-mapped
                # name into later iterations ("up_proj" is a substring of
                # "gate_up_proj") and into the ``else`` fallback below.
                fused_name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if fused_name.endswith(".bias") and fused_name not in params_dict:
                    continue
                if is_pp_missing_parameter(fused_name, self):
                    continue
                param = params_dict[fused_name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                # Record the fused parameter name, not the checkpoint name.
                name = fused_name
                break
            else:
                # No stacked mapping consumed this tensor; load it directly.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params


class MiMoForCausalLM(Qwen2ForCausalLM, nn.Module):
    """MiMo causal LM: a ``MiMoModel`` backbone, an LM head and a logits processor."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Assemble the backbone, LM head and logits processor.

        Args:
            vllm_config: Source of the HF model config, the quantization
                config and the LoRA config.
            prefix: Name prefix under which the submodules are created.
        """
        # Only ``nn.Module.__init__`` is invoked; every submodule is
        # constructed explicitly below.
        nn.Module.__init__(self)

        hf_config = vllm_config.model_config.hf_config
        quantization = vllm_config.quant_config

        self.config = hf_config
        self.lora_config = vllm_config.lora_config
        self.quant_config = quantization

        self.model = MiMoModel(
            vllm_config=vllm_config,
            prefix=maybe_prefix(prefix, "model"),
        )

        # Only the last pipeline-parallel rank holds a real LM head.
        if not get_pp_group().is_last_rank:
            self.lm_head = PPMissingLayer()
        elif hf_config.tie_word_embeddings:
            # Tied embeddings: reuse the input embedding module as the head.
            self.lm_head = self.model.embed_tokens
        else:
            self.lm_head = ParallelLMHead(
                hf_config.vocab_size,
                hf_config.hidden_size,
                quant_config=quantization,
                prefix=maybe_prefix(prefix, "lm_head"),
            )

        self.logits_processor = LogitsProcessor(hf_config.vocab_size)

        # Expose the backbone's factory for empty intermediate tensors.
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors
        )

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        """Normalise ``hidden_states`` with ``self.model.norm`` and project to logits.

        Args:
            hidden_states: Hidden states to be normalised and scored.

        Returns:
            Whatever ``self.logits_processor`` yields for ``self.lm_head``
            and the normalised hidden states.
        """
        normed_states = self.model.norm(hidden_states)
        return self.logits_processor(self.lm_head, normed_states)