# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING, Union

import torch
from torch import nn
from torch.nn import functional as F
from transformers.integrations import is_deepspeed_zero3_enabled

from ...extras.misc import check_version
from ...extras.packages import is_transformers_version_greater_than


if TYPE_CHECKING:
    from torch import nn
    from transformers import PretrainedConfig, PreTrainedModel

    from ...hparams import ModelArguments

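# the Qwen3-Omni-MoE modeling module only exists in transformers>=4.57.0, hence the version guard;
# it provides the expert MLP used by the custom sparse MoE block at the bottom of this file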
if is_transformers_version_greater_than("4.57.0"):
    from transformers.models.qwen3_omni_moe import modeling_qwen3_omni_moe


def _set_z3_leaf_modules(model: "PreTrainedModel", leaf_modules: list[Union["nn.Module", str]]) -> None:
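    r"""Register the given module classes (or class names) as leaf modules in deepspeed zero3."""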
    check_version("deepspeed>=0.13.0")
    from deepspeed.utils import set_z3_leaf_modules  # type: ignore

    set_z3_leaf_modules(model, leaf_modules)


def add_z3_leaf_module(model: "PreTrainedModel") -> None:
    r"""Set module as a leaf module to skip partitioning in deepspeed zero3."""
    if not is_deepspeed_zero3_enabled():
        return

    model_type = getattr(model.config, "model_type", None)
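    # multimodal models keep the language model's config under `text_config`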
    text_config = getattr(model.config, "text_config", None)
    text_model_type = getattr(text_config, "model_type", None)

    if model_type == "dbrx":
        from transformers.models.dbrx.modeling_dbrx import DbrxFFN

        _set_z3_leaf_modules(model, [DbrxFFN])

    if model_type == "deepseek_v2":
        # deepseek v2 uses custom code
        _set_z3_leaf_modules(model, ["DeepseekV2MoE"])

    if model_type == "deepseek_v3" or model_type == "kimi_vl":
        # deepseek v3 and kimi vl use custom code
        _set_z3_leaf_modules(model, ["DeepseekV3MoE"])

    if model_type == "ernie4_5_moe":
        from transformers.models.ernie4_5_moe.modeling_ernie4_5_moe import Ernie4_5_MoeSparseMoeBlock

        _set_z3_leaf_modules(model, [Ernie4_5_MoeSparseMoeBlock])

    if model_type == "granitemoe":
        from transformers.models.granitemoe.modeling_granitemoe import GraniteMoeMoE

        _set_z3_leaf_modules(model, [GraniteMoeMoE])

    if model_type == "glm4_moe":
        from transformers.models.glm4_moe.modeling_glm4_moe import Glm4MoeMoE

        _set_z3_leaf_modules(model, [Glm4MoeMoE])

    if model_type == "glm4v_moe":
        from transformers.models.glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextMoE

        _set_z3_leaf_modules(model, [Glm4vMoeTextMoE])

    if model_type == "gpt_oss":
        from transformers.models.gpt_oss.modeling_gpt_oss import GptOssMLP

        _set_z3_leaf_modules(model, [GptOssMLP])

    if model_type == "jamba":
        from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock

        _set_z3_leaf_modules(model, [JambaSparseMoeBlock])

    if model_type == "jetmoe":
        from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE

        _set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE])

    if model_type == "llama4":
        from transformers.models.llama4.modeling_llama4 import Llama4TextMoe

        _set_z3_leaf_modules(model, [Llama4TextMoe])

    if model_type == "mixtral":
        from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock

        _set_z3_leaf_modules(model, [MixtralSparseMoeBlock])

    if model_type == "olmoe":
        from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock

        _set_z3_leaf_modules(model, [OlmoeSparseMoeBlock])

    if model_type == "phimoe":
        from transformers.models.phimoe.modeling_phimoe import PhimoeSparseMoeBlock

        _set_z3_leaf_modules(model, [PhimoeSparseMoeBlock])

    if model_type == "qwen2_moe":
        from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock

        _set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])

    if model_type == "qwen3_moe" or text_model_type == "qwen3_moe":  # internvl 3.5
        from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock

        _set_z3_leaf_modules(model, [Qwen3MoeSparseMoeBlock])

    if model_type == "qwen3_vl_moe":
        from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextSparseMoeBlock

        _set_z3_leaf_modules(model, [Qwen3VLMoeTextSparseMoeBlock])

    if model_type in ("qwen3_omni_moe", "qwen3_omni_moe_thinker"):
        from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import Qwen3OmniMoeThinkerTextSparseMoeBlock

        _set_z3_leaf_modules(model, [Qwen3OmniMoeThinkerTextSparseMoeBlock])


def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
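    r"""Enable router logits output and set the MoE auxiliary loss coefficient for trainable MoE models."""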
    if not is_trainable or not model_args.moe_aux_loss_coef:
        return

    model_type = getattr(config, "model_type", None)
    text_config = getattr(config, "text_config", None)  # for multimodal model

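    # the auxiliary load-balancing loss is computed from the router logits, so the model must output them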
    if model_type in [
        "dbrx",
        "ernie4_5_moe",
        "granitemoe",
        "jamba",
        "jetmoe",
        "llama4",
        "mixtral",
        "olmoe",
        "phimoe",
        "qwen2_moe",
        "qwen3_moe",
    ]:
        setattr(config, "output_router_logits", True)

    if text_config and getattr(text_config, "model_type", None) in [
        "glm4v_moe_text",  # glmv4_5
        "qwen3_moe",  # internvl_3_5
    ]:
        setattr(text_config, "output_router_logits", True)

    if model_type in [
        "ernie4_5_moe",
        "granitemoe",
        "jamba",
        "llama4",
        "mixtral",
        "olmoe",
        "phimoe",
        "qwen2_moe",
        "qwen3_moe",
    ]:
        setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)

    elif text_config and getattr(text_config, "model_type", None) in ["qwen3_moe"]:
        setattr(text_config, "router_aux_loss_coef", model_args.moe_aux_loss_coef)

    elif model_type == "deepseek":
        setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef)

    elif model_type == "jetmoe":
        setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef)


class Qwen3OmniMoeThinkerTextSparseMoeBlock(nn.Module):
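    r"""Dense replacement for the Qwen3-Omni-MoE thinker sparse MoE block.

    Every expert processes every token and unselected experts contribute with a routing weight
    of zero, so all expert parameters participate in each forward pass while the output matches
    top-k routing.
    """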
    def __init__(self, config):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob

        # gating
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [
                modeling_qwen3_omni_moe.Qwen3OmniMoeThinkerTextMLP(
                    config, intermediate_size=config.moe_intermediate_size
                )
                for _ in range(self.num_experts)
            ]
        )

    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        # Calculate the routing weights for all experts
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)

        # Keep only the top_k expert weights and zero out the rest (instead of restricting computation to the selected experts)
        top_k_weights, top_k_indices = torch.topk(routing_weights, self.top_k, dim=-1)
        # Initialize an all-zero weight matrix with one column per expert
        full_routing_weights = torch.zeros_like(routing_weights)
        # Scatter the top_k weights back; unselected experts keep a weight of 0
        full_routing_weights.scatter_(1, top_k_indices, top_k_weights)

        # Normalize the top_k weights (consistent with the original implementation)
        if self.norm_topk_prob:
            # Sum of the top_k weights in each row (used for normalization)
            top_k_sum = full_routing_weights.sum(dim=-1, keepdim=True)
            # Avoid division by zero
            top_k_sum = torch.clamp(top_k_sum, min=1e-9)
            full_routing_weights /= top_k_sum

        # Convert back to the input data type
        full_routing_weights = full_routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        # Run every expert (not just the selected ones)
        for expert_idx in range(self.num_experts):
            expert_layer = self.experts[expert_idx]
            # Weight of the current expert for each token (0 for tokens that did not select it)
            expert_weights = full_routing_weights[:, expert_idx, None]  # shape: (batch*seq, 1)
            # Every token passes through the current expert; unselected tokens contribute with weight 0
            current_hidden_states = expert_layer(hidden_states) * expert_weights
            # Accumulate the expert outputs (experts with weight 0 do not affect the result)
            final_hidden_states += current_hidden_states

        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits