Unverified commit 54d0b1c2 authored by Younes Belkada, committed by GitHub

[`Mixtral`] Change mistral op order (#27955)

up
parent 4850aaba
@@ -663,10 +663,10 @@ class MixtralBLockSparseTop2MLP(nn.Module):
         self.act_fn = ACT2FN[config.hidden_act]

-    def forward(self, hidden_states, routing_weights):
+    def forward(self, hidden_states):
         current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
         current_hidden_states = self.w2(current_hidden_states)
-        return routing_weights * current_hidden_states
+        return current_hidden_states

 MISTRAL_ATTENTION_CLASSES = {
@@ -736,7 +736,7 @@ class MixtralSparseMoeBlock(nn.Module):
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
             current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
-            current_hidden_states = expert_layer(current_state, routing_weights[top_x_list, idx_list, None])
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]

             # However `index_add_` only support torch tensors for indexing so we'll use
             # the `top_x` tensor here.
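For context (not part of the commit itself): the change moves the multiplication by `routing_weights` out of the expert MLP's `forward` and into the caller in `MixtralSparseMoeBlock`, so each expert computes only its SwiGLU projection. The sketch below illustrates the resulting call pattern; `Top2MLP`, the dimensions, and the random tensors are simplified stand-ins and not the actual Hugging Face classes.

```python
import torch
import torch.nn as nn


class Top2MLP(nn.Module):
    """Simplified stand-in for MixtralBLockSparseTop2MLP after this change:
    the expert no longer receives routing weights at all."""

    def __init__(self, hidden_dim: int, ffn_dim: int):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, ffn_dim, bias=False)
        self.w2 = nn.Linear(ffn_dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(hidden_dim, ffn_dim, bias=False)
        self.act_fn = nn.SiLU()

    def forward(self, hidden_states):
        # SwiGLU-style projection; no routing weights involved here anymore.
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
        return self.w2(current_hidden_states)


# The caller now applies the routing weights itself, mirroring
# `expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]`.
hidden_dim, ffn_dim = 16, 32
expert = Top2MLP(hidden_dim, ffn_dim)
current_state = torch.randn(4, hidden_dim)   # tokens routed to this expert
routing_weights = torch.rand(4, 1)           # their top-k gate values
current_hidden_states = expert(current_state) * routing_weights
print(current_hidden_states.shape)           # torch.Size([4, 16])
```

Since the routing weights enter as an elementwise scale, applying them before or after the expert's `w2` projection gives the same result up to floating-point rounding; the commit only moves where that multiplication happens.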