MixtralFlashAttention2: put "plus 1" inside parentheses when calculating...

MixtralFlashAttention2: put "plus 1" inside parentheses when calculating rotary_seq_len, allowing None position_ids input. (#31500) * Mixtral: remove unnecessary plus 1 when calculating rotary_seq_len, allowing position_ids=None (no auto position_ids generation could be unsafe) * fix typo [:-1] to [:, -1] * to meet formatting requirement * to meet formatting requirement * remove white space * MixtralFlashAttention2: put "+ 1" inside parentheses when calculating rotary_seq_len, allowing None position_ids input. Fix format/style issue. * propagate to startcoder2, phi3, mixtral and qwen2 * update qwen2_moe

MixtralFlashAttention2: put "plus 1" inside parentheses when calculating...
MixtralFlashAttention2: put "plus 1" inside parentheses when calculating rotary_seq_len, allowing None position_ids input. (#31500) * Mixtral: remove unnecessary plus 1 when calculating rotary_seq_len, allowing position_ids=None (no auto position_ids generation could be unsafe) * fix typo [:-1] to [:, -1] * to meet formatting requirement * to meet formatting requirement * remove white space * MixtralFlashAttention2: put "+ 1" inside parentheses when calculating rotary_seq_len, allowing None position_ids input. Fix format/style issue. * propagate to startcoder2, phi3, mixtral and qwen2 * update qwen2_moe
621fb3c0 · Xueshen Liu · GitHub · 7c31d05b · 621fb3c0 · 621fb3c0
Unverified Commit 621fb3c0 authored Aug 03, 2024 by Xueshen Liu Committed by GitHub Aug 03, 2024
5 changed files
--- a/src/transformers/models/mixtral/modeling_mixtral.py
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -469,7 +469,10 @@ class MixtralFlashAttention2(MixtralAttention):
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        rotary_seq_len = (
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

--- a/src/transformers/models/phi3/modeling_phi3.py
+++ b/src/transformers/models/phi3/modeling_phi3.py
@@ -536,8 +536,11 @@ class Phi3FlashAttention2(Phi3Attention):
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        rotary_seq_len = (
-        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

--- a/src/transformers/models/qwen2/modeling_qwen2.py
+++ b/src/transformers/models/qwen2/modeling_qwen2.py
@@ -402,7 +402,10 @@ class Qwen2FlashAttention2(Qwen2Attention):
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        rotary_seq_len = (
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

--- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py
@@ -481,7 +481,10 @@ class Qwen2MoeFlashAttention2(Qwen2MoeAttention):
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        rotary_seq_len = (
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

--- a/src/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/src/transformers/models/starcoder2/modeling_starcoder2.py
@@ -383,7 +383,10 @@ class Starcoder2FlashAttention2(Starcoder2Attention):
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # Because the input can be padded, the absolute sequence length depends on the max position id.
-        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        rotary_seq_len = (
+            max(kv_seq_len, position_ids[:, -1].max().item() + 1) if position_ids is not None else kv_seq_len
+        )
        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)