OpenDAS / ColossalAI · Commits

Unverified commit aae49663, authored Nov 22, 2023 by flybird11111, committed by GitHub on Nov 22, 2023.

[shardformer]fix flash attention, when mask is casual, just don't unpad it (#5084)

* fix flash attn
* fix fix

parent 75af66cd
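
The change applies one pattern across the shardformer modeling files listed below: the boolean flash-attention mask is built first, and the attention-mask type is only switched to the padded-causal (unpad) path when that mask actually contains padding; a purely causal batch keeps AttnMaskType.causal and skips unpadding entirely. The sketch below is a minimal, self-contained illustration of that pattern, not the ColossalAI implementation: the local AttnMaskType enum and the helper select_mask_type are stand-ins invented for this sketch, while the `not torch.all(...)` guard mirrors the lines added in the diffs.

import torch
from enum import Enum, auto
from typing import Optional

# Local stand-in for ColossalAI's AttnMaskType enum; it exists only to keep
# this sketch self-contained.
class AttnMaskType(Enum):
    causal = auto()
    padding = auto()
    paddedcausal = auto()

def select_mask_type(flash_attention_mask: Optional[torch.Tensor]) -> AttnMaskType:
    # flash_attention_mask is a per-token keep-mask (True = real token,
    # False = padding), as produced by inverting the HF attention mask.
    if flash_attention_mask is None:
        # No mask supplied: treat the batch as purely causal.
        return AttnMaskType.causal
    if not torch.all(flash_attention_mask):
        # At least one padded position: take the padded-causal path, which
        # unpads the batch before calling flash attention.
        return AttnMaskType.paddedcausal
    # A mask was supplied but nothing is padded: stay on the plain causal
    # path and skip the unpad/repad round trip.
    return AttnMaskType.causal

# A fully unpadded batch stays on the causal fast path.
assert select_mask_type(torch.ones(2, 8, dtype=torch.bool)) is AttnMaskType.causal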
Showing 6 changed files with 16 additions and 8 deletions:

  colossalai/shardformer/modeling/chatglm2.py  +2 -1
  colossalai/shardformer/modeling/gpt2.py      +5 -4
  colossalai/shardformer/modeling/llama.py     +2 -1
  colossalai/shardformer/modeling/opt.py       +2 -1
  colossalai/shardformer/modeling/whisper.py   +4 -1
  examples/language/llama2/pretrain.py         +1 -0
colossalai/shardformer/modeling/chatglm2.py
@@ -51,6 +51,7 @@ def get_flash_core_attention_forward():
             attn_mask_type = AttnMaskType.causal
         else:
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
-            attn_mask_type = AttnMaskType.paddedcausal
+            if not torch.all(flash_attention_mask):
+                attn_mask_type = AttnMaskType.paddedcausal
 
         attention = ColoAttention(
colossalai/shardformer/modeling/gpt2.py
@@ -771,11 +771,12 @@ def get_gpt2_flash_attention_forward():
         attn_mask_type = AttnMaskType.causal
         flash_attention_mask = None
         if attention_mask != None:
-            if attn_mask_type == AttnMaskType.causal:
-                attn_mask_type == AttnMaskType.paddedcausal
-            else:
-                attn_mask_type = AttnMaskType.padding
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
+            if not torch.all(flash_attention_mask):
+                if attn_mask_type == AttnMaskType.causal:
+                    attn_mask_type == AttnMaskType.paddedcausal
+                else:
+                    attn_mask_type = AttnMaskType.padding
 
         scale = value.size(-1) ** -0.5
         if self.scale_attn_by_inverse_layer_idx:
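
For readers tracing the guard: after the ~(...) inversion above, flash_attention_mask is a boolean keep-mask (True for real tokens, False for padded ones), so torch.all(...) holds exactly when the batch contains no padding. A small illustration with made-up tensors (not taken from the model code):

import torch

# Hypothetical keep-masks of shape (batch, seq_len); True marks a real token.
no_padding = torch.tensor([[True, True, True, True]])
with_padding = torch.tensor([[True, True, True, False]])

# The added guard only switches to the padded-causal (unpad) path when some
# position is actually padded.
print(not torch.all(no_padding))    # False -> keep AttnMaskType.causal
print(not torch.all(with_padding))  # True  -> AttnMaskType.paddedcausal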
colossalai/shardformer/modeling/llama.py
@@ -465,6 +465,7 @@ def get_llama_flash_attention_forward():
                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
-            attn_mask_type = AttnMaskType.paddedcausal
+            if not torch.all(flash_attention_mask):
+                attn_mask_type = AttnMaskType.paddedcausal
 
         attention = ColoAttention(embed_dim=self.hidden_size, num_heads=self.num_heads)
colossalai/shardformer/modeling/opt.py
@@ -581,6 +581,7 @@ def get_opt_flash_attention_forward():
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool)).contiguous()
-            attn_mask_type = AttnMaskType.paddedcausal
+            if not torch.all(flash_attention_mask):
+                attn_mask_type = AttnMaskType.paddedcausal
 
         attention = ColoAttention(
colossalai/shardformer/modeling/whisper.py
@@ -106,7 +106,10 @@ def get_whisper_flash_attention_forward():
                     f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                 )
             flash_attention_mask = ~(attention_mask[:, :, -1].squeeze(1).to(torch.bool).contiguous())
-            attn_type = AttnMaskType.paddedcausal
+            if not torch.all(flash_attention_mask):
+                attn_type = AttnMaskType.paddedcausal
+            else:
+                attn_type = AttnMaskType.causal
 
         attention = ColoAttention(
             embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.dropout, scale=self.scaling
examples/language/llama2/pretrain.py
@@ -76,6 +76,7 @@ def tokenize_batch_for_pretrain(batch, tokenizer: Optional[LlamaTokenizer] = Non
 
 def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
     dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+    tensor = tensor.data
     tensor.div_(dist.get_world_size())
     return tensor
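
A note on the pretrain.py hunk: all_reduce_mean turns a per-rank value into a global mean by summing across ranks and dividing by the world size; the added `tensor = tensor.data` presumably rebinds to the underlying data tensor so that the in-place `div_` stays outside autograd tracking. A commented restatement under that assumption (running it requires an initialized torch.distributed process group):

import torch
import torch.distributed as dist

def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
    # Sum the tensor in place across all ranks of the default process group.
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    # Rebind to .data so the in-place division below is not recorded by
    # autograd (the apparent motivation for the line added in this commit).
    tensor = tensor.data
    # Divide by the number of ranks to turn the sum into a mean.
    tensor.div_(dist.get_world_size())
    return tensor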