Commit 7096e475 authored by Patrick von Platen, committed by GitHub

[Longformer] fix longformer global attention output (#5659)

* fix longformer global attention output

* fix multi-GPU problem

* replace -10000 with 0

* better comment

* make attention output shape equal for local and global attention

* Update src/transformers/modeling_longformer.py
parent ce374ba8
src/transformers/modeling_longformer.py
@@ -442,12 +442,14 @@ class LongformerSelfAttention(nn.Module):
         if output_attentions:
             if is_global_attn:
                 # With global attention, return global attention probabilities only
-                # batch_size x num_heads x max_num_global_attention_tokens x sequence_length
-                # which is the attention weights from tokens with global attention to all tokens
-                # It does not return local attention
-                # In case of a variable number of global attention tokens in the rows of a batch,
-                # attn_probs are padded with -10000.0 attention scores
-                attn_probs = attn_probs.view(batch_size, self.num_heads, max_num_global_attn_indices, seq_len)
+                # batch_size x num_heads x sequence_length x window_size
+                # which is the attention weights from all tokens to all tokens for global attention
+                # It does not return local attention. Only tokens with global attention have values > 0.0
+                attn_probs = attn_probs[:, :, :, :max_num_global_attn_indices]
+                # pad attn_probs to max length with 0.0 since global attn did not attend there
+                window_size = self.one_sided_attn_window_size * 2 + 1
+                attn_probs = F.pad(attn_probs, (0, window_size - max_num_global_attn_indices), value=0.0)
+                attn_probs = attn_probs.permute(0, 2, 1, 3)
             else:
                 # without global attention, return local attention probabilities
                 # batch_size x num_heads x sequence_length x window_size
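For illustration, here is a minimal, self-contained sketch of the new slice-pad-permute step on dummy tensors. The concrete sizes below are made up, and the assumption that attn_probs arrives as batch_size x seq_len x num_heads x window_size is inferred from the final permute(0, 2, 1, 3) in the diff rather than stated there. Padding with 0.0 rather than the old -10000.0 makes sense because attn_probs, as the name suggests, holds post-softmax probabilities: positions that carry no global attention should simply read as zero probability.

import torch
import torch.nn.functional as F

# illustrative sizes (hypothetical, not taken from the diff)
batch_size, seq_len, num_heads = 2, 16, 4
one_sided_attn_window_size = 3
window_size = one_sided_attn_window_size * 2 + 1  # 7, computed as in the diff
max_num_global_attn_indices = 2  # e.g. two tokens carry global attention

# assumed input layout: batch_size x seq_len x num_heads x window_size
attn_probs = torch.rand(batch_size, seq_len, num_heads, window_size)

# keep only the slots that correspond to global-attention tokens ...
attn_probs = attn_probs[:, :, :, :max_num_global_attn_indices]
# ... then pad the last dim back to window_size with 0.0, since global
# attention did not attend to the remaining positions
attn_probs = F.pad(attn_probs, (0, window_size - max_num_global_attn_indices), value=0.0)
# reorder to the documented batch_size x num_heads x seq_len x window_size
attn_probs = attn_probs.permute(0, 2, 1, 3)
assert attn_probs.shape == (batch_size, num_heads, seq_len, window_size)

After this change, the global-attention branch returns the same shape as the local-attention branch (batch_size x num_heads x sequence_length x window_size), which is what the "make attention output shape equal for local and global attention" item in the commit message refers to.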