Unverified Commit 6ed9882d authored by Thomas Viehmann, committed by GitHub

use functional interface for softmax in attention (#14198)

* use the functional interface instead of instantiating a module and immediately calling it

* fix torch.nn.functional to nn.functional. Thank you, Stas!
parent 4176bc16
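
For readers skimming the diff below: every hunk applies the same one-line change, replacing a throwaway nn.Softmax module with a direct call to nn.functional.softmax. A minimal sketch of the equivalence (the tensor shape here is illustrative, not taken from any of the patched models):

import torch
from torch import nn

scores = torch.randn(2, 12, 8, 8)  # illustrative (batch, heads, q_len, k_len) attention scores

# Before: instantiate an nn.Softmax module and immediately call it.
probs_module = nn.Softmax(dim=-1)(scores)

# After: call the functional interface directly, with no intermediate module object.
probs_functional = nn.functional.softmax(scores, dim=-1)

assert torch.allclose(probs_module, probs_functional)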
@@ -152,7 +152,7 @@ class BertSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -348,7 +348,7 @@ class AlbertAttention(nn.Module):
     attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -244,7 +244,7 @@ class BeitSelfAttention(nn.Module):
     attention_scores = attention_scores + relative_position_bias
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -477,7 +477,7 @@ class CanineSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -159,7 +159,7 @@ class DeiTSelfAttention(nn.Module):
     attention_scores = attention_scores / math.sqrt(self.attention_head_size)
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -206,7 +206,7 @@ class MultiHeadSelfAttention(nn.Module):
     mask = (mask == 0).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
     scores = scores.masked_fill(mask, -float("inf"))  # (bs, n_heads, q_length, k_length)
-    weights = nn.Softmax(dim=-1)(scores)  # (bs, n_heads, q_length, k_length)
+    weights = nn.functional.softmax(scores, dim=-1)  # (bs, n_heads, q_length, k_length)
     weights = self.dropout(weights)  # (bs, n_heads, q_length, k_length)
     # Mask heads if we want to
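
A note on the MultiHeadSelfAttention hunk above: the functional softmax interacts with the -inf masking exactly as the module form did, so masked key positions still come out with zero probability. A minimal sketch, with shapes and mask values made up for illustration:

import torch
from torch import nn

# Illustrative scores and padding mask: 1 = real token, 0 = padding.
scores = torch.randn(1, 2, 3, 4)        # (bs, n_heads, q_length, k_length)
mask = torch.tensor([[1, 1, 1, 0]])     # (bs, k_length)

mask = (mask == 0).view(1, 1, 1, 4).expand_as(scores)
scores = scores.masked_fill(mask, -float("inf"))    # padded keys get -inf
weights = nn.functional.softmax(scores, dim=-1)     # -inf positions become exactly 0

assert torch.all(weights[..., -1] == 0)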
@@ -319,7 +319,7 @@ class ElectraSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -209,7 +209,7 @@ class GPT2Attention(nn.Module):
     # Apply the attention mask
     attn_weights = attn_weights + attention_mask
-    attn_weights = nn.Softmax(dim=-1)(attn_weights)
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
     # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
     attn_weights = attn_weights.type(value.dtype)
@@ -260,7 +260,7 @@ class GPT2Attention(nn.Module):
     # Apply the attention mask
     attn_weights = attn_weights + attention_mask
-    attn_weights = nn.Softmax(dim=-1)(attn_weights)
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
     # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
     if attn_weights.dtype != torch.float32:
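
The context lines in the two GPT2Attention hunks downcast the softmax output back to the value tensor's dtype; switching to the functional call does not change that behavior. A hedged sketch of the pattern, with dtypes chosen only for illustration:

import torch
from torch import nn

value = torch.randn(2, 4, 8, 16, dtype=torch.float16)        # illustrative fp16 value tensor
attn_weights = torch.randn(2, 4, 8, 8, dtype=torch.float32)  # scores kept in fp32 for stability

attn_weights = nn.functional.softmax(attn_weights, dim=-1)
# Downcast (if necessary) back to V's dtype -- a no-op when the dtypes already match.
attn_weights = attn_weights.type(value.dtype)

assert attn_weights.dtype == torch.float16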
@@ -199,7 +199,7 @@ class GPTNeoSelfAttention(nn.Module):
     # Apply the attention mask
     attn_weights = attn_weights + attention_mask
-    attn_weights = nn.Softmax(dim=-1)(attn_weights)
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
     attn_weights = attn_weights.to(value.dtype)
     attn_weights = self.attn_dropout(attn_weights)
@@ -151,7 +151,7 @@ class GPTJAttention(nn.Module):
     # Apply the attention mask
     attn_weights = attn_weights + attention_mask
-    attn_weights = nn.Softmax(dim=-1)(attn_weights)
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
     attn_weights = attn_weights.to(value.dtype)
     attn_weights = self.attn_dropout(attn_weights)
@@ -409,7 +409,7 @@ class IntSoftmax(nn.Module):
     def forward(self, x, scaling_factor):
         if not self.quant_mode:
-            return nn.Softmax(dim=-1)(x), None
+            return nn.functional.softmax(x, dim=-1), None
         x_int = x / scaling_factor
@@ -235,7 +235,7 @@ class LayoutLMSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -399,7 +399,7 @@ class LukeSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -356,7 +356,7 @@ class LxmertAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -298,7 +298,7 @@ class MegatronBertSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
@@ -264,7 +264,7 @@ class MobileBertSelfAttention(nn.Module):
     # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.
     attention_probs = self.dropout(attention_probs)
@@ -184,7 +184,7 @@ class MPNetSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     attention_probs = self.dropout(attention_probs)
@@ -185,7 +185,7 @@ class Attention(nn.Module):
     # Apply the attention mask
     w = w + attention_mask
-    w = nn.Softmax(dim=-1)(w)
+    w = nn.functional.softmax(w, dim=-1)
     w = self.attn_dropout(w)
     # Mask heads if we want to
@@ -290,7 +290,7 @@ class RemBertSelfAttention(nn.Module):
     attention_scores = attention_scores + attention_mask
     # Normalize the attention scores to probabilities.
-    attention_probs = nn.Softmax(dim=-1)(attention_scores)
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
     # This is actually dropping out entire tokens to attend to, which might
     # seem a bit unusual, but is taken from the original Transformer paper.