Unverified commit 6ed9882d, authored by Thomas Viehmann, committed by GitHub

use functional interface for softmax in attention (#14198)

* use functional interface instead of instantiating module and immediately calling it

* fix torch.nn.functional to nn.functional. Thank you Stas!
parent 4176bc16
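
For reference, the two spellings swapped throughout this commit produce the same result: nn.Softmax is a thin wrapper around nn.functional.softmax, so calling the functional interface directly avoids constructing a throwaway module object on every forward pass without changing behavior. A minimal sketch of the equivalence (the tensor shape below is illustrative, not taken from the diff):

import torch
from torch import nn

# illustrative attention scores: (batch, heads, query_len, key_len)
attention_scores = torch.randn(2, 12, 5, 5)

# before: instantiate an nn.Softmax module and call it immediately
probs_module = nn.Softmax(dim=-1)(attention_scores)

# after: call the functional interface directly
probs_functional = nn.functional.softmax(attention_scores, dim=-1)

# identical probabilities either way
assert torch.allclose(probs_module, probs_functional)
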
@@ -262,7 +262,7 @@ class RobertaSelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -301,7 +301,7 @@ class RoFormerSelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -165,7 +165,7 @@ class SegformerEfficientSelfAttention(nn.Module):
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -202,7 +202,7 @@ class SplinterSelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -419,7 +419,7 @@ class TapasSelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -244,7 +244,7 @@ class VisualBertSelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -193,7 +193,7 @@ class ViTSelfAttention(nn.Module):
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
@@ -304,7 +304,7 @@ class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.