Commit 87d60b6e authored by Rémi Louf

reword explanation of encoder_attention_mask

parent 638fe7f5
@@ -201,9 +201,9 @@ class BertSelfAttention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         mixed_query_layer = self.query(hidden_states)
-        # if the attention Module is a encoder-decoder self attention module
-        # they keys & values are given by the encoder; the attention mask
-        # needs to be such that there is no atention on the encoder's padding tokens.
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
         if encoder_hidden_states is not None:
             mixed_key_layer = self.key(encoder_hidden_states)
             mixed_value_layer = self.value(encoder_hidden_states)
...
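The reworded comment describes cross-attention: queries are projected from the decoder's hidden states while keys and values are projected from the encoder's hidden states, and the encoder's padding positions must receive no attention. The following is a minimal single-head sketch of that idea, not the transformers implementation; the helper name cross_attention and the explicit weight matrices are illustrative only, and encoder_attention_mask follows the usual 1-for-token / 0-for-padding convention.

import torch
import torch.nn.functional as F

def cross_attention(decoder_hidden, encoder_hidden, encoder_attention_mask,
                    w_q, w_k, w_v):
    # Queries come from the decoder; keys and values come from the encoder.
    q = decoder_hidden @ w_q                     # (batch, tgt_len, d)
    k = encoder_hidden @ w_k                     # (batch, src_len, d)
    v = encoder_hidden @ w_v                     # (batch, src_len, d)

    # Scaled dot-product scores over encoder positions.
    scores = q @ k.transpose(-1, -2) / (q.size(-1) ** 0.5)   # (batch, tgt_len, src_len)

    # Turn the padding mask into an additive bias: padded encoder tokens get a
    # large negative score, so their post-softmax attention weight is ~0.
    additive_mask = (1.0 - encoder_attention_mask[:, None, :].float()) * -1e9
    scores = scores + additive_mask

    weights = F.softmax(scores, dim=-1)
    return weights @ v                           # (batch, tgt_len, d)

# Example usage with random tensors (d = 8, two encoder padding positions):
d = 8
dec = torch.randn(1, 3, d)
enc = torch.randn(1, 5, d)
enc_mask = torch.tensor([[1, 1, 1, 0, 0]])
w_q, w_k, w_v = (torch.randn(d, d) for _ in range(3))
out = cross_attention(dec, enc, enc_mask, w_q, w_k, w_v)   # (1, 3, 8)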