Commit fc830685 authored by alexeib's avatar alexeib Committed by Myle Ott
Browse files

smarter way to avoid applying encoder key mask

parent b2374e52
...@@ -137,6 +137,8 @@ class TransformerEncoder(FairseqEncoder): ...@@ -137,6 +137,8 @@ class TransformerEncoder(FairseqEncoder):
# compute padding mask # compute padding mask
encoder_padding_mask = src_tokens.eq(self.padding_idx) encoder_padding_mask = src_tokens.eq(self.padding_idx)
if not encoder_padding_mask.any():
encoder_padding_mask = None
# encoder layers # encoder layers
for layer in self.layers: for layer in self.layers:
......
...@@ -122,15 +122,14 @@ class MultiheadAttention(nn.Module): ...@@ -122,15 +122,14 @@ class MultiheadAttention(nn.Module):
assert query.size() == key.size(), \ assert query.size() == key.size(), \
'mask_future_timesteps only applies to self-attention' 'mask_future_timesteps only applies to self-attention'
attn_weights += self.buffered_mask(attn_weights).unsqueeze(0) attn_weights += self.buffered_mask(attn_weights).unsqueeze(0)
if key_padding_mask is not None and incremental_state is None: if key_padding_mask is not None:
# don't attend to padding symbols # don't attend to padding symbols
if utils.item(key_padding_mask.max()) > 0: attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights.masked_fill(
attn_weights = attn_weights.masked_fill( key_padding_mask.unsqueeze(1).unsqueeze(2),
key_padding_mask.unsqueeze(1).unsqueeze(2), -math.inf,
-math.inf, )
) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = F.softmax(attn_weights, dim=-1) attn_weights = F.softmax(attn_weights, dim=-1)
attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training) attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
......
Markdown is supported
0% — Loading — or attach a file by drag and drop.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment