Unverified Commit ecabbf6d authored by Thomas Wolf, committed by GitHub

Merge pull request #2107 from huggingface/encoder-mask-shape

create encoder attention mask from shape of hidden states
parents 1d189304 3520be78
@@ -692,17 +692,19 @@ class BertModel(BertPreTrainedModel):
         # If a 2D ou 3D attention mask is provided for the cross-attention
         # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if self.config.is_decoder:
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
-                encoder_attention_mask = torch.ones(input_shape, device=device)
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)

             if encoder_attention_mask.dim() == 3:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
             elif encoder_attention_mask.dim() == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
             else:
-                raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape,
-                                                                                                                     encoder_attention_mask.shape))
+                raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape,
+                                                                                                                               encoder_attention_mask.shape))

             encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
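The reason for the change: the cross-attention mask is applied over the encoder's positions, whose sequence length can differ from the decoder's input length, so building the default mask from input_shape produces a tensor of the wrong size. Below is a minimal, self-contained sketch (not part of the diff; batch size, lengths, and hidden size are made up for illustration) showing the old vs. new default mask shapes and the same broadcast-to-additive-mask convention used in the code above.

import torch

batch_size, decoder_len, encoder_len, hidden = 2, 5, 7, 16

decoder_input_shape = (batch_size, decoder_len)
encoder_hidden_states = torch.randn(batch_size, encoder_len, hidden)

# Old behaviour: default mask sized after the decoder inputs -> (2, 5),
# which cannot cover the 7 encoder positions seen by cross-attention.
old_mask = torch.ones(decoder_input_shape)

# New behaviour: default mask sized after the encoder hidden states -> (2, 7).
enc_batch, enc_seq, _ = encoder_hidden_states.size()
new_mask = torch.ones((enc_batch, enc_seq))

# Broadcast a 2D mask to [batch_size, num_heads, seq_length, seq_length]
# as in the diff, then convert it to additive form for the attention scores.
extended = new_mask[:, None, None, :]            # shape (2, 1, 1, 7)
extended = (1.0 - extended) * -10000.0           # 0 for kept positions, -10000 for masked

print(old_mask.shape, new_mask.shape, extended.shape)

Running the sketch prints torch.Size([2, 5]), torch.Size([2, 7]) and torch.Size([2, 1, 1, 7]), which makes the shape mismatch the patch fixes easy to see.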