chenpangpang/transformers — commit 886ef35c (unverified)
Authored Sep 30, 2020 by Lysandre Debut; committed by GitHub on Sep 30, 2020

Fix LXMERT with DataParallel (#7471)
parent 35e94c68

Showing 1 changed file with 1 addition and 1 deletion

src/transformers/modeling_lxmert.py (+1, -1)
...
@@ -958,7 +958,7 @@ class LxmertModel(LxmertPreTrainedModel):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

         # Process the visual attention mask
...
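For context, a minimal sketch (hypothetical, not LXMERT itself) of why the removed line failed under torch.nn.DataParallel: since PyTorch 1.5, the per-GPU replicas that DataParallel creates no longer register their parameters, so self.parameters() is empty inside a replica's forward pass and next(self.parameters()) raises StopIteration. The replacement, self.dtype, is a property on transformers' ModuleUtilsMixin that catches this case and falls back to scanning module attributes for a tensor, so it keeps working after replication. The toy module below only reproduces the failure pattern.

import torch
import torch.nn as nn


class ToyMaskModule(nn.Module):
    """Hypothetical toy module mimicking the pre-fix LXMERT pattern."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def forward(self, attention_mask):
        # Pre-fix pattern: inside a DataParallel replica on PyTorch >= 1.5,
        # self.parameters() yields nothing, so next(...) raises StopIteration.
        dtype = next(self.parameters()).dtype
        mask = attention_mask.to(dtype=dtype)
        # The additive-mask trick from the diff context: 0.0 where we attend,
        # -10000.0 where we mask, so the softmax sends masked scores to ~0.
        return (1.0 - mask) * -10000.0


if torch.cuda.device_count() > 1:
    model = nn.DataParallel(ToyMaskModule().cuda())
    # Fails inside each replica; a dtype lookup that tolerates an empty
    # parameters() iterator (as self.dtype does) avoids the crash.
    model(torch.ones(2, 4, device="cuda"))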