Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
8bdee1cb
Commit
8bdee1cb
authored
Sep 11, 2019
by
Zili Wang
Browse files
Fixed: hard-coded max/min constants overflow the fp16 range, which causes NaN.
parent
7424b284
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
26 additions
and
15 deletions
+26
-15
pytorch_transformers/modeling_transfo_xl.py
pytorch_transformers/modeling_transfo_xl.py
+21
-13
pytorch_transformers/modeling_utils.py
pytorch_transformers/modeling_utils.py
+5
-2
No files found.
pytorch_transformers/modeling_transfo_xl.py
View file @
8bdee1cb
...
@@ -451,9 +451,17 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
...
@@ -451,9 +451,17 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
if
attn_mask
is
not
None
and
torch
.
sum
(
attn_mask
).
item
():
if
attn_mask
is
not
None
and
torch
.
sum
(
attn_mask
).
item
():
attn_mask
=
(
attn_mask
==
1
)
# Switch to bool
attn_mask
=
(
attn_mask
==
1
)
# Switch to bool
if
attn_mask
.
dim
()
==
2
:
if
attn_mask
.
dim
()
==
2
:
if
next
(
self
.
parameters
()).
dtype
==
torch
.
float16
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[
None
,:,:,
None
],
-
65000
).
type_as
(
attn_score
)
else
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[
None
,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
attn_mask
[
None
,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
elif
attn_mask
.
dim
()
==
3
:
elif
attn_mask
.
dim
()
==
3
:
if
next
(
self
.
parameters
()).
dtype
==
torch
.
float16
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[:,:,:,
None
],
-
65000
).
type_as
(
attn_score
)
else
:
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_score
=
attn_score
.
float
().
masked_fill
(
attn_mask
[:,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
attn_mask
[:,:,:,
None
],
-
1e30
).
type_as
(
attn_score
)
...
...
pytorch_transformers/modeling_utils.py
View file @
8bdee1cb
...
@@ -434,6 +434,9 @@ class PoolerStartLogits(nn.Module):
...
@@ -434,6 +434,9 @@ class PoolerStartLogits(nn.Module):
x
=
self
.
dense
(
hidden_states
).
squeeze
(
-
1
)
x
=
self
.
dense
(
hidden_states
).
squeeze
(
-
1
)
if
p_mask
is
not
None
:
if
p_mask
is
not
None
:
if
next
(
self
.
parameters
()).
dtype
==
torch
.
float16
:
x
=
x
*
(
1
-
p_mask
)
-
65500
*
p_mask
else
:
x
=
x
*
(
1
-
p_mask
)
-
1e30
*
p_mask
x
=
x
*
(
1
-
p_mask
)
-
1e30
*
p_mask
return
x
return
x
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment