OpenDAS / Fairseq

Commit d6be0c7e, authored Apr 07, 2018 by Myle Ott

Use FP32 for multi-head attention softmax

Parent: 2d27ae08
Showing 1 changed file with 1 addition and 1 deletion (+1 / -1).
fairseq/modules/multihead_attention.py

@@ -129,7 +129,7 @@ class MultiheadAttention(nn.Module):
                 float('-inf'),
             ).type_as(attn_weights)  # FP16 support: cast to float and back
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)
         attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
         attn = torch.bmm(attn_weights, v)
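The change is a single line: the attention softmax is now computed in float32 and the result cast back to the original dtype, avoiding FP16 overflow and precision loss in the exp/normalization when training in half precision. A minimal standalone sketch of the same pattern follows; the helper name and toy shapes are illustrative, not part of fairseq.

import torch
import torch.nn.functional as F

def fp32_softmax(attn_weights: torch.Tensor) -> torch.Tensor:
    # Compute the softmax in float32 (masked positions hold float('-inf'),
    # and the exp/sum can overflow or lose precision in float16), then cast
    # back to the input dtype so the rest of the attention stays in FP16.
    return F.softmax(attn_weights.float(), dim=-1).type_as(attn_weights)

# Illustrative usage: FP16 scores shaped (bsz * num_heads, tgt_len, src_len).
scores = torch.randn(2 * 4, 5, 5).half()
probs = fp32_softmax(scores)
assert probs.dtype == torch.float16  # result is cast back to FP16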