Commit 6c2d0337 authored by Mohammad Shoeybi's avatar Mohammad Shoeybi
Browse files

added query-key layer scaling and softmax fp32 option

parent 691747b1
@@ -269,7 +269,7 @@ class ParallelSelfAttention(MegatronModule):
         # Attention probabilities. [b, np, s, s]
         if self.apply_query_key_layer_scaling:
             attention_scores = attention_scores * self.layer_number
-        attention_probs = torch.nn.Softmax(dim=-1)(attention_probs)
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
         with mpu.get_cuda_rng_tracker().fork():
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment