Commit 6c2d0337 authored by Mohammad Shoeybi's avatar Mohammad Shoeybi
Browse files

added query-key layer scaling and softmax fp32 option

parent 691747b1
@@ -269,7 +269,7 @@ class ParallelSelfAttention(MegatronModule):
         # Attention probabilities. [b, np, s, s]
         if self.apply_query_key_layer_scaling:
             attention_scores = attention_scores * self.layer_number
-        attention_probs = torch.nn.Softmax(dim=-1)(attention_probs)
+        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
         with mpu.get_cuda_rng_tracker().fork():
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment