[megatron gpt checkpoint conversion] causal mask requires pos_embed dimension (#13735)

400c5a15 · Stas Bekman · GitHub · 91df4551 · 400c5a15
Unverified commit 400c5a15, authored Sep 26, 2021 by Stas Bekman; committed by GitHub on Sep 26, 2021.
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 5 additions and 6 deletions

src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py (+5 -6)

No files found.
--- a/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
+++ b/src/transformers/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py
@@ -121,12 +121,11 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
    # The position embeddings.
    pos_embeddings = embeddings["position_embeddings"]["weight"]
-    # Read the hidden dimension.
+    # Read the causal mask dimension (seqlen). [max_sequence_length, hidden_size]
-    n_embed = pos_embeddings.size(1)
+    n_ctx = pos_embeddings.size(0)
-    # DEBUG.
    assert (
-        n_embed == heads * hidden_size_per_head
+        n_ctx == config.n_ctx
-    ), f"detected mismatch n_embed={n_embed} != heads={heads}*hidden_size_per_head={hidden_size_per_head}"
+    ), f"pos_embeddings.max_sequence_length={n_ctx} and config.n_ctx={config.n_ctx} don't match"
    # Store the position embeddings.
    output_state_dict["transformer.wpe.weight"] = pos_embeddings
@@ -175,7 +174,7 @@ def convert_megatron_checkpoint(args, input_state_dict, config):
        ) and weight_or_bias == "weight":
            # Insert a tensor of 1x1xDxD bias.
-            causal_mask = torch.tril(torch.ones((n_embed, n_embed), dtype=torch.float16)).view(1, 1, n_embed, n_embed)
+            causal_mask = torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.float16)).view(1, 1, n_ctx, n_ctx)
            output_state_dict[layer_name + ".attn.bias"] = causal_mask
            # Insert a "dummy" tensor for masked_bias.