Unverified Commit e02037b3 authored by Andrej, committed by GitHub

Fix bug in gpt2's (from-scratch) special scaled weight initialization (#17877)



* only special scale init each gpt2 c_proj weight once, on exact match

* fix double quotes
Co-authored-by: leandro <leandro.vonwerra@spoud.io>
parent 6dd00f6b
@@ -484,7 +484,7 @@ class GPT2PreTrainedModel(PreTrainedModel):
         #
         # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
         for name, p in module.named_parameters():
-            if "c_proj" in name and "weight" in name:
+            if name == "c_proj.weight":
                 # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
                 p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
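For context, here is a minimal standalone sketch of the bug (the Block and Model classes below are hypothetical stand-ins, not the actual transformers modules): _init_weights is invoked once per submodule via .apply(), and named_parameters() recurses, so the same c_proj weight appears under every one of its ancestor modules. The old substring check therefore re-initialized each weight once per ancestor; the exact match fires only on the weight's direct parent, so the scaled init runs exactly once per projection.

import torch.nn as nn

class Block(nn.Module):
    """Hypothetical stand-in for a transformer block holding a c_proj layer."""
    def __init__(self):
        super().__init__()
        self.c_proj = nn.Linear(4, 4)  # stand-in for the Conv1D projection

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.h = nn.ModuleList([Block() for _ in range(2)])

model = Model()
substring_hits = exact_hits = 0

def count(module):
    # Mirrors how _init_weights is called: once per submodule via .apply().
    global substring_hits, exact_hits
    for name, p in module.named_parameters():
        if "c_proj" in name and "weight" in name:  # old check
            substring_hits += 1
        if name == "c_proj.weight":  # fixed check
            exact_hits += 1

model.apply(count)
# Each c_proj weight is visible under Model ("h.0.c_proj.weight"), the
# ModuleList ("0.c_proj.weight"), and its Block ("c_proj.weight"), so the
# substring check matches it three times; the exact match fires once.
print(substring_hits)  # 6  (2 weights x 3 ancestor modules)
print(exact_hits)      # 2  (once per c_proj, on its direct parent Block)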