"vscode:/vscode.git/clone" did not exist on "bc0cee9d1c5da33ba96b8b3d2e904f3c21409e6a"
Commit ade99d61 authored by Vijay Korthikanti

avoid allocation of word embedding for head in T5 pp=2 case

parent 26ea8314
...
@@ -85,7 +85,9 @@ class MegatronModule(torch.nn.Module):
         # 3. In the training loop, before an all-reduce between the grads of
         #    the two word_embeddings layers to ensure that every applied weight
         #    update is the same on both stages.
-        if mpu.is_pipeline_last_stage():
+        if mpu.is_pipeline_last_stage() and \
+           (not hasattr(self.language_model, 'embedding') or
+            self.language_model.embedding is None):
             assert not mpu.is_pipeline_first_stage()
             self._word_embeddings_for_head_key = 'word_embeddings_for_head'
             # set word_embeddings weights to 0 here, then copy first
...
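For context, here is a minimal, self-contained sketch of the guard this commit introduces. It is not the Megatron-LM source: DummyLanguageModel and needs_word_embeddings_for_head are hypothetical names, standing in for the language model object and the patched condition, and the pipeline-stage flag replaces the real mpu.is_pipeline_last_stage() check.

    # Hypothetical stand-in for the language model held by a pipeline stage.
    class DummyLanguageModel:
        def __init__(self, has_embedding):
            # In T5 with pipeline-parallel size 2, the last stage already
            # owns an embedding, so no separate head copy is needed.
            self.embedding = object() if has_embedding else None

    def needs_word_embeddings_for_head(is_last_stage, language_model):
        """Mirror of the patched condition: allocate the duplicate
        word-embedding copy for the output head only when the last
        pipeline stage does not already own an embedding."""
        return is_last_stage and (
            not hasattr(language_model, 'embedding')
            or language_model.embedding is None)

    # GPT-style last stage without a local embedding: allocate the head copy.
    assert needs_word_embeddings_for_head(True, DummyLanguageModel(False))
    # T5 pp=2 last stage with an embedding already present: skip allocation,
    # which is exactly the redundant buffer this commit avoids.
    assert not needs_word_embeddings_for_head(True, DummyLanguageModel(True))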