OpenDAS / Megatron-LM, commit 26ea8314
fixing the corner case pp=2
Authored Dec 30, 2021 by Vijay Korthikanti
Parent: 81ad628e

Showing 1 changed file: megatron/model/module.py (+17, -19)
@@ -51,8 +51,8 @@ class MegatronModule(torch.nn.Module):

    def word_embeddings_weight(self):
        if not mpu.is_pipeline_last_stage(ignore_virtual=True) or \
                mpu.get_pipeline_model_parallel_world_size() == 1:
            if hasattr(self.language_model, 'embedding') and \
                    self.language_model.embedding is not None:
                return self.language_model.embedding.word_embeddings.weight
        else:
            if not self.share_word_embeddings:
                ...
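A minimal sketch of how a shared word-embedding weight such as the one returned above is typically consumed for a tied output projection. The tensor names, sizes, and the F.linear call are illustrative only and are not taken from this commit or from Megatron-LM's own code:

    import torch
    import torch.nn.functional as F

    # Toy stand-ins for the shared embedding weight and a hidden-state batch.
    vocab_size, hidden_size = 50304, 1024
    shared_weight = torch.randn(vocab_size, hidden_size)   # plays the role of word_embeddings_weight()
    hidden = torch.randn(2, 5, hidden_size)                 # [batch, seq, hidden]

    # Tied output projection: reuse the embedding matrix to produce logits.
    logits = F.linear(hidden, shared_weight)                 # [batch, seq, vocab]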
@@ -99,8 +99,9 @@ class MegatronModule(torch.nn.Module):
         # Zero out initial weights for decoder embedding.
         # NOTE: We don't currently support T5 with the interleaved schedule.
         if not mpu.is_pipeline_first_stage(ignore_virtual=True) and \
                 not mpu.is_pipeline_last_stage(ignore_virtual=True) and \
-                mpu.is_rank_in_embedding_group():
+                mpu.is_rank_in_embedding_group() and \
+                hasattr(self.language_model, 'embedding') and \
+                self.language_model.embedding is not None:
             self.language_model.embedding.zero_parameters()

         # Ensure that first and last stages have the same initial parameter
...
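The zero_parameters() call above appears to pair with the all-reduce in the next hunk: if every rank except the one holding the initialized embedding zeros its copy, a SUM all-reduce over the embedding group leaves each rank with the owner's values. A single-process toy of that arithmetic; the tensor names and sizes here are made up for illustration:

    import torch

    owner_copy = torch.randn(4, 3)               # rank that keeps its initialized weights
    zeroed_copy = torch.zeros_like(owner_copy)   # rank that called zero_parameters()
    reduced = owner_copy + zeroed_copy           # element-wise SUM, as all_reduce computes
    assert torch.equal(reduced, owner_copy)      # every rank ends up with the owner's values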
@@ -109,21 +110,18 @@ class MegatronModule(torch.nn.Module):
             if mpu.is_rank_in_embedding_group():
                 torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                              group=mpu.get_embedding_group())

             # All-reduce other embeddings as well as necessary. The last stage
             # does not have these other embeddings, so just create placeholder
             # tensors of the right shape with all zeros.
             # NOTE: We don't currently support T5 with the interleaved schedule.
-            if args.pipeline_model_parallel_split_rank is not None:
-                # TODO: Support tokentype embedding.
-                dimensions = (args.max_position_embeddings, args.hidden_size)
-                if mpu.is_pipeline_last_stage(ignore_virtual=True):
-                    position_embeddings = torch.nn.Embedding(*dimensions).cuda()
-                    position_embeddings.weight.data.fill_(0)
-                else:
-                    self.language_model.embedding.cuda()
-                    position_embeddings = self.language_model.embedding.position_embeddings
-                torch.distributed.all_reduce(position_embeddings.weight.data,
-                                             group=mpu.get_embedding_group())
+            if mpu.is_rank_in_position_embedding_group() and \
+               args.pipeline_model_parallel_split_rank is not None:
+                # TODO: Support tokentype embedding.
+                self.language_model.embedding.cuda()
+                position_embeddings = self.language_model.embedding.position_embeddings
+                torch.distributed.all_reduce(position_embeddings.weight.data,
+                                             group=mpu.get_position_embedding_group())
         else:
             print("WARNING! Distributed processes aren't initialized, so "
                   "word embeddings in the last layer are not initialized. "
...
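A self-contained sketch of the synchronization pattern the hunk above relies on: two processes form a group, every member but one zeros its embedding, and a SUM all-reduce leaves both with identical weights. Only standard torch.distributed calls (init_process_group, new_group, all_reduce) are used; the gloo backend, the port, the toy sizes, and the sync_embeddings name are choices made for this illustration, not values from Megatron-LM:

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp

    def sync_embeddings(rank, world_size):
        dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500",
                                rank=rank, world_size=world_size)
        emb = torch.nn.Embedding(8, 4)                    # toy stand-in for a position embedding
        group = dist.new_group(ranks=list(range(world_size)))
        if rank != 0:
            with torch.no_grad():
                emb.weight.zero_()                        # mimic zero_parameters() on non-owner ranks
        dist.all_reduce(emb.weight.data, group=group)     # SUM: every rank now holds rank 0's weights
        print(f"rank {rank} weight checksum: {emb.weight.sum().item():.4f}")
        dist.destroy_process_group()

    if __name__ == "__main__":
        mp.spawn(sync_embeddings, args=(2,), nprocs=2)

Running it prints the same checksum on both ranks, which is the invariant the commit's guards are protecting when a stage has no embedding of its own.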