OpenDAS / Megatron-LM

Commit c04c4977, authored Feb 04, 2022 by Lawrence McAfee
Parent: b93bef00

renamed argument; 'embed' -> 'embedding'
Showing 3 changed files with 6 additions and 6 deletions:

  megatron/arguments.py          +2 -2
  megatron/model/transformer.py  +2 -2
  megatron/mpu/initialize.py     +2 -2
megatron/arguments.py

@@ -68,7 +68,7 @@ def parse_args(extra_args_provider=None, defaults={},
         (args.world_size // args.tensor_model_parallel_size))
     args.transformer_pipeline_model_parallel_size = (
         args.pipeline_model_parallel_size - 1
-        if args.standalone_embed_stage else
+        if args.standalone_embedding_stage else
         args.pipeline_model_parallel_size
     )
     # Checks.

@@ -689,7 +689,7 @@ def _add_distributed_args(parser):
                        help='Call torch.cuda.empty_cache() each iteration '
                        '(training and eval), to reduce fragmentation.'
                        '0=off, 1=moderate, 2=aggressive.')
-    group.add_argument('--standalone-embed-stage', action='store_true',
+    group.add_argument('--standalone-embedding-stage', action='store_true',
                        default=False, help='If set, *input* embedding layer '
                        'is placed on its own pipeline stage, without any '
                        'transformer layers. (For T5, this flag currently only '
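The rename only touches the flag's name; the derived pipeline sizing is unchanged. As a rough illustration (not part of this commit, and with a hypothetical helper name), the arithmetic in the first hunk reserves one pipeline stage for the input embedding when the flag is set:

```python
# Illustrative sketch of the sizing logic above. The variable names mirror the
# Megatron-LM arguments, but this standalone function is hypothetical.
def transformer_pipeline_size(pipeline_model_parallel_size: int,
                              standalone_embedding_stage: bool) -> int:
    """Number of pipeline stages that actually hold transformer layers."""
    return (pipeline_model_parallel_size - 1
            if standalone_embedding_stage
            else pipeline_model_parallel_size)

# With 4 pipeline stages and --standalone-embedding-stage set, one stage is
# reserved for the input embedding and 3 stages carry transformer layers.
assert transformer_pipeline_size(4, True) == 3
assert transformer_pipeline_size(4, False) == 4
```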
megatron/model/transformer.py

@@ -546,7 +546,7 @@ class NoopTransformerLayer(MegatronModule):
     """A single 'no-op' transformer layer.
 
     The sole purpose of this layer is for when a standalone embedding layer
-    is used (i.e., args.standalone_embed_stage == True). In this case,
+    is used (i.e., args.standalone_embedding_stage == True). In this case,
     zero transformer layers are assigned when pipeline rank == 0. Additionally,
     when virtual pipeline rank >= 1, zero total model parameters are created
     (virtual rank 0 contains the input embedding). This results in the model's

@@ -635,7 +635,7 @@ class ParallelTransformer(MegatronModule):
         if self.num_layers == 0:
             # When a standalone embedding stage is used (e.g.,
-            # args.standalone_embed_stage == True), virtual pipeline ranks
+            # args.standalone_embedding_stage == True), virtual pipeline ranks
             # on pipeline rank 0 will have zero transformer layers assigned to
             # them. This results in the model's input and output tensors to be
             # the same, which will cause failure for certain output tensor
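For context, the comments above describe placing a pass-through layer on pipeline ranks that hold zero transformer layers, so that a stage's output tensor is not the same object as its input. A minimal PyTorch sketch of that idea (not the Megatron-LM NoopTransformerLayer itself; the class name is hypothetical) looks like this:

```python
import torch


class IdentityPassThroughLayer(torch.nn.Module):
    """Hypothetical stand-in for a 'no-op' pipeline layer (illustration only)."""

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # clone() returns a new tensor with the same values, so the stage's
        # output is disconnected from its input tensor object while the
        # values (and the autograd graph) are preserved.
        return hidden_states.clone()


x = torch.randn(4, 8)
y = IdentityPassThroughLayer()(x)
assert torch.equal(x, y) and y is not x
```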
megatron/mpu/initialize.py

@@ -330,7 +330,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
             # the same whether or not a standalone embedding stage is used.
             num_ranks_in_encoder = (
                 args.pipeline_model_parallel_split_rank - 1
-                if args.standalone_embed_stage else
+                if args.standalone_embedding_stage else
                 args.pipeline_model_parallel_split_rank
             )
             num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder

@@ -352,7 +352,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
             # or no layers at all (virtual pp rank >= 1).
             num_layers = (
                 0
-                if args.standalone_embed_stage
+                if args.standalone_embedding_stage
                 and get_pipeline_model_parallel_rank() == 0 else
                 args.num_layers // args.transformer_pipeline_model_parallel_size
             )
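The second hunk's arithmetic can be read as follows: when the flag is set, pipeline rank 0 holds only the embedding, and the remaining stages split the transformer layers evenly. An illustrative sketch with a hypothetical function name (not the Megatron-LM source):

```python
# Mirrors the per-rank layer count in the second hunk above (non-T5 branch).
def layers_on_rank(num_layers: int,
                   pipeline_model_parallel_size: int,
                   pipeline_rank: int,
                   standalone_embedding_stage: bool) -> int:
    transformer_stages = (pipeline_model_parallel_size - 1
                          if standalone_embedding_stage
                          else pipeline_model_parallel_size)
    if standalone_embedding_stage and pipeline_rank == 0:
        return 0  # embedding-only stage, no transformer layers
    return num_layers // transformer_stages

# 24 layers over 4 pipeline stages: rank 0 is embedding-only,
# ranks 1-3 each hold 24 // 3 = 8 transformer layers.
assert layers_on_rank(24, 4, 0, True) == 0
assert layers_on_rank(24, 4, 1, True) == 8
```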