Commit c04c4977 authored by Lawrence McAfee

renamed argument; 'embed' -> 'embedding'

parent b93bef00
@@ -68,7 +68,7 @@ def parse_args(extra_args_provider=None, defaults={},
         (args.world_size // args.tensor_model_parallel_size))
     args.transformer_pipeline_model_parallel_size = (
         args.pipeline_model_parallel_size - 1
-        if args.standalone_embed_stage else
+        if args.standalone_embedding_stage else
         args.pipeline_model_parallel_size
     )
     # Checks.
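To illustrate the arithmetic in the hunk above, a minimal self-contained sketch with hypothetical sizes (a SimpleNamespace stands in for Megatron's parsed args):

from types import SimpleNamespace

# Hypothetical configuration: 8 pipeline stages, embedding on its own stage.
args = SimpleNamespace(pipeline_model_parallel_size=8,
                       standalone_embedding_stage=True)

# Same expression as in the hunk above: one stage is reserved for the input
# embedding, so only the remaining stages hold transformer layers.
args.transformer_pipeline_model_parallel_size = (
    args.pipeline_model_parallel_size - 1
    if args.standalone_embedding_stage else
    args.pipeline_model_parallel_size
)

print(args.transformer_pipeline_model_parallel_size)  # prints 7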
@@ -689,7 +689,7 @@ def _add_distributed_args(parser):
                        help='Call torch.cuda.empty_cache() each iteration '
                        '(training and eval), to reduce fragmentation.'
                        '0=off, 1=moderate, 2=aggressive.')
-    group.add_argument('--standalone-embed-stage', action='store_true',
+    group.add_argument('--standalone-embedding-stage', action='store_true',
                        default=False, help='If set, *input* embedding layer '
                        'is placed on its own pipeline stage, without any '
                        'transformer layers. (For T5, this flag currently only '
...
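As a usage note for the renamed flag, a minimal argparse sketch (a bare parser rather than Megatron's real argument groups) showing that the dashed option name surfaces as args.standalone_embedding_stage:

import argparse

# Hypothetical stand-in for the distributed-args group; only the renamed flag.
parser = argparse.ArgumentParser()
parser.add_argument('--standalone-embedding-stage', action='store_true',
                    default=False,
                    help='If set, *input* embedding layer is placed on its '
                         'own pipeline stage, without any transformer layers.')

# argparse maps the dashed option name to an underscored attribute name.
args = parser.parse_args(['--standalone-embedding-stage'])
print(args.standalone_embedding_stage)  # prints True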
@@ -546,7 +546,7 @@ class NoopTransformerLayer(MegatronModule):
     """A single 'no-op' transformer layer.

     The sole purpose of this layer is for when a standalone embedding layer
-    is used (i.e., args.standalone_embed_stage == True). In this case,
+    is used (i.e., args.standalone_embedding_stage == True). In this case,
     zero transformer layers are assigned when pipeline rank == 0. Additionally,
     when virtual pipeline rank >= 1, zero total model parameters are created
     (virtual rank 0 contains the input embedding). This results in the model's
@@ -635,7 +635,7 @@ class ParallelTransformer(MegatronModule):
         if self.num_layers == 0:
             # When a standalone embedding stage is used (e.g.,
-            # args.standalone_embed_stage == True), virtual pipeline ranks
+            # args.standalone_embedding_stage == True), virtual pipeline ranks
             # on pipeline rank 0 will have zero transformer layers assigned to
             # them. This results in the model's input and output tensors to be
             # the same, which will cause failure for certain output tensor
...
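For context on the comment above, a minimal PyTorch sketch of a parameter-free 'no-op' layer that keeps the output tensor distinct from the input; this is an illustrative stand-in, not the exact NoopTransformerLayer implementation:

import torch

class NoopLayerSketch(torch.nn.Module):
    """Hypothetical no-op layer: holds no parameters and passes hidden states
    through, cloning them so the output is a distinct tensor object from the
    input (sidestepping the same-tensor issue described in the comment)."""

    def forward(self, hidden_states):
        return hidden_states.clone()

layer = NoopLayerSketch()
x = torch.randn(4, 2, 8)   # toy [sequence, batch, hidden] shape
y = layer(x)
assert y is not x          # distinct tensor objects
assert torch.equal(y, x)   # identical values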
@@ -330,7 +330,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
             # the same whether or not a standalone embedding stage is used.
             num_ranks_in_encoder = (
                 args.pipeline_model_parallel_split_rank - 1
-                if args.standalone_embed_stage else
+                if args.standalone_embedding_stage else
                 args.pipeline_model_parallel_split_rank
             )
             num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder
@@ -352,7 +352,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
             # or no layers at all (virtual pp rank >= 1).
             num_layers = (
                 0
-                if args.standalone_embed_stage
+                if args.standalone_embedding_stage
                 and get_pipeline_model_parallel_rank() == 0 else
                 args.num_layers // args.transformer_pipeline_model_parallel_size
             )
...
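To make the two calculations above concrete, a hedged sketch with hypothetical sizes; the surrounding branches of get_num_layers are not shown in this diff, so the two computations are run independently here:

from types import SimpleNamespace

# Hypothetical encoder-decoder split (first hunk): split rank 4, 8 transformer
# pipeline stages, standalone embedding stage enabled.
enc_dec = SimpleNamespace(standalone_embedding_stage=True,
                          pipeline_model_parallel_split_rank=4,
                          transformer_pipeline_model_parallel_size=8)
num_ranks_in_encoder = (
    enc_dec.pipeline_model_parallel_split_rank - 1
    if enc_dec.standalone_embedding_stage else
    enc_dec.pipeline_model_parallel_split_rank
)
num_ranks_in_decoder = (
    enc_dec.transformer_pipeline_model_parallel_size - num_ranks_in_encoder
)
print(num_ranks_in_encoder, num_ranks_in_decoder)  # prints 3 5

# Hypothetical layer split (second hunk): 24 layers over 4 transformer
# pipeline stages; pipeline rank 0 holds only the input embedding.
dec = SimpleNamespace(standalone_embedding_stage=True,
                      num_layers=24,
                      transformer_pipeline_model_parallel_size=4)
for pipeline_rank in range(5):  # rank 0 is the embedding-only stage
    num_layers = (
        0
        if dec.standalone_embedding_stage and pipeline_rank == 0 else
        dec.num_layers // dec.transformer_pipeline_model_parallel_size
    )
    print(pipeline_rank, num_layers)  # rank 0 -> 0, ranks 1..4 -> 6 each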