"src/vscode:/vscode.git/clone" did not exist on "896fb6d8d7c10001eb2a92568be7b4bd3d5ddea3"
Commit c04c4977 authored by Lawrence McAfee

renamed argument; 'embed' -> 'embedding'

parent b93bef00
@@ -68,7 +68,7 @@ def parse_args(extra_args_provider=None, defaults={},
         (args.world_size // args.tensor_model_parallel_size))
     args.transformer_pipeline_model_parallel_size = (
         args.pipeline_model_parallel_size - 1
-        if args.standalone_embed_stage else
+        if args.standalone_embedding_stage else
         args.pipeline_model_parallel_size
     )
     # Checks.
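
Note: for concreteness, a hedged sketch of the arithmetic in the hunk above (the values below are illustrative only, not from this commit):

    pipeline_model_parallel_size = 4   # illustrative value
    standalone_embedding_stage = True  # set via the renamed --standalone-embedding-stage flag

    # Mirrors the expression above: with a standalone embedding stage, one
    # pipeline rank carries no transformer layers, so the "transformer"
    # pipeline size is one smaller than the full pipeline size.
    transformer_pipeline_model_parallel_size = (
        pipeline_model_parallel_size - 1
        if standalone_embedding_stage else
        pipeline_model_parallel_size
    )
    assert transformer_pipeline_model_parallel_size == 3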
@@ -689,7 +689,7 @@ def _add_distributed_args(parser):
                        help='Call torch.cuda.empty_cache() each iteration '
                        '(training and eval), to reduce fragmentation.'
                        '0=off, 1=moderate, 2=aggressive.')
-    group.add_argument('--standalone-embed-stage', action='store_true',
+    group.add_argument('--standalone-embedding-stage', action='store_true',
                        default=False, help='If set, *input* embedding layer '
                        'is placed on its own pipeline stage, without any '
                        'transformer layers. (For T5, this flag currently only '
......
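
Note: since the old spelling is removed rather than aliased, launch scripts need the new flag name. A minimal argparse sketch (the parser setup here is illustrative, not Megatron-LM's) showing how the renamed flag maps onto the attribute used elsewhere in this diff:

    import argparse

    # Stand-in parser; only the renamed flag is registered here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--standalone-embedding-stage', action='store_true',
                        default=False,
                        help='Place the input embedding on its own pipeline stage.')

    # argparse converts dashes to underscores, which is why the rest of the
    # diff reads args.standalone_embedding_stage.
    args = parser.parse_args(['--standalone-embedding-stage'])
    assert args.standalone_embedding_stage is True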
@@ -546,7 +546,7 @@ class NoopTransformerLayer(MegatronModule):
     """A single 'no-op' transformer layer.

     The sole purpose of this layer is for when a standalone embedding layer
-    is used (i.e., args.standalone_embed_stage == True). In this case,
+    is used (i.e., args.standalone_embedding_stage == True). In this case,
     zero transformer layers are assigned when pipeline rank == 0. Additionally,
     when virtual pipeline rank >= 1, zero total model parameters are created
     (virtual rank 0 contains the input embedding). This results in the model's
@@ -635,7 +635,7 @@ class ParallelTransformer(MegatronModule):
         if self.num_layers == 0:
             # When a standalone embedding stage is used (e.g.,
-            # args.standalone_embed_stage == True), virtual pipeline ranks
+            # args.standalone_embedding_stage == True), virtual pipeline ranks
             # on pipeline rank 0 will have zero transformer layers assigned to
             # them. This results in the model's input and output tensors to be
             # the same, which will cause failure for certain output tensor
......
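
Note: a minimal, hedged sketch (illustrative names, not the Megatron-LM implementation) of the guard the two hunks above describe: a parameter-free layer whose only job is to return a tensor distinct from its input, so a zero-layer stage still has separate input and output tensors:

    import torch

    class _NoopLayerSketch(torch.nn.Module):
        # Illustrative stand-in for a 'no-op' transformer layer: no parameters,
        # no computation, just break aliasing between the stage's input and output.
        def forward(self, hidden_states, *unused_args, **unused_kwargs):
            return hidden_states.clone()

    num_layers = 0  # e.g., a virtual rank on pipeline rank 0 with a standalone embedding stage
    if num_layers == 0:
        num_layers = 1
        layers = torch.nn.ModuleList([_NoopLayerSketch()])

    out = layers[0](torch.ones(2, 3))
    assert out.shape == (2, 3)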
@@ -330,7 +330,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
         # the same whether or not a standalone embedding stage is used.
         num_ranks_in_encoder = (
             args.pipeline_model_parallel_split_rank - 1
-            if args.standalone_embed_stage else
+            if args.standalone_embedding_stage else
             args.pipeline_model_parallel_split_rank
         )
         num_ranks_in_decoder = args.transformer_pipeline_model_parallel_size - num_ranks_in_encoder
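
Note: a worked example of the encoder/decoder rank split above (numbers are illustrative only):

    pipeline_model_parallel_split_rank = 4          # illustrative encoder/decoder split point
    transformer_pipeline_model_parallel_size = 7    # e.g., 8 pipeline ranks minus the embedding stage
    standalone_embedding_stage = True

    # With the embedding on its own stage, one of the ranks before the split
    # point holds no transformer layers, so the encoder gets one fewer rank.
    num_ranks_in_encoder = (
        pipeline_model_parallel_split_rank - 1
        if standalone_embedding_stage else
        pipeline_model_parallel_split_rank
    )
    num_ranks_in_decoder = transformer_pipeline_model_parallel_size - num_ranks_in_encoder
    assert (num_ranks_in_encoder, num_ranks_in_decoder) == (3, 4)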
@@ -352,7 +352,7 @@ def get_num_layers(args, is_encoder_and_decoder_model):
         # or no layers at all (virtual pp rank >= 1).
         num_layers = (
             0
-            if args.standalone_embed_stage
+            if args.standalone_embedding_stage
             and get_pipeline_model_parallel_rank() == 0 else
             args.num_layers // args.transformer_pipeline_model_parallel_size
         )
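
Note: a worked example of the per-rank layer count above (illustrative values; the helper below stands in for get_pipeline_model_parallel_rank()):

    total_num_layers = 24                           # illustrative
    transformer_pipeline_model_parallel_size = 3    # e.g., 4 pipeline ranks minus the embedding stage
    standalone_embedding_stage = True

    def layers_on_rank(pipeline_rank):
        # Mirrors the expression above: pipeline rank 0 holds only the input
        # embedding (zero transformer layers); the remaining ranks split the
        # rest evenly.
        return (
            0
            if standalone_embedding_stage and pipeline_rank == 0 else
            total_num_layers // transformer_pipeline_model_parallel_size
        )

    assert [layers_on_rank(r) for r in range(4)] == [0, 8, 8, 8]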
......