OpenDAS / Megatron-LM · Commits

Commit dcef9069
Authored Feb 13, 2021 by Deepak Narayanan

Change argument to control the number of model chunks in a stage

Parent: 5489bda9
Changes (1)

Showing 1 changed file with 12 additions and 4 deletions:

megatron/arguments.py (+12, -4)
megatron/arguments.py (view file @ dcef9069)

@@ -116,10 +116,18 @@ def parse_args(extra_args_provider=None, defaults={},

Before:

        print('setting global batch size to {}'.format(
            args.global_batch_size), flush=True)
    assert args.global_batch_size > 0
    if args.virtual_pipeline_model_parallel_size is not None:
        assert args.global_batch_size % args.pipeline_model_parallel_size == 0, \
            'global batch size is not divisible by pipeline parallel size when ' \
            'using interleaved schedule'

    # Parameters dtype.
    args.params_dtype = torch.float

After:

        print('setting global batch size to {}'.format(
            args.global_batch_size), flush=True)
    assert args.global_batch_size > 0
    if args.num_layers_per_virtual_pipeline_stage is not None:
        assert args.num_layers % args.num_layers_per_virtual_pipeline_stage == 0, \
            'number of layers is not divisible by number of layers per virtual ' \
            'pipeline stage'
        args.virtual_pipeline_model_parallel_size = \
            (args.num_layers // args.pipeline_model_parallel_size) // \
            args.num_layers_per_virtual_pipeline_stage
        assert args.global_batch_size % args.pipeline_model_parallel_size == 0, \
            'global batch size is not divisible by pipeline parallel size when ' \
            'using interleaved schedule'
    else:
        args.virtual_pipeline_model_parallel_size = None

    # Parameters dtype.
    args.params_dtype = torch.float
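As a worked example of the derivation in this hunk (a minimal sketch with made-up values, not part of the commit): with 24 layers, a pipeline-parallel size of 4, and 3 layers per virtual pipeline stage, each physical stage holds 24 / 4 = 6 layers, split into 6 / 3 = 2 model chunks, so the derived virtual_pipeline_model_parallel_size is 2.

    # Sketch of the arithmetic above with hypothetical values (not from this commit).
    num_layers = 24                            # total transformer layers
    pipeline_model_parallel_size = 4           # physical pipeline stages
    num_layers_per_virtual_pipeline_stage = 3  # new argument introduced here

    assert num_layers % num_layers_per_virtual_pipeline_stage == 0, \
        'number of layers is not divisible by number of layers per virtual pipeline stage'

    # Layers per physical stage, then model chunks (virtual stages) per physical stage.
    layers_per_stage = num_layers // pipeline_model_parallel_size            # 6
    virtual_pipeline_model_parallel_size = \
        layers_per_stage // num_layers_per_virtual_pipeline_stage            # 2

    print(virtual_pipeline_model_parallel_size)  # -> 2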
@@ -561,8 +569,8 @@ def _add_distributed_args(parser):

Before:

    group.add_argument('--model-parallel-size', type=int, default=None,
                       help='Old model parallel argument, do not use. Use '
                       '--tensor-model-parallel-size instead.')
    group.add_argument('--virtual-pipeline-model-parallel-size', type=int, default=None,
                       help='Number of virtual pipeline stages in physical stage.')
    group.add_argument('--distributed-backend', default='nccl',
                       choices=['nccl', 'gloo'],
                       help='Which backend to use for distributed training.')

After:

    group.add_argument('--model-parallel-size', type=int, default=None,
                       help='Old model parallel argument, do not use. Use '
                       '--tensor-model-parallel-size instead.')
    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                       help='Number of layers per virtual pipeline stage')
    group.add_argument('--distributed-backend', default='nccl',
                       choices=['nccl', 'gloo'],
                       help='Which backend to use for distributed training.')
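A minimal, self-contained sketch of how the renamed flag could be parsed and fed into the derivation from the first hunk. Only the flags touched by this commit plus the two inputs the computation needs are reproduced; the parser and the invocation values below are illustrative, not Megatron's actual _add_distributed_args setup.

    import argparse

    # Illustrative stand-in for the argument group shown above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
                        help='Number of layers per virtual pipeline stage')
    parser.add_argument('--pipeline-model-parallel-size', type=int, default=1)
    parser.add_argument('--num-layers', type=int, default=24)

    # Hypothetical command line.
    args = parser.parse_args(['--num-layers-per-virtual-pipeline-stage', '3',
                              '--pipeline-model-parallel-size', '4',
                              '--num-layers', '24'])

    # Same derivation as in parse_args above.
    if args.num_layers_per_virtual_pipeline_stage is not None:
        virtual_size = (args.num_layers // args.pipeline_model_parallel_size) // \
            args.num_layers_per_virtual_pipeline_stage
    else:
        virtual_size = None

    print(virtual_size)  # -> 2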