OpenDAS / Megatron-LM / Commits / 1eafa861

Commit 1eafa861 authored Aug 26, 2020 by Boris Fomitchev

Addressing code review comments

Signed-off-by: Boris Fomitchev <bfomitchev@nvidia.com>
Parent: e10760ea

Showing 3 changed files with 8 additions and 9 deletions (+8 -9)

megatron/arguments.py    +2 -0
megatron/initialize.py   +1 -0
megatron/mpu/layers.py   +5 -9
megatron/arguments.py

@@ -337,6 +337,8 @@ def _add_distributed_args(parser):
                        help='If set to True, initialize_megatron() skips DDP initialization'
                        ' and returns function to complete it instead'
                        'This is for external DDP manager.')
+    group.add_argument('--use-cpu-initialization', action='store_true',
+                       help='If set, affine parallel weights initialization uses CPU')
     return parser
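On its own, the hunk above is just two added lines inside _add_distributed_args. Below is a minimal, self-contained sketch of how those flags could be wired up and exercised with argparse; the argument-group title, the --lazy-mpu-init signature, and the small __main__ harness are assumptions for illustration, not the actual Megatron-LM CLI wiring.

# Hedged sketch of the two distributed-args flags touched in this hunk.
# The group title and the --lazy-mpu-init signature are assumptions.
import argparse


def _add_distributed_args(parser):
    group = parser.add_argument_group(title='distributed')
    # Pre-existing flag, shown for context only (type/default assumed).
    group.add_argument('--lazy-mpu-init', type=bool, required=False,
                       help='If set to True, initialize_megatron() skips DDP initialization'
                       ' and returns function to complete it instead'
                       'This is for external DDP manager.')
    # The two lines added by this commit: a plain store_true flag that
    # replaces the module-level _USE_CPU_INITIALIZATION global in layers.py.
    group.add_argument('--use-cpu-initialization', action='store_true',
                       help='If set, affine parallel weights initialization uses CPU')
    return parser


if __name__ == '__main__':
    parser = _add_distributed_args(argparse.ArgumentParser())
    args = parser.parse_args(['--use-cpu-initialization'])
    print(args.use_cpu_initialization)   # True
    print(args.lazy_mpu_init)            # None (flag not passed)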
megatron/initialize.py

@@ -62,6 +62,7 @@ def initialize_megatron(extra_args_provider=None, args_defaults={},
     args = get_args()
     if args.lazy_mpu_init:
+        args.use_cpu_initialization = True
         # delayed initialization of DDP-related stuff
         # We only set basic DDP globals
         set_model_parallel_world_size(args.model_parallel_size)
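The single added line follows from the --lazy-mpu-init help text in megatron/arguments.py: when DDP setup is deferred to an external manager, weights must be allocated on the CPU in the meantime. A runnable sketch of that flow is below; everything except the names visible in the diff (lazy_mpu_init, use_cpu_initialization, set_model_parallel_world_size, model_parallel_size) is a stub added so the sketch runs on its own, and the finish_mpu_init callback is hypothetical.

# Hedged sketch of the lazy-initialization flow implied by the diff:
# initialize_megatron() skips DDP setup and returns a function that an
# external DDP manager calls later.
from types import SimpleNamespace

_WORLD_SIZE = None


def set_model_parallel_world_size(size):
    # Stand-in for the megatron.mpu global setter referenced in the diff.
    global _WORLD_SIZE
    _WORLD_SIZE = size


def initialize_megatron_sketch(args):
    if args.lazy_mpu_init:
        # Line added in this commit: lazy init forces CPU-side weight
        # allocation so nothing touches the GPU before DDP is ready.
        args.use_cpu_initialization = True
        # delayed initialization of DDP-related stuff
        # We only set basic DDP globals
        set_model_parallel_world_size(args.model_parallel_size)

        def finish_mpu_init():
            # Hypothetical continuation invoked by the external DDP manager
            # once its process groups exist; the real body lives in Megatron.
            print('completing DDP init for world size', _WORLD_SIZE)

        return finish_mpu_init
    return None


args = SimpleNamespace(lazy_mpu_init=True, use_cpu_initialization=False,
                       model_parallel_size=2)
finish = initialize_megatron_sketch(args)
assert args.use_cpu_initialization is True
finish()  # later, called by the external DDP manager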
megatron/mpu/layers.py

@@ -47,10 +47,6 @@ from .utils import split_tensor_along_last_dim
 from .utils import VocabUtility
 from megatron import get_args

-_USE_CPU_INITIALIZATION = False

 def _initialize_affine_weight_gpu(weight, init_method,
                                   partition_dim, stride=1):
     """Initialize affine weight for model parallel on GPU."""
@@ -141,7 +137,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         # Allocate weights and initialize.
         args = get_args()
-        if _USE_CPU_INITIALIZATION or args.lazy_mpu_init:
+        if args.use_cpu_initialization:
             self.weight = Parameter(torch.empty(
                 self.num_embeddings_per_partition, self.embedding_dim,
                 dtype=args.params_dtype))
@@ -217,7 +213,7 @@ class ColumnParallelLinear(torch.nn.Module):
         # we allocate the transpose.
         # Initialize weight.
         args = get_args()
-        if _USE_CPU_INITIALIZATION or args.lazy_mpu_init:
+        if args.use_cpu_initialization:
             self.weight = Parameter(torch.empty(self.output_size_per_partition,
                                                 self.input_size,
                                                 dtype=args.params_dtype))
@@ -233,7 +229,7 @@ class ColumnParallelLinear(torch.nn.Module):
                                               partition_dim=0, stride=stride)
         if bias:
-            if _USE_CPU_INITIALIZATION or args.lazy_mpu_init:
+            if args.use_cpu_initialization:
                 self.bias = Parameter(torch.empty(
                     self.output_size_per_partition,
                     dtype=args.params_dtype))
             else:
@@ -311,7 +307,7 @@ class RowParallelLinear(torch.nn.Module):
         # we allocate the transpose.
         # Initialize weight.
         args = get_args()
-        if _USE_CPU_INITIALIZATION or args.lazy_mpu_init:
+        if args.use_cpu_initialization:
             self.weight = Parameter(torch.empty(self.output_size,
                                                 self.input_size_per_partition,
                                                 dtype=args.params_dtype))
@@ -326,7 +322,7 @@ class RowParallelLinear(torch.nn.Module):
         _initialize_affine_weight_gpu(self.weight, init_method,
                                       partition_dim=1, stride=stride)
         if bias:
-            if _USE_CPU_INITIALIZATION or args.lazy_mpu_init:
+            if args.use_cpu_initialization:
                 self.bias = Parameter(torch.empty(self.output_size,
                                                   dtype=args.params_dtype))
             else:
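All five changed conditions in this file make the same substitution: the module-level _USE_CPU_INITIALIZATION flag (with its lazy_mpu_init fallback) is replaced by the new args.use_cpu_initialization command-line option when deciding whether a parallel layer's weights are allocated on the CPU or directly on the GPU. A toy, self-contained sketch of that branch follows; the layer, its init method, and the device handling are illustrative stand-ins, not Megatron's actual parallel layers.

# Toy sketch of the pattern shared by the five changed conditions above:
# each parallel layer branches on args.use_cpu_initialization (the new CLI
# flag) instead of the removed _USE_CPU_INITIALIZATION global.
import torch
from torch.nn import Parameter
from types import SimpleNamespace


class ToyParallelLinear(torch.nn.Module):
    def __init__(self, input_size, output_size_per_partition, args):
        super().__init__()
        if args.use_cpu_initialization:
            # CPU path: allocate on the host, mirroring the
            # Parameter(torch.empty(..., dtype=args.params_dtype)) calls above.
            self.weight = Parameter(torch.empty(output_size_per_partition,
                                                input_size,
                                                dtype=args.params_dtype))
        else:
            # GPU path: allocate directly on the device and initialize there,
            # as Megatron's _initialize_affine_weight_gpu does.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            self.weight = Parameter(torch.empty(output_size_per_partition,
                                                input_size,
                                                device=device,
                                                dtype=args.params_dtype))
        torch.nn.init.xavier_uniform_(self.weight)  # stand-in init method


args = SimpleNamespace(use_cpu_initialization=True, params_dtype=torch.float32)
layer = ToyParallelLinear(input_size=8, output_size_per_partition=4, args=args)
print(layer.weight.device, layer.weight.shape)  # cpu torch.Size([4, 8])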