OpenDAS / Megatron-LM · Commits

Commit 83aa9219, authored Mar 27, 2020 by Mohammad

added global variables

Parent: 599e959a
Showing 5 changed files with 197 additions and 19 deletions (+197, -19)
megatron/arguments.py        +3    -2
megatron/data/tokenizer.py   +48   -12
megatron/global_vars.py      +138  -0
megatron/training.py         +2    -2
megatron/utils.py            +6    -3
arguments.py → megatron/arguments.py — View file @ 83aa9219

@@ -23,6 +23,7 @@ import torch
 _GLOBAL_ARGS = None


 def parse_args(extra_args_provider=None):
     global _GLOBAL_ARGS

@@ -200,6 +201,8 @@ def add_mixed_precision_args(parser):
 def add_distributed_args(parser):
     group = parser.add_argument_group(title='mixed precision')

+    group.add_argument('--model-parallel-size', type=int, default=1,
+                       help='Size of the model parallel.')
     group.add_argument('--distributed-backend', default='nccl',
                        choices=['nccl', 'gloo'],
                        help='Which backend to use for distributed training.')

@@ -389,8 +392,6 @@ def add_data_args_(parser):
     group = parser.add_argument_group('data', 'data configurations')

-    group.add_argument('--model-parallel-size', type=int, default=1,
-                       help='size of the model parallel.')
     group.add_argument('--shuffle', action='store_true',
                        help='Shuffle data. Shuffling is deterministic '
                             'based on seed and current epoch.')
...
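For reference, the hunks above move --model-parallel-size into add_distributed_args while parse_args keeps its extra_args_provider hook. The following standalone sketch shows how such a hook can extend an argparse parser; my_extra_args, --my-flag, and the demo parser are hypothetical and only mirror the options visible in this diff.

import argparse


def my_extra_args(parser):
    # Hypothetical extra_args_provider: receives the parser and returns it
    # with application-specific options attached.
    group = parser.add_argument_group(title='my application')
    group.add_argument('--my-flag', action='store_true',
                       help='Example application-specific switch.')
    return parser


# Stand-in for parse_args(extra_args_provider=my_extra_args): build a parser
# with the distributed options from the hunk above, then let the hook extend it.
parser = argparse.ArgumentParser(description='Megatron argument sketch')
group = parser.add_argument_group(title='distributed')
group.add_argument('--model-parallel-size', type=int, default=1,
                   help='Size of the model parallel.')
group.add_argument('--distributed-backend', default='nccl',
                   choices=['nccl', 'gloo'],
                   help='Which backend to use for distributed training.')
parser = my_extra_args(parser)
args = parser.parse_args(['--model-parallel-size', '2', '--my-flag'])
print(args.model_parallel_size, args.distributed_backend, args.my_flag)  # 2 nccl True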
megatron/data/tokenizer.py — View file @ 83aa9219

 # coding=utf-8
 # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """Megatron tokenizer."""

 from abc import ABC
 from abc import abstractmethod

-from megatron.utils import vocab_size_with_padding
+from megatron.arguments import get_args
 from .bert_tokenization import FullTokenizer as FullBertTokenizer


-def add_tokenizer_to_args(args, tokenizer_type):
-    """Instantiate tokenizer based on input type and add it to args."""
-    # Make sure we have not already called this method.
-    if hasattr(args, 'tokenizer'):
-        raise Exception('args already has a tokenizer')
+def build_tokenizer():
+    """Initialize tokenizer."""
+    # Retrieve args.
+    args = get_args()
+    if args.rank == 0:
+        print('building {} tokenizer ...'.format(args.tokenizer_type),
+              flush=True)

     # Select and instantiate the tokenizer.
-    if tokenizer_type == 'BertWordPieceLowerCase':
-        args.tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab,
-                                                 lower_case=True)
+    if args.tokenizer_type == 'BertWordPieceLowerCase':
+        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
+                                            lower_case=True)
     else:
         raise NotImplementedError('{} tokenizer is not '
-                                  'implemented.'.format(tokenizer_type))
+                                  'implemented.'.format(args.tokenizer_type))

     # Add vocab size.
-    args.vocab_size = vocab_size_with_padding(args.tokenizer.vocab_size, args)
+    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size)
+
+    return tokenizer
+
+
+def _vocab_size_with_padding(orig_vocab_size):
+    """Pad vocab size so it is divisible by model parallel size and
+    still having GPU friendly size."""
+    args = get_args()
+    after = orig_vocab_size
+    multiple = args.make_vocab_size_divisible_by * \
+        args.model_parallel_size
+    while (after % multiple) != 0:
+        after += 1
+    if args.rank == 0:
+        print(' > padded vocab (size: {}) with {} dummy tokens '
+              '(new size: {})'.format(
+                  orig_vocab_size, after - orig_vocab_size, after),
+              flush=True)
+    return after


 class AbstractTokenizer(ABC):
...
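The new _vocab_size_with_padding rounds the tokenizer's vocabulary up to a multiple of make_vocab_size_divisible_by * model_parallel_size. Below is a standalone sketch of that rule with assumed example values (128 and 2); in the diff above, the real values are read from args.

def padded_vocab_size(orig_vocab_size, make_vocab_size_divisible_by=128,
                      model_parallel_size=2):
    # Same loop as _vocab_size_with_padding, but with the two configuration
    # values passed in explicitly instead of read from the global args.
    multiple = make_vocab_size_divisible_by * model_parallel_size
    after = orig_vocab_size
    while after % multiple != 0:
        after += 1
    return after


# BERT's 30522-token lower-case vocab pads to 30720 under these assumed
# settings, i.e. 198 dummy tokens (30720 is the next multiple of 256).
print(padded_vocab_size(30522))  # -> 30720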
megatron/global_vars.py — new file (0 → 100644) — View file @ 83aa9219

# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron global variables."""

import os
import sys

from megatron.data.tokenizer import build_tokenizer
from .arguments import parse_args
from .utils import Timers

_GLOBAL_ARGS = None
_GLOBAL_TOKENIZER = None
_GLOBAL_TENSORBOARD_WRITER = None
_GLOBAL_ADLR_AUTORESUME = None
_GLOBAL_TIMERS = None


def get_args():
    """Return arguments."""
    _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
    return _GLOBAL_ARGS


def get_tokenizer():
    """Return tokenizer."""
    _ensure_var_is_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    return _GLOBAL_TOKENIZER


def get_tensorboard_writer():
    """Return tensorboard writer. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_TENSORBOARD_WRITER


def get_adlr_autoresume():
    """ADLR autoresume object. It can be None so no need
    to check if it is initialized."""
    return _GLOBAL_ADLR_AUTORESUME


def get_timers():
    """Return timers."""
    _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
    return _GLOBAL_TIMERS


def set_global_variables(extra_args_provider=None):
    """Set args, tokenizer, tensorboard-writer, adlr-autoresume, and timers."""
    _parse_args(extra_args_provider=extra_args_provider)
    _build_tokenizer()
    _set_tensorboard_writer()
    _set_adlr_autoresume()
    _set_timers()


def _parse_args(extra_args_provider=None):
    """Parse entire arguments."""
    global _GLOBAL_ARGS
    _ensure_var_is_not_initialized(_GLOBAL_ARGS, 'args')
    _GLOBAL_ARGS = parse_args(extra_args_provider=extra_args_provider)


def _build_tokenizer():
    """Initialize tokenizer."""
    global _GLOBAL_TOKENIZER
    _ensure_var_is_not_initialized(_GLOBAL_TOKENIZER, 'tokenizer')
    _GLOBAL_TOKENIZER = build_tokenizer()


def _set_tensorboard_writer():
    """Set tensorboard writer."""
    global _GLOBAL_TENSORBOARD_WRITER
    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
                                   'tensorboard writer')
    args = get_args()
    if hasattr(args, 'tensorboard_dir') and \
       args.tensorboard_dir and args.rank == 0:
        try:
            from torch.utils.tensorboard import SummaryWriter
            print('> setting tensorboard ...')
            _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
                log_dir=args.tensorboard_dir)
        except ModuleNotFoundError:
            print('WARNING: TensorBoard writing requested but is not '
                  'available (are you using PyTorch 1.1.0 or later?), '
                  'no TensorBoard logs will be written.', flush=True)


def _set_adlr_autoresume():
    """Initialize ADLR autoresume."""
    global _GLOBAL_ADLR_AUTORESUME
    _ensure_var_is_not_initialized(_GLOBAL_ADLR_AUTORESUME, 'adlr autoresume')
    args = get_args()
    if args.adlr_autoresume:
        if args.rank == 0:
            print('enabling autoresume ...', flush=True)
        sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
        try:
            from userlib.auto_resume import AutoResume
        except:
            print('ADLR autoresume is not available, exiting ...')
            sys.exit()
        _GLOBAL_ADLR_AUTORESUME = AutoResume


def _set_timers():
    """Initialize timers."""
    global _GLOBAL_TIMERS
    _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
    _GLOBAL_TIMERS = Timers()


def _ensure_var_is_initialized(var, name):
    """Make sure the input variable is not None."""
    assert var is not None, '{} is not initialized.'.format(name)
def _ensure_var_is_not_initialized(var, name):
    """Make sure the input variable is None."""
    assert var is None, '{} is already initialized.'.format(name)
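global_vars.py centralizes process-wide singletons behind get_* accessors that assert each value is set exactly once. A self-contained sketch of that pattern follows; the names _GLOBAL_CONFIG, set_config, and get_config are illustrative only, not part of Megatron.

_GLOBAL_CONFIG = None


def _ensure_var_is_initialized(var, name):
    assert var is not None, '{} is not initialized.'.format(name)


def _ensure_var_is_not_initialized(var, name):
    assert var is None, '{} is already initialized.'.format(name)


def set_config(value):
    # Allowed exactly once, mirroring set_global_variables above.
    global _GLOBAL_CONFIG
    _ensure_var_is_not_initialized(_GLOBAL_CONFIG, 'config')
    _GLOBAL_CONFIG = value


def get_config():
    # Every later caller gets the same object, mirroring get_args/get_timers.
    _ensure_var_is_initialized(_GLOBAL_CONFIG, 'config')
    return _GLOBAL_CONFIG


set_config({'model_parallel_size': 2})
print(get_config())  # works after initialization
# Calling set_config(...) a second time, or get_config() before set_config(),
# would trip the corresponding assertion.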
megatron/training.py — View file @ 83aa9219

@@ -22,7 +22,7 @@ import torch
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 from apex.optimizers import FusedAdam as Adam
-from arguments import get_args
+from megatron.arguments import get_args
 from megatron import mpu
 from megatron.fp16 import FP16_Module
 from megatron.fp16 import FP16_Optimizer

@@ -129,7 +129,7 @@ def initialize_megatron(message, args):
     initialize_distributed(args)
     if torch.distributed.get_rank() == 0:
         print(message, flush=True)
         print_args(args, writer)

     # Autoresume.
     torch.distributed.barrier()
...
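The initialize_megatron hunk prints its banner from rank 0 of the distributed group only. A minimal standalone version of that guard is sketched below; print_once is a hypothetical helper that falls back to a plain print when torch.distributed has not been initialized.

import torch


def print_once(message):
    # Emit the message from rank 0 only, so a multi-GPU job logs it once.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            print(message, flush=True)
    else:
        print(message, flush=True)


print_once('initializing megatron ...')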
megatron/utils.py — View file @ 83aa9219

@@ -150,13 +150,15 @@ def check_adlr_autoresume_termination(iteration, model, optimizer,

 def print_args(args, writer=None):
     """Print arguments."""
-    print('arguments:', flush=True)
+    print_rank_0('arguments:')
+    str_list = []
     for arg in vars(args):
         dots = '.' * (29 - len(arg))
-        print(' {} {} {}'.format(arg, dots, getattr(args, arg)), flush=True)
+        str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
         if writer:
             writer.add_text(arg, str(getattr(args, arg)))
+    for arg in sorted(str_list, key=lambda a: a.lower()):
+        print_rank_0(arg)


 def print_params_min_max_norm(optimizer, iteration):

@@ -290,6 +292,7 @@ def initialize_distributed(args):
     device = args.rank % torch.cuda.device_count()
     if args.local_rank is not None:
         device = args.local_rank
     torch.cuda.set_device(device)
     # Call the init process
     init_method = 'tcp://'
...
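print_args now buffers one dot-padded line per argument and emits the lines sorted case-insensitively. A standalone sketch of the new behavior, using SimpleNamespace as a stand-in for the parsed args and a plain print in place of Megatron's print_rank_0:

from types import SimpleNamespace


def print_args_sketch(args):
    # Buffer the formatted lines first, then print them in case-insensitive
    # alphabetical order, as the updated print_args does.
    print('arguments:', flush=True)
    str_list = []
    for arg in vars(args):
        dots = '.' * (29 - len(arg))
        str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
    for line in sorted(str_list, key=lambda a: a.lower()):
        print(line, flush=True)


print_args_sketch(SimpleNamespace(seed=1234, Batch_size=8, lr=0.00015))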