OpenDAS / Megatron-LM · Commit ebbe40cd

Authored Oct 09, 2019 by Raul Puri

Merge branch 'move_vocab_padding_to_utils' into 'master'

Move vocab padding to utils

See merge request ADLR/megatron-lm!6

Parents: 6b68bb8a, 7e46ca58
Showing 3 changed files with 26 additions and 22 deletions:

  megatron/utils.py   +13  -0
  pretrain_bert.py     +6 -12
  pretrain_gpt2.py     +7 -10
megatron/utils.py

@@ -185,6 +185,19 @@ def report_memory(name):
     print_rank_0(string)


+def vocab_size_with_padding(num_tokens, args):
+    after = num_tokens
+    multiple = args.make_vocab_size_divisible_by * \
+        mpu.get_model_parallel_world_size()
+    while (after % multiple) != 0:
+        after += 1
+    print_rank_0('> padded vocab (size: {}) with {} dummy '
+                 'tokens (new size: {})'.format(
+                     num_tokens, after - num_tokens, after))
+    return after
+
+
 def initialize_distributed(args):
     """Initialize torch.distributed."""
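For intuition, the loop above simply rounds num_tokens up to the next multiple of make_vocab_size_divisible_by times the model-parallel world size. A minimal standalone sketch of the same arithmetic, with the world size passed in explicitly instead of queried from mpu (the helper name and the example values are illustrative, not part of this change):

import math

def padded_vocab_size(num_tokens, divisible_by, model_parallel_size):
    # Round num_tokens up to the next multiple of
    # divisible_by * model_parallel_size (same result as the while loop).
    multiple = divisible_by * model_parallel_size
    return math.ceil(num_tokens / multiple) * multiple

# A 30522-token BERT vocab with --make-vocab-size-divisible-by 128 and a
# 2-way model-parallel group pads to 30720 (198 dummy tokens).
print(padded_vocab_size(30522, 128, 2))  # 30720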
pretrain_bert.py

@@ -44,7 +44,7 @@ from megatron.utils import check_adlr_autoresume_termination
 from megatron.utils import initialize_distributed
 from megatron.utils import set_random_seed
 from megatron.utils import wrap_model_for_distributed_training
+from megatron.utils import vocab_size_with_padding


 def get_model(args):
     """Build the model."""

@@ -477,19 +477,13 @@ def get_train_val_test_data(args):
         ds_type = 'BERT'
         data_config.set_defaults(data_set_type=ds_type, transpose=False)
         (train_data, val_data, test_data), tokenizer = data_config.apply(args)
-        before = tokenizer.num_tokens
-        after = before
-        multiple = args.make_vocab_size_divisible_by * \
-            mpu.get_model_parallel_world_size()
-        while (after % multiple) != 0:
-            after += 1
-        print_rank_0('> padded vocab (size: {}) with {} dummy '
-                     'tokens (new size: {})'.format(
-                         before, after - before, after))
+        num_tokens = vocab_size_with_padding(tokenizer.num_tokens, args)

         # Need to broadcast num_tokens and num_type_tokens.
-        token_counts = torch.cuda.LongTensor([after,
+        token_counts = torch.cuda.LongTensor([num_tokens,
                                               tokenizer.num_type_tokens,
                                               int(args.do_train),
                                               int(args.do_valid),
                                               int(args.do_test)])
     else:
         token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
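The token_counts tensor exists because only one rank builds the datasets (the else branch fills zeros on the others); the padded vocab size and the other counts are then broadcast so every rank agrees on them. A hedged sketch of that broadcast pattern follows; it uses a single-process CPU/gloo group purely so it can run standalone, whereas the training scripts use torch.cuda.LongTensor and whatever backend initialize_distributed configured.

import torch
import torch.distributed as dist

# Single-process group only so the sketch runs standalone (an assumption,
# not how initialize_distributed configures the real job).
dist.init_process_group('gloo', init_method='tcp://127.0.0.1:23456',
                        rank=0, world_size=1)

if dist.get_rank() == 0:
    # Rank 0 knows the real values (padded vocab size, num_type_tokens,
    # do_train/do_valid/do_test flags); the numbers here are examples.
    token_counts = torch.LongTensor([30720, 2, 1, 1, 0])
else:
    token_counts = torch.LongTensor([0, 0, 0, 0, 0])

# Everyone ends up with rank 0's values.
dist.broadcast(token_counts, src=0)
num_tokens, num_type_tokens, do_train, do_valid, do_test = token_counts.tolist()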
pretrain_gpt2.py

@@ -43,6 +43,7 @@ from megatron.utils import check_adlr_autoresume_termination
 from megatron.utils import initialize_distributed
 from megatron.utils import set_random_seed
 from megatron.utils import wrap_model_for_distributed_training
+from megatron.utils import vocab_size_with_padding

 from gpt2_data_loader import make_gpt2_dataloaders

@@ -509,17 +510,13 @@ def get_train_val_test_data(args):
         num_tokens = tokenizer.num_tokens
         eod_token = tokenizer.get_command('eos').Id
         assert eod_token == tokenizer.get_command('pad').Id
-        before = num_tokens
-        # pad.
-        after = before
-        multiple = args.make_vocab_size_divisible_by * \
-            mpu.get_model_parallel_world_size()
-        while (after % multiple) != 0:
-            after += 1
-        print_rank_0('> padded vocab (size: {}) with {} dummy '
-                     'tokens (new size: {})'.format(
-                         before, after - before, after))
+        num_tokens = vocab_size_with_padding(num_tokens, args)
         print_rank_0('> found end-of-document token: {}'.format(eod_token))
-        token_counts = torch.cuda.LongTensor([after,
-                                              eod_token,
-                                              int(args.do_train),
-                                              int(args.do_valid),
-                                              int(args.do_test)])
+        token_counts = torch.cuda.LongTensor([num_tokens,
+                                              eod_token,
+                                              int(args.do_train),
+                                              int(args.do_valid),
+                                              int(args.do_test)])
     else:
         token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
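The reason both pretrain scripts route their vocab size through this helper is that the vocabulary (embedding and output-projection rows) is partitioned across model-parallel ranks, and the padding makes that split exact. A rough illustration with GPT-2's 50257-token vocabulary; vocab_range_for_rank is a made-up helper for this sketch, not Megatron's mpu API (which has its own partitioning utilities), and 128 is only an example value for --make-vocab-size-divisible-by.

def vocab_range_for_rank(padded_vocab_size, world_size, rank):
    # Each model-parallel rank owns an equal, contiguous slice of the
    # padded vocabulary; the division is exact because of the padding.
    per_partition = padded_vocab_size // world_size
    start = rank * per_partition
    return start, start + per_partition

# 50257 GPT-2 tokens, --make-vocab-size-divisible-by 128, 8-way model
# parallelism: multiple = 1024, so the vocab pads to 51200.
padded = 51200
for rank in range(8):
    print(rank, vocab_range_for_rank(padded, 8, rank))
# Each rank holds exactly 6400 vocabulary rows.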