OpenDAS / Megatron-LM

Commit 0e5dfd7f
Authored Mar 29, 2020 by Mohammad
Parent: b6e0377b

added gpt2 tokenizer
Showing 3 changed files with 55 additions and 51 deletions.
megatron/arguments.py            +10  -18
megatron/tokenizer/tokenizer.py  +28  -0
pretrain_gpt2.py                 +17  -33
megatron/arguments.py

@@ -35,6 +35,8 @@ def parse_args(extra_args_provider=None, defaults={}):
     parser = _add_validation_args(parser)
     parser = _add_data_args(parser)
     parser = _add_autoresume_args(parser)
+    # TODO: Refactor
+    parser = _add_gpt2_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:

@@ -293,6 +295,8 @@ def _add_data_args(parser):
                        'validation and 5% for test.')
     group.add_argument('--vocab-file', type=str, required=True,
                        help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None,
+                       help='Path to the BPE merge file.')
     group.add_argument('--seq-length', type=int, required=True,
                        help="Maximum sequence length to process.")
     group.add_argument('--mask-prob', type=float, default=0.15,

@@ -330,19 +334,19 @@ def _add_autoresume_args(parser):
 ########################################################################
-def add_training_args_(parser):
-    """Training arguments."""
-
-    group = parser.add_argument_group('train', 'training configurations')
-
-    # Batch prodecuer arguments
+def _add_gpt2_args(parser):
+    group = parser.add_argument_group(title='gpt2')
+
+    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
+                       help='The filename containing all the shards '
+                       'sizes for numpy data loader')
     group.add_argument('--reset-position-ids', action='store_true',
                        help='Reset posistion ids after end-of-document token.')
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention maske after '
                        'end-of-document token.')
     group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens')
+                       help='Mask loss for the end of document tokens.')
 
     return parser

@@ -411,18 +415,6 @@ def add_data_args_(parser):
                        choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                        help='Which data loader to use. Default varies by model.')
-    group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated paths or corpora names '
-                       'for training.')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help='path(s) to the validation data.')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help='path(s) to the testing data.')
-
-    # arguments for binary data loader
-    # arguments for numpy data loader
-    group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes for numpy data loader')
 
     return parser
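For reference, a minimal, self-contained argparse sketch (not part of the commit) of how the newly added --merge-file option parses alongside the existing --vocab-file; the group title and the file names below are placeholders.

# Sketch only: mirrors the options added above using stock argparse.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='data')
group.add_argument('--vocab-file', type=str, required=True,
                   help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
                   help='Path to the BPE merge file.')

# Placeholder paths; a GPT-2 setup would point at its vocab.json / merges.txt pair.
args = parser.parse_args(['--vocab-file', 'gpt2-vocab.json',
                          '--merge-file', 'gpt2-merges.txt'])
print(args.vocab_file, args.merge_file)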
megatron/tokenizer/tokenizer.py

@@ -19,6 +19,7 @@ from abc import ABC
 from abc import abstractmethod
 
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
+from .gpt2_tokenization import GPT2Tokenizer
 
 
 def build_tokenizer(args):

@@ -28,9 +29,13 @@ def build_tokenizer(args):
               flush=True)
 
     # Select and instantiate the tokenizer.
+    assert args.vocab_file is not None
     if args.tokenizer_type == 'BertWordPieceLowerCase':
         tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                             lower_case=True)
+    elif args.tokenizer_type == 'GPT2BPETokenizer':
+        assert args.merge_file is not None
+        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
     else:
         raise NotImplementedError('{} tokenizer is not '
                                   'implemented.'.format(args.tokenizer_type))

@@ -129,3 +134,26 @@ class _BertWordPieceTokenizer(AbstractTokenizer):
     @property
     def pad(self):
         return self.pad_id
+
+
+class _GPT2BPETokenizer(AbstractTokenizer):
+    """Original GPT2 BPE tokenizer."""
+
+    def __init__(self, vocab_file, merge_file):
+        name = 'GPT2 BPE'
+        super().__init__(name)
+
+        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
+                                       special_tokens=[], max_len=None)
+        self.eod_id = self.tokenizer.encoder['<|endoftext|>']
+
+    @property
+    def vocab_size(self):
+        return len(self.tokenizer.encoder)
+
+    def tokenize(self, text):
+        return self.tokenizer.encode(text)
+
+    @property
+    def eod(self):
+        return self.eod_id
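A toy sketch (not repo code) of the pattern the new _GPT2BPETokenizer enables downstream: documents are BPE-encoded and the <|endoftext|> id exposed as .eod is appended between them, which is what flags like --reset-position-ids and --eod-mask-loss key off. The stand-in vocabulary below is made up so the sketch runs without vocab/merge files.

# Toy stand-in with the same .tokenize / .eod / .vocab_size surface as the real class.
class ToyBPETokenizer:
    def __init__(self, encoder):
        self.encoder = encoder                        # token -> id, as in GPT2Tokenizer
        self.eod_id = self.encoder['<|endoftext|>']   # mirrors _GPT2BPETokenizer.__init__

    @property
    def vocab_size(self):
        return len(self.encoder)

    def tokenize(self, text):
        return [self.encoder[t] for t in text.split()]

    @property
    def eod(self):
        return self.eod_id


def pack_documents(documents, tokenizer):
    """Concatenate documents into one id stream, marking boundaries with eod."""
    ids = []
    for doc in documents:
        ids.extend(tokenizer.tokenize(doc))
        ids.append(tokenizer.eod)
    return ids


toy = ToyBPETokenizer({'hello': 0, 'world': 1, '<|endoftext|>': 2})
assert pack_documents(['hello world', 'world hello'], toy) == [0, 1, 2, 1, 0, 2]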
pretrain_gpt2.py

@@ -17,20 +17,16 @@
 import torch
 
+from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import get_args
 from megatron import get_timers
-from configure_data import configure_data
-from gpt2_data_loader import make_gpt2_dataloaders
 from megatron import mpu
+from megatron import print_rank_0
 from megatron.model import GPT2Model
+from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids
-from megatron import print_rank_0
 from megatron.utils import reduce_losses
-from megatron.utils import vocab_size_with_padding
-from megatron.training import pretrain
+import os
 
 
 def model_provider():
     """Build the model."""

@@ -97,7 +93,7 @@ def forward_step(data_iterator, model):
     # Get the batch.
     timers('batch generator').start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator, args, timers)
+        data_iterator)
     timers('batch generator').stop()
 
     # Forward model.

@@ -121,28 +117,17 @@ def get_train_val_test_data():
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        if args.data_loader == 'numpy':
-            assert len(args.train_data) == 1
-            args.train_data = args.train_data[0]
-            assert len(args.valid_data) == 1
-            args.valid_data = args.valid_data[0]
-            assert len(args.test_data) == 1
-            args.test_data = args.test_data[0]
-            (train_data, val_data, test_data), num_tokens, \
-                eod_token = make_gpt2_dataloaders(args)
-        elif args.data_loader == 'raw' or args.data_loader == 'lazy':
-            data_config = configure_data()
-            data_config.set_defaults(data_set_type='GPT2', transpose=False)
-            (train_data, val_data, test_data), tokenizer = data_config.apply(args)
-            num_tokens = tokenizer.num_tokens
-            eod_token = tokenizer.get_command('eos').Id
-            assert eod_token == tokenizer.get_command('pad').Id
-        else:
-            print("Unsupported data loader for GPT2.")
-            exit(1)
+        args.cache_dir = 'cache'
+        args.train_data = os.path.join(args.data_path, 'train')
+        args.valid_data = os.path.join(args.data_path, 'valid')
+        args.test_data = os.path.join(args.data_path, 'test')
+        (train_data, val_data, test_data), num_tokens, \
+            eod_token = make_gpt2_dataloaders(args)
 
         # pad.
-        num_tokens = vocab_size_with_padding(num_tokens, args)
+        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
+        num_tokens = _vocab_size_with_padding(num_tokens, args)
 
         print_rank_0('> found end-of-document token: {}'.format(eod_token))
         token_counts = torch.cuda.LongTensor([num_tokens, eod_token,
                                               int(args.do_train),

@@ -161,7 +146,6 @@ def get_train_val_test_data():
     args.do_valid = token_counts[3].item()
     args.do_test = token_counts[4].item()
 
-    args.vocab_size = num_tokens
     args.eod_token = eod_token
 
     return train_data, val_data, test_data

@@ -169,5 +153,5 @@ def get_train_val_test_data():
 if __name__ == "__main__":
 
     pretrain(get_train_val_test_data,
-             model_provider, forward_step)
+             model_provider, forward_step,
+             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
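A hedged sketch of the data-path convention the rewritten get_train_val_test_data() now assumes: a single --data-path is expanded into fixed train/valid/test locations with os.path.join instead of taking separate --train-data/--valid-data/--test-data lists. The path below is a placeholder, not from the commit.

# Sketch only; '/data/gpt2/webtext' is a hypothetical --data-path value.
import os

data_path = '/data/gpt2/webtext'
splits = {name: os.path.join(data_path, name) for name in ('train', 'valid', 'test')}
print(splits)  # these are the paths handed to make_gpt2_dataloaders via args.*_data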